| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 2562, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00117096018735363, | |
| "grad_norm": 2.397789478302002, | |
| "learning_rate": 1.5503875968992249e-07, | |
| "loss": 0.5513913631439209, | |
| "memory(GiB)": 137.67, | |
| "step": 1, | |
| "token_acc": 0.8478124608248715, | |
| "train_speed(iter/s)": 0.014244 | |
| }, | |
| { | |
| "epoch": 0.00585480093676815, | |
| "grad_norm": 2.213494300842285, | |
| "learning_rate": 7.751937984496125e-07, | |
| "loss": 0.5191692113876343, | |
| "memory(GiB)": 137.67, | |
| "step": 5, | |
| "token_acc": 0.848514893999071, | |
| "train_speed(iter/s)": 0.029361 | |
| }, | |
| { | |
| "epoch": 0.0117096018735363, | |
| "grad_norm": 2.0672056674957275, | |
| "learning_rate": 1.550387596899225e-06, | |
| "loss": 0.5157936096191407, | |
| "memory(GiB)": 137.67, | |
| "step": 10, | |
| "token_acc": 0.8392344826938901, | |
| "train_speed(iter/s)": 0.034106 | |
| }, | |
| { | |
| "epoch": 0.01756440281030445, | |
| "grad_norm": 1.588051676750183, | |
| "learning_rate": 2.3255813953488376e-06, | |
| "loss": 0.49305076599121095, | |
| "memory(GiB)": 137.67, | |
| "step": 15, | |
| "token_acc": 0.8437633920693741, | |
| "train_speed(iter/s)": 0.03643 | |
| }, | |
| { | |
| "epoch": 0.0234192037470726, | |
| "grad_norm": 0.7405409812927246, | |
| "learning_rate": 3.10077519379845e-06, | |
| "loss": 0.43950672149658204, | |
| "memory(GiB)": 137.67, | |
| "step": 20, | |
| "token_acc": 0.848157187048235, | |
| "train_speed(iter/s)": 0.03757 | |
| }, | |
| { | |
| "epoch": 0.02927400468384075, | |
| "grad_norm": 0.8562428951263428, | |
| "learning_rate": 3.875968992248063e-06, | |
| "loss": 0.4227635383605957, | |
| "memory(GiB)": 137.67, | |
| "step": 25, | |
| "token_acc": 0.8593663993232968, | |
| "train_speed(iter/s)": 0.038283 | |
| }, | |
| { | |
| "epoch": 0.0351288056206089, | |
| "grad_norm": 0.4966309666633606, | |
| "learning_rate": 4.651162790697675e-06, | |
| "loss": 0.4113954544067383, | |
| "memory(GiB)": 137.67, | |
| "step": 30, | |
| "token_acc": 0.8579081152325363, | |
| "train_speed(iter/s)": 0.038822 | |
| }, | |
| { | |
| "epoch": 0.040983606557377046, | |
| "grad_norm": 0.4413171410560608, | |
| "learning_rate": 5.4263565891472865e-06, | |
| "loss": 0.40917291641235354, | |
| "memory(GiB)": 137.67, | |
| "step": 35, | |
| "token_acc": 0.8563618960945223, | |
| "train_speed(iter/s)": 0.039192 | |
| }, | |
| { | |
| "epoch": 0.0468384074941452, | |
| "grad_norm": 0.37367990612983704, | |
| "learning_rate": 6.2015503875969e-06, | |
| "loss": 0.38341727256774905, | |
| "memory(GiB)": 137.67, | |
| "step": 40, | |
| "token_acc": 0.8598059924304837, | |
| "train_speed(iter/s)": 0.039486 | |
| }, | |
| { | |
| "epoch": 0.05269320843091335, | |
| "grad_norm": 0.2625274062156677, | |
| "learning_rate": 6.976744186046513e-06, | |
| "loss": 0.39299936294555665, | |
| "memory(GiB)": 137.67, | |
| "step": 45, | |
| "token_acc": 0.8545384055298668, | |
| "train_speed(iter/s)": 0.03968 | |
| }, | |
| { | |
| "epoch": 0.0585480093676815, | |
| "grad_norm": 0.27871787548065186, | |
| "learning_rate": 7.751937984496126e-06, | |
| "loss": 0.38351633548736574, | |
| "memory(GiB)": 137.67, | |
| "step": 50, | |
| "token_acc": 0.8680353205073448, | |
| "train_speed(iter/s)": 0.039861 | |
| }, | |
| { | |
| "epoch": 0.06440281030444965, | |
| "grad_norm": 0.2245069444179535, | |
| "learning_rate": 8.527131782945736e-06, | |
| "loss": 0.3764484882354736, | |
| "memory(GiB)": 137.67, | |
| "step": 55, | |
| "token_acc": 0.8676952168658857, | |
| "train_speed(iter/s)": 0.040018 | |
| }, | |
| { | |
| "epoch": 0.0702576112412178, | |
| "grad_norm": 0.22919970750808716, | |
| "learning_rate": 9.30232558139535e-06, | |
| "loss": 0.3956867218017578, | |
| "memory(GiB)": 137.67, | |
| "step": 60, | |
| "token_acc": 0.865152491108186, | |
| "train_speed(iter/s)": 0.040146 | |
| }, | |
| { | |
| "epoch": 0.07611241217798595, | |
| "grad_norm": 0.21093736588954926, | |
| "learning_rate": 1.0077519379844963e-05, | |
| "loss": 0.37714409828186035, | |
| "memory(GiB)": 137.67, | |
| "step": 65, | |
| "token_acc": 0.8760504070619795, | |
| "train_speed(iter/s)": 0.040253 | |
| }, | |
| { | |
| "epoch": 0.08196721311475409, | |
| "grad_norm": 0.21410879492759705, | |
| "learning_rate": 1.0852713178294573e-05, | |
| "loss": 0.3757580995559692, | |
| "memory(GiB)": 137.67, | |
| "step": 70, | |
| "token_acc": 0.8649565195567881, | |
| "train_speed(iter/s)": 0.040315 | |
| }, | |
| { | |
| "epoch": 0.08782201405152225, | |
| "grad_norm": 0.1979837864637375, | |
| "learning_rate": 1.1627906976744187e-05, | |
| "loss": 0.37558441162109374, | |
| "memory(GiB)": 137.67, | |
| "step": 75, | |
| "token_acc": 0.8532517495556191, | |
| "train_speed(iter/s)": 0.040405 | |
| }, | |
| { | |
| "epoch": 0.0936768149882904, | |
| "grad_norm": 0.207350954413414, | |
| "learning_rate": 1.24031007751938e-05, | |
| "loss": 0.3741091966629028, | |
| "memory(GiB)": 137.67, | |
| "step": 80, | |
| "token_acc": 0.8612590246358096, | |
| "train_speed(iter/s)": 0.040461 | |
| }, | |
| { | |
| "epoch": 0.09953161592505855, | |
| "grad_norm": 0.19452251493930817, | |
| "learning_rate": 1.3178294573643412e-05, | |
| "loss": 0.3656472682952881, | |
| "memory(GiB)": 137.67, | |
| "step": 85, | |
| "token_acc": 0.8822223551750307, | |
| "train_speed(iter/s)": 0.040557 | |
| }, | |
| { | |
| "epoch": 0.1053864168618267, | |
| "grad_norm": 0.20653362572193146, | |
| "learning_rate": 1.3953488372093025e-05, | |
| "loss": 0.3706169605255127, | |
| "memory(GiB)": 137.67, | |
| "step": 90, | |
| "token_acc": 0.8654753188641241, | |
| "train_speed(iter/s)": 0.04063 | |
| }, | |
| { | |
| "epoch": 0.11124121779859485, | |
| "grad_norm": 0.20383736491203308, | |
| "learning_rate": 1.4728682170542636e-05, | |
| "loss": 0.3718616485595703, | |
| "memory(GiB)": 137.67, | |
| "step": 95, | |
| "token_acc": 0.8700523810121971, | |
| "train_speed(iter/s)": 0.040694 | |
| }, | |
| { | |
| "epoch": 0.117096018735363, | |
| "grad_norm": 0.2144174873828888, | |
| "learning_rate": 1.550387596899225e-05, | |
| "loss": 0.3716637134552002, | |
| "memory(GiB)": 137.67, | |
| "step": 100, | |
| "token_acc": 0.871046915998142, | |
| "train_speed(iter/s)": 0.040754 | |
| }, | |
| { | |
| "epoch": 0.12295081967213115, | |
| "grad_norm": 0.2225562483072281, | |
| "learning_rate": 1.6279069767441862e-05, | |
| "loss": 0.3682845115661621, | |
| "memory(GiB)": 137.67, | |
| "step": 105, | |
| "token_acc": 0.8729440672893664, | |
| "train_speed(iter/s)": 0.040816 | |
| }, | |
| { | |
| "epoch": 0.1288056206088993, | |
| "grad_norm": 0.2207648605108261, | |
| "learning_rate": 1.7054263565891473e-05, | |
| "loss": 0.3570878982543945, | |
| "memory(GiB)": 137.67, | |
| "step": 110, | |
| "token_acc": 0.8706495975584588, | |
| "train_speed(iter/s)": 0.04088 | |
| }, | |
| { | |
| "epoch": 0.13466042154566746, | |
| "grad_norm": 0.2282887101173401, | |
| "learning_rate": 1.7829457364341087e-05, | |
| "loss": 0.3752657175064087, | |
| "memory(GiB)": 137.67, | |
| "step": 115, | |
| "token_acc": 0.8784262063618629, | |
| "train_speed(iter/s)": 0.040925 | |
| }, | |
| { | |
| "epoch": 0.1405152224824356, | |
| "grad_norm": 0.23532657325267792, | |
| "learning_rate": 1.86046511627907e-05, | |
| "loss": 0.3657325029373169, | |
| "memory(GiB)": 137.67, | |
| "step": 120, | |
| "token_acc": 0.8712829028328604, | |
| "train_speed(iter/s)": 0.040965 | |
| }, | |
| { | |
| "epoch": 0.14637002341920374, | |
| "grad_norm": 0.2132922112941742, | |
| "learning_rate": 1.937984496124031e-05, | |
| "loss": 0.3799854278564453, | |
| "memory(GiB)": 137.67, | |
| "step": 125, | |
| "token_acc": 0.8649469651038509, | |
| "train_speed(iter/s)": 0.041003 | |
| }, | |
| { | |
| "epoch": 0.1522248243559719, | |
| "grad_norm": 0.2445414662361145, | |
| "learning_rate": 1.9999991663467044e-05, | |
| "loss": 0.3770766258239746, | |
| "memory(GiB)": 137.67, | |
| "step": 130, | |
| "token_acc": 0.8692484710531911, | |
| "train_speed(iter/s)": 0.041036 | |
| }, | |
| { | |
| "epoch": 0.15807962529274006, | |
| "grad_norm": 0.2305486649274826, | |
| "learning_rate": 1.9999699886272926e-05, | |
| "loss": 0.3788888931274414, | |
| "memory(GiB)": 137.67, | |
| "step": 135, | |
| "token_acc": 0.8571357490266324, | |
| "train_speed(iter/s)": 0.041054 | |
| }, | |
| { | |
| "epoch": 0.16393442622950818, | |
| "grad_norm": 0.2297585904598236, | |
| "learning_rate": 1.9998991296330317e-05, | |
| "loss": 0.3768150806427002, | |
| "memory(GiB)": 137.67, | |
| "step": 140, | |
| "token_acc": 0.8707652096887886, | |
| "train_speed(iter/s)": 0.04107 | |
| }, | |
| { | |
| "epoch": 0.16978922716627634, | |
| "grad_norm": 0.22929546236991882, | |
| "learning_rate": 1.9997865923175027e-05, | |
| "loss": 0.3672610282897949, | |
| "memory(GiB)": 137.67, | |
| "step": 145, | |
| "token_acc": 0.8764070583454463, | |
| "train_speed(iter/s)": 0.041074 | |
| }, | |
| { | |
| "epoch": 0.1756440281030445, | |
| "grad_norm": 0.2531713843345642, | |
| "learning_rate": 1.999632381371545e-05, | |
| "loss": 0.3735011577606201, | |
| "memory(GiB)": 137.67, | |
| "step": 150, | |
| "token_acc": 0.8610904473031397, | |
| "train_speed(iter/s)": 0.041095 | |
| }, | |
| { | |
| "epoch": 0.18149882903981265, | |
| "grad_norm": 0.21190133690834045, | |
| "learning_rate": 1.999436503223061e-05, | |
| "loss": 0.37088618278503416, | |
| "memory(GiB)": 137.67, | |
| "step": 155, | |
| "token_acc": 0.869811065319577, | |
| "train_speed(iter/s)": 0.0411 | |
| }, | |
| { | |
| "epoch": 0.1873536299765808, | |
| "grad_norm": 0.24962091445922852, | |
| "learning_rate": 1.9991989660367463e-05, | |
| "loss": 0.3776357650756836, | |
| "memory(GiB)": 137.67, | |
| "step": 160, | |
| "token_acc": 0.8544295113661168, | |
| "train_speed(iter/s)": 0.041107 | |
| }, | |
| { | |
| "epoch": 0.19320843091334894, | |
| "grad_norm": 0.20956465601921082, | |
| "learning_rate": 1.998919779713751e-05, | |
| "loss": 0.3805836200714111, | |
| "memory(GiB)": 137.67, | |
| "step": 165, | |
| "token_acc": 0.8613002884067936, | |
| "train_speed(iter/s)": 0.041115 | |
| }, | |
| { | |
| "epoch": 0.1990632318501171, | |
| "grad_norm": 0.206803560256958, | |
| "learning_rate": 1.998598955891266e-05, | |
| "loss": 0.3702584505081177, | |
| "memory(GiB)": 137.67, | |
| "step": 170, | |
| "token_acc": 0.8749547416575101, | |
| "train_speed(iter/s)": 0.04113 | |
| }, | |
| { | |
| "epoch": 0.20491803278688525, | |
| "grad_norm": 0.23116904497146606, | |
| "learning_rate": 1.9982365079420382e-05, | |
| "loss": 0.3598947048187256, | |
| "memory(GiB)": 137.67, | |
| "step": 175, | |
| "token_acc": 0.8684363191646153, | |
| "train_speed(iter/s)": 0.041153 | |
| }, | |
| { | |
| "epoch": 0.2107728337236534, | |
| "grad_norm": 0.22105969488620758, | |
| "learning_rate": 1.9978324509738147e-05, | |
| "loss": 0.36261582374572754, | |
| "memory(GiB)": 137.67, | |
| "step": 180, | |
| "token_acc": 0.8722339081558761, | |
| "train_speed(iter/s)": 0.041173 | |
| }, | |
| { | |
| "epoch": 0.21662763466042154, | |
| "grad_norm": 0.21819841861724854, | |
| "learning_rate": 1.9973868018287093e-05, | |
| "loss": 0.3629172325134277, | |
| "memory(GiB)": 137.67, | |
| "step": 185, | |
| "token_acc": 0.8667994850156469, | |
| "train_speed(iter/s)": 0.041195 | |
| }, | |
| { | |
| "epoch": 0.2224824355971897, | |
| "grad_norm": 0.2083064317703247, | |
| "learning_rate": 1.9968995790825048e-05, | |
| "loss": 0.3675278902053833, | |
| "memory(GiB)": 137.67, | |
| "step": 190, | |
| "token_acc": 0.8575012434717731, | |
| "train_speed(iter/s)": 0.0412 | |
| }, | |
| { | |
| "epoch": 0.22833723653395785, | |
| "grad_norm": 0.21168376505374908, | |
| "learning_rate": 1.9963708030438754e-05, | |
| "loss": 0.3663478374481201, | |
| "memory(GiB)": 137.67, | |
| "step": 195, | |
| "token_acc": 0.8699046566256736, | |
| "train_speed(iter/s)": 0.041213 | |
| }, | |
| { | |
| "epoch": 0.234192037470726, | |
| "grad_norm": 0.21624095737934113, | |
| "learning_rate": 1.995800495753542e-05, | |
| "loss": 0.36658034324645994, | |
| "memory(GiB)": 137.67, | |
| "step": 200, | |
| "token_acc": 0.8611760598068374, | |
| "train_speed(iter/s)": 0.041221 | |
| }, | |
| { | |
| "epoch": 0.24004683840749413, | |
| "grad_norm": 0.21765926480293274, | |
| "learning_rate": 1.9951886809833537e-05, | |
| "loss": 0.37610225677490233, | |
| "memory(GiB)": 137.67, | |
| "step": 205, | |
| "token_acc": 0.8608684017275929, | |
| "train_speed(iter/s)": 0.041233 | |
| }, | |
| { | |
| "epoch": 0.2459016393442623, | |
| "grad_norm": 0.21804192662239075, | |
| "learning_rate": 1.9945353842352943e-05, | |
| "loss": 0.37209372520446776, | |
| "memory(GiB)": 137.67, | |
| "step": 210, | |
| "token_acc": 0.8637638606903014, | |
| "train_speed(iter/s)": 0.041242 | |
| }, | |
| { | |
| "epoch": 0.25175644028103045, | |
| "grad_norm": 0.21353310346603394, | |
| "learning_rate": 1.9938406327404233e-05, | |
| "loss": 0.36923999786376954, | |
| "memory(GiB)": 137.67, | |
| "step": 215, | |
| "token_acc": 0.8725016214590311, | |
| "train_speed(iter/s)": 0.041259 | |
| }, | |
| { | |
| "epoch": 0.2576112412177986, | |
| "grad_norm": 0.21438100934028625, | |
| "learning_rate": 1.9931044554577373e-05, | |
| "loss": 0.36598026752471924, | |
| "memory(GiB)": 137.67, | |
| "step": 220, | |
| "token_acc": 0.8663032304289586, | |
| "train_speed(iter/s)": 0.041275 | |
| }, | |
| { | |
| "epoch": 0.26346604215456676, | |
| "grad_norm": 0.21610133349895477, | |
| "learning_rate": 1.992326883072965e-05, | |
| "loss": 0.36849284172058105, | |
| "memory(GiB)": 137.67, | |
| "step": 225, | |
| "token_acc": 0.8614589650451081, | |
| "train_speed(iter/s)": 0.041281 | |
| }, | |
| { | |
| "epoch": 0.2693208430913349, | |
| "grad_norm": 0.2203439474105835, | |
| "learning_rate": 1.991507947997287e-05, | |
| "loss": 0.3765848636627197, | |
| "memory(GiB)": 137.67, | |
| "step": 230, | |
| "token_acc": 0.8680725737864995, | |
| "train_speed(iter/s)": 0.041291 | |
| }, | |
| { | |
| "epoch": 0.275175644028103, | |
| "grad_norm": 0.22208204865455627, | |
| "learning_rate": 1.9906476843659866e-05, | |
| "loss": 0.3718143939971924, | |
| "memory(GiB)": 137.67, | |
| "step": 235, | |
| "token_acc": 0.8758277835099897, | |
| "train_speed(iter/s)": 0.041301 | |
| }, | |
| { | |
| "epoch": 0.2810304449648712, | |
| "grad_norm": 0.20069433748722076, | |
| "learning_rate": 1.989746128037024e-05, | |
| "loss": 0.3583400249481201, | |
| "memory(GiB)": 137.67, | |
| "step": 240, | |
| "token_acc": 0.8676873362719415, | |
| "train_speed(iter/s)": 0.04131 | |
| }, | |
| { | |
| "epoch": 0.28688524590163933, | |
| "grad_norm": 0.19968946278095245, | |
| "learning_rate": 1.988803316589545e-05, | |
| "loss": 0.3672914505004883, | |
| "memory(GiB)": 137.67, | |
| "step": 245, | |
| "token_acc": 0.8662484056672067, | |
| "train_speed(iter/s)": 0.041328 | |
| }, | |
| { | |
| "epoch": 0.2927400468384075, | |
| "grad_norm": 0.21298536658287048, | |
| "learning_rate": 1.987819289322311e-05, | |
| "loss": 0.3696786403656006, | |
| "memory(GiB)": 137.67, | |
| "step": 250, | |
| "token_acc": 0.8654257420775034, | |
| "train_speed(iter/s)": 0.041348 | |
| }, | |
| { | |
| "epoch": 0.29859484777517564, | |
| "grad_norm": 0.2145387828350067, | |
| "learning_rate": 1.9867940872520646e-05, | |
| "loss": 0.3744542598724365, | |
| "memory(GiB)": 137.67, | |
| "step": 255, | |
| "token_acc": 0.8661229081704401, | |
| "train_speed(iter/s)": 0.041346 | |
| }, | |
| { | |
| "epoch": 0.3044496487119438, | |
| "grad_norm": 0.2132762223482132, | |
| "learning_rate": 1.9857277531118173e-05, | |
| "loss": 0.36826577186584475, | |
| "memory(GiB)": 137.67, | |
| "step": 260, | |
| "token_acc": 0.8788229158157335, | |
| "train_speed(iter/s)": 0.041353 | |
| }, | |
| { | |
| "epoch": 0.31030444964871196, | |
| "grad_norm": 0.2133207470178604, | |
| "learning_rate": 1.9846203313490697e-05, | |
| "loss": 0.35997600555419923, | |
| "memory(GiB)": 137.67, | |
| "step": 265, | |
| "token_acc": 0.8834285319525085, | |
| "train_speed(iter/s)": 0.041363 | |
| }, | |
| { | |
| "epoch": 0.3161592505854801, | |
| "grad_norm": 0.23535007238388062, | |
| "learning_rate": 1.983471868123958e-05, | |
| "loss": 0.3588090896606445, | |
| "memory(GiB)": 137.67, | |
| "step": 270, | |
| "token_acc": 0.8657706943523579, | |
| "train_speed(iter/s)": 0.041379 | |
| }, | |
| { | |
| "epoch": 0.32201405152224827, | |
| "grad_norm": 0.21440958976745605, | |
| "learning_rate": 1.98228241130733e-05, | |
| "loss": 0.38217363357543943, | |
| "memory(GiB)": 137.67, | |
| "step": 275, | |
| "token_acc": 0.8693404501511701, | |
| "train_speed(iter/s)": 0.041386 | |
| }, | |
| { | |
| "epoch": 0.32786885245901637, | |
| "grad_norm": 0.21196675300598145, | |
| "learning_rate": 1.98105201047875e-05, | |
| "loss": 0.35698800086975097, | |
| "memory(GiB)": 137.67, | |
| "step": 280, | |
| "token_acc": 0.8743185598247525, | |
| "train_speed(iter/s)": 0.041403 | |
| }, | |
| { | |
| "epoch": 0.3337236533957845, | |
| "grad_norm": 0.22762241959571838, | |
| "learning_rate": 1.9797807169244326e-05, | |
| "loss": 0.3626487016677856, | |
| "memory(GiB)": 137.67, | |
| "step": 285, | |
| "token_acc": 0.8661923737202862, | |
| "train_speed(iter/s)": 0.041406 | |
| }, | |
| { | |
| "epoch": 0.3395784543325527, | |
| "grad_norm": 0.21537438035011292, | |
| "learning_rate": 1.9784685836351045e-05, | |
| "loss": 0.37597248554229734, | |
| "memory(GiB)": 137.67, | |
| "step": 290, | |
| "token_acc": 0.8632790864113016, | |
| "train_speed(iter/s)": 0.041408 | |
| }, | |
| { | |
| "epoch": 0.34543325526932084, | |
| "grad_norm": 0.24162794649600983, | |
| "learning_rate": 1.9771156653037944e-05, | |
| "loss": 0.3674392461776733, | |
| "memory(GiB)": 137.67, | |
| "step": 295, | |
| "token_acc": 0.86579905677273, | |
| "train_speed(iter/s)": 0.041418 | |
| }, | |
| { | |
| "epoch": 0.351288056206089, | |
| "grad_norm": 0.19127634167671204, | |
| "learning_rate": 1.975722018323556e-05, | |
| "loss": 0.3606871604919434, | |
| "memory(GiB)": 137.67, | |
| "step": 300, | |
| "token_acc": 0.8730913571244476, | |
| "train_speed(iter/s)": 0.041416 | |
| }, | |
| { | |
| "epoch": 0.35714285714285715, | |
| "grad_norm": 0.21248631179332733, | |
| "learning_rate": 1.974287700785116e-05, | |
| "loss": 0.3568113327026367, | |
| "memory(GiB)": 137.67, | |
| "step": 305, | |
| "token_acc": 0.8697051358380598, | |
| "train_speed(iter/s)": 0.041425 | |
| }, | |
| { | |
| "epoch": 0.3629976580796253, | |
| "grad_norm": 0.20225107669830322, | |
| "learning_rate": 1.9728127724744516e-05, | |
| "loss": 0.3483549118041992, | |
| "memory(GiB)": 137.67, | |
| "step": 310, | |
| "token_acc": 0.8697423969369493, | |
| "train_speed(iter/s)": 0.041425 | |
| }, | |
| { | |
| "epoch": 0.36885245901639346, | |
| "grad_norm": 0.2230818122625351, | |
| "learning_rate": 1.9712972948703006e-05, | |
| "loss": 0.36976261138916017, | |
| "memory(GiB)": 137.67, | |
| "step": 315, | |
| "token_acc": 0.8751112598082228, | |
| "train_speed(iter/s)": 0.04143 | |
| }, | |
| { | |
| "epoch": 0.3747072599531616, | |
| "grad_norm": 0.1945132613182068, | |
| "learning_rate": 1.9697413311415967e-05, | |
| "loss": 0.364810585975647, | |
| "memory(GiB)": 137.67, | |
| "step": 320, | |
| "token_acc": 0.8484778468167483, | |
| "train_speed(iter/s)": 0.041435 | |
| }, | |
| { | |
| "epoch": 0.3805620608899297, | |
| "grad_norm": 0.19989554584026337, | |
| "learning_rate": 1.9681449461448386e-05, | |
| "loss": 0.3616858959197998, | |
| "memory(GiB)": 137.67, | |
| "step": 325, | |
| "token_acc": 0.8718356506795814, | |
| "train_speed(iter/s)": 0.041435 | |
| }, | |
| { | |
| "epoch": 0.3864168618266979, | |
| "grad_norm": 0.2084866315126419, | |
| "learning_rate": 1.9665082064213856e-05, | |
| "loss": 0.36598567962646483, | |
| "memory(GiB)": 137.67, | |
| "step": 330, | |
| "token_acc": 0.8664227187552337, | |
| "train_speed(iter/s)": 0.041441 | |
| }, | |
| { | |
| "epoch": 0.39227166276346603, | |
| "grad_norm": 0.20807960629463196, | |
| "learning_rate": 1.9648311801946823e-05, | |
| "loss": 0.3633120059967041, | |
| "memory(GiB)": 137.67, | |
| "step": 335, | |
| "token_acc": 0.8659399461174416, | |
| "train_speed(iter/s)": 0.041448 | |
| }, | |
| { | |
| "epoch": 0.3981264637002342, | |
| "grad_norm": 0.21306882798671722, | |
| "learning_rate": 1.9631139373674188e-05, | |
| "loss": 0.36129164695739746, | |
| "memory(GiB)": 137.67, | |
| "step": 340, | |
| "token_acc": 0.8666773452933952, | |
| "train_speed(iter/s)": 0.04145 | |
| }, | |
| { | |
| "epoch": 0.40398126463700235, | |
| "grad_norm": 0.21947889029979706, | |
| "learning_rate": 1.9613565495186126e-05, | |
| "loss": 0.35186495780944826, | |
| "memory(GiB)": 137.67, | |
| "step": 345, | |
| "token_acc": 0.8666396689403815, | |
| "train_speed(iter/s)": 0.041463 | |
| }, | |
| { | |
| "epoch": 0.4098360655737705, | |
| "grad_norm": 0.2155865728855133, | |
| "learning_rate": 1.9595590899006288e-05, | |
| "loss": 0.3684532880783081, | |
| "memory(GiB)": 137.67, | |
| "step": 350, | |
| "token_acc": 0.8713802951875973, | |
| "train_speed(iter/s)": 0.041462 | |
| }, | |
| { | |
| "epoch": 0.41569086651053866, | |
| "grad_norm": 0.2150585651397705, | |
| "learning_rate": 1.957721633436124e-05, | |
| "loss": 0.3669363260269165, | |
| "memory(GiB)": 137.67, | |
| "step": 355, | |
| "token_acc": 0.8683417743625568, | |
| "train_speed(iter/s)": 0.041459 | |
| }, | |
| { | |
| "epoch": 0.4215456674473068, | |
| "grad_norm": 0.22773627936840057, | |
| "learning_rate": 1.9558442567149244e-05, | |
| "loss": 0.36423306465148925, | |
| "memory(GiB)": 137.67, | |
| "step": 360, | |
| "token_acc": 0.8815313637998826, | |
| "train_speed(iter/s)": 0.041467 | |
| }, | |
| { | |
| "epoch": 0.4274004683840749, | |
| "grad_norm": 0.19997937977313995, | |
| "learning_rate": 1.953927037990834e-05, | |
| "loss": 0.3707897186279297, | |
| "memory(GiB)": 137.67, | |
| "step": 365, | |
| "token_acc": 0.8580402286389447, | |
| "train_speed(iter/s)": 0.041471 | |
| }, | |
| { | |
| "epoch": 0.4332552693208431, | |
| "grad_norm": 0.21174229681491852, | |
| "learning_rate": 1.9519700571783718e-05, | |
| "loss": 0.3715445280075073, | |
| "memory(GiB)": 137.67, | |
| "step": 370, | |
| "token_acc": 0.873243385426675, | |
| "train_speed(iter/s)": 0.041468 | |
| }, | |
| { | |
| "epoch": 0.43911007025761123, | |
| "grad_norm": 0.2164727747440338, | |
| "learning_rate": 1.9499733958494405e-05, | |
| "loss": 0.36826701164245607, | |
| "memory(GiB)": 137.67, | |
| "step": 375, | |
| "token_acc": 0.8624453058192736, | |
| "train_speed(iter/s)": 0.041471 | |
| }, | |
| { | |
| "epoch": 0.4449648711943794, | |
| "grad_norm": 0.2175064980983734, | |
| "learning_rate": 1.947937137229928e-05, | |
| "loss": 0.3610344648361206, | |
| "memory(GiB)": 137.67, | |
| "step": 380, | |
| "token_acc": 0.8791143721842437, | |
| "train_speed(iter/s)": 0.041474 | |
| }, | |
| { | |
| "epoch": 0.45081967213114754, | |
| "grad_norm": 0.21257779002189636, | |
| "learning_rate": 1.9458613661962366e-05, | |
| "loss": 0.36273534297943116, | |
| "memory(GiB)": 137.67, | |
| "step": 385, | |
| "token_acc": 0.8811885856547406, | |
| "train_speed(iter/s)": 0.041479 | |
| }, | |
| { | |
| "epoch": 0.4566744730679157, | |
| "grad_norm": 0.2007063329219818, | |
| "learning_rate": 1.943746169271746e-05, | |
| "loss": 0.36213395595550535, | |
| "memory(GiB)": 137.67, | |
| "step": 390, | |
| "token_acc": 0.8793212957081934, | |
| "train_speed(iter/s)": 0.041474 | |
| }, | |
| { | |
| "epoch": 0.46252927400468385, | |
| "grad_norm": 0.1982836127281189, | |
| "learning_rate": 1.941591634623206e-05, | |
| "loss": 0.3674773693084717, | |
| "memory(GiB)": 137.67, | |
| "step": 395, | |
| "token_acc": 0.8714787014744528, | |
| "train_speed(iter/s)": 0.04148 | |
| }, | |
| { | |
| "epoch": 0.468384074941452, | |
| "grad_norm": 0.21029749512672424, | |
| "learning_rate": 1.9393978520570638e-05, | |
| "loss": 0.35383853912353513, | |
| "memory(GiB)": 137.67, | |
| "step": 400, | |
| "token_acc": 0.8725135029354207, | |
| "train_speed(iter/s)": 0.041493 | |
| }, | |
| { | |
| "epoch": 0.47423887587822017, | |
| "grad_norm": 0.2057942897081375, | |
| "learning_rate": 1.9371649130157166e-05, | |
| "loss": 0.35016608238220215, | |
| "memory(GiB)": 137.67, | |
| "step": 405, | |
| "token_acc": 0.8716170696781026, | |
| "train_speed(iter/s)": 0.041495 | |
| }, | |
| { | |
| "epoch": 0.48009367681498827, | |
| "grad_norm": 0.21962089836597443, | |
| "learning_rate": 1.9348929105737044e-05, | |
| "loss": 0.3551772117614746, | |
| "memory(GiB)": 137.67, | |
| "step": 410, | |
| "token_acc": 0.8725112535977174, | |
| "train_speed(iter/s)": 0.041495 | |
| }, | |
| { | |
| "epoch": 0.4859484777517564, | |
| "grad_norm": 0.22210708260536194, | |
| "learning_rate": 1.932581939433827e-05, | |
| "loss": 0.3688118696212769, | |
| "memory(GiB)": 137.67, | |
| "step": 415, | |
| "token_acc": 0.8727626971050538, | |
| "train_speed(iter/s)": 0.041496 | |
| }, | |
| { | |
| "epoch": 0.4918032786885246, | |
| "grad_norm": 0.21538780629634857, | |
| "learning_rate": 1.9302320959231997e-05, | |
| "loss": 0.3600668430328369, | |
| "memory(GiB)": 137.67, | |
| "step": 420, | |
| "token_acc": 0.87065663645922, | |
| "train_speed(iter/s)": 0.041499 | |
| }, | |
| { | |
| "epoch": 0.49765807962529274, | |
| "grad_norm": 0.19987384974956512, | |
| "learning_rate": 1.927843477989234e-05, | |
| "loss": 0.3570875644683838, | |
| "memory(GiB)": 137.67, | |
| "step": 425, | |
| "token_acc": 0.8845410461012411, | |
| "train_speed(iter/s)": 0.041501 | |
| }, | |
| { | |
| "epoch": 0.5035128805620609, | |
| "grad_norm": 0.20627401769161224, | |
| "learning_rate": 1.9254161851955587e-05, | |
| "loss": 0.36909596920013427, | |
| "memory(GiB)": 137.67, | |
| "step": 430, | |
| "token_acc": 0.8750783836660981, | |
| "train_speed(iter/s)": 0.041507 | |
| }, | |
| { | |
| "epoch": 0.509367681498829, | |
| "grad_norm": 0.22353969514369965, | |
| "learning_rate": 1.9229503187178694e-05, | |
| "loss": 0.36271133422851565, | |
| "memory(GiB)": 137.67, | |
| "step": 435, | |
| "token_acc": 0.8696993866195712, | |
| "train_speed(iter/s)": 0.04151 | |
| }, | |
| { | |
| "epoch": 0.5152224824355972, | |
| "grad_norm": 0.20142175257205963, | |
| "learning_rate": 1.920445981339708e-05, | |
| "loss": 0.3614756345748901, | |
| "memory(GiB)": 137.67, | |
| "step": 440, | |
| "token_acc": 0.8678934891256075, | |
| "train_speed(iter/s)": 0.041514 | |
| }, | |
| { | |
| "epoch": 0.5210772833723654, | |
| "grad_norm": 0.2189430445432663, | |
| "learning_rate": 1.9179032774481822e-05, | |
| "loss": 0.3589394330978394, | |
| "memory(GiB)": 137.67, | |
| "step": 445, | |
| "token_acc": 0.8754360673743595, | |
| "train_speed(iter/s)": 0.04152 | |
| }, | |
| { | |
| "epoch": 0.5269320843091335, | |
| "grad_norm": 0.20788422226905823, | |
| "learning_rate": 1.9153223130296125e-05, | |
| "loss": 0.3571774005889893, | |
| "memory(GiB)": 137.67, | |
| "step": 450, | |
| "token_acc": 0.8775248547087467, | |
| "train_speed(iter/s)": 0.041526 | |
| }, | |
| { | |
| "epoch": 0.5327868852459017, | |
| "grad_norm": 0.19941285252571106, | |
| "learning_rate": 1.9127031956651153e-05, | |
| "loss": 0.36058688163757324, | |
| "memory(GiB)": 137.67, | |
| "step": 455, | |
| "token_acc": 0.8748390868215994, | |
| "train_speed(iter/s)": 0.041528 | |
| }, | |
| { | |
| "epoch": 0.5386416861826698, | |
| "grad_norm": 0.20794501900672913, | |
| "learning_rate": 1.9100460345261175e-05, | |
| "loss": 0.37292046546936036, | |
| "memory(GiB)": 137.67, | |
| "step": 460, | |
| "token_acc": 0.8686192757401499, | |
| "train_speed(iter/s)": 0.04152 | |
| }, | |
| { | |
| "epoch": 0.544496487119438, | |
| "grad_norm": 0.21598728001117706, | |
| "learning_rate": 1.9073509403698062e-05, | |
| "loss": 0.3684291124343872, | |
| "memory(GiB)": 137.67, | |
| "step": 465, | |
| "token_acc": 0.8756676919995869, | |
| "train_speed(iter/s)": 0.041523 | |
| }, | |
| { | |
| "epoch": 0.550351288056206, | |
| "grad_norm": 0.21292956173419952, | |
| "learning_rate": 1.9046180255345142e-05, | |
| "loss": 0.3640902042388916, | |
| "memory(GiB)": 137.67, | |
| "step": 470, | |
| "token_acc": 0.8750558298801518, | |
| "train_speed(iter/s)": 0.041525 | |
| }, | |
| { | |
| "epoch": 0.5562060889929742, | |
| "grad_norm": 0.21117296814918518, | |
| "learning_rate": 1.9018474039350342e-05, | |
| "loss": 0.3569709062576294, | |
| "memory(GiB)": 137.67, | |
| "step": 475, | |
| "token_acc": 0.8744779663053135, | |
| "train_speed(iter/s)": 0.041525 | |
| }, | |
| { | |
| "epoch": 0.5620608899297423, | |
| "grad_norm": 0.20366835594177246, | |
| "learning_rate": 1.899039191057872e-05, | |
| "loss": 0.35825061798095703, | |
| "memory(GiB)": 137.67, | |
| "step": 480, | |
| "token_acc": 0.8689726123486041, | |
| "train_speed(iter/s)": 0.041527 | |
| }, | |
| { | |
| "epoch": 0.5679156908665105, | |
| "grad_norm": 0.1856691688299179, | |
| "learning_rate": 1.8961935039564338e-05, | |
| "loss": 0.35746235847473146, | |
| "memory(GiB)": 137.67, | |
| "step": 485, | |
| "token_acc": 0.8688354549740689, | |
| "train_speed(iter/s)": 0.041532 | |
| }, | |
| { | |
| "epoch": 0.5737704918032787, | |
| "grad_norm": 0.23608598113059998, | |
| "learning_rate": 1.8933104612461454e-05, | |
| "loss": 0.35999622344970705, | |
| "memory(GiB)": 137.67, | |
| "step": 490, | |
| "token_acc": 0.8696445021552469, | |
| "train_speed(iter/s)": 0.041533 | |
| }, | |
| { | |
| "epoch": 0.5796252927400468, | |
| "grad_norm": 0.2125530242919922, | |
| "learning_rate": 1.8903901830995093e-05, | |
| "loss": 0.3631314754486084, | |
| "memory(GiB)": 137.67, | |
| "step": 495, | |
| "token_acc": 0.8666599882919743, | |
| "train_speed(iter/s)": 0.041531 | |
| }, | |
| { | |
| "epoch": 0.585480093676815, | |
| "grad_norm": 0.20335227251052856, | |
| "learning_rate": 1.8874327912410945e-05, | |
| "loss": 0.37455101013183595, | |
| "memory(GiB)": 137.67, | |
| "step": 500, | |
| "token_acc": 0.8691201544556442, | |
| "train_speed(iter/s)": 0.041538 | |
| }, | |
| { | |
| "epoch": 0.5913348946135831, | |
| "grad_norm": 0.2046995759010315, | |
| "learning_rate": 1.884438408942463e-05, | |
| "loss": 0.361937952041626, | |
| "memory(GiB)": 137.67, | |
| "step": 505, | |
| "token_acc": 0.8581575277197544, | |
| "train_speed(iter/s)": 0.041539 | |
| }, | |
| { | |
| "epoch": 0.5971896955503513, | |
| "grad_norm": 0.17991533875465393, | |
| "learning_rate": 1.881407161017033e-05, | |
| "loss": 0.35659379959106446, | |
| "memory(GiB)": 137.67, | |
| "step": 510, | |
| "token_acc": 0.8789336760280843, | |
| "train_speed(iter/s)": 0.041545 | |
| }, | |
| { | |
| "epoch": 0.6030444964871194, | |
| "grad_norm": 0.24344618618488312, | |
| "learning_rate": 1.8783391738148738e-05, | |
| "loss": 0.35185072422027586, | |
| "memory(GiB)": 137.67, | |
| "step": 515, | |
| "token_acc": 0.8730951113338136, | |
| "train_speed(iter/s)": 0.04155 | |
| }, | |
| { | |
| "epoch": 0.6088992974238876, | |
| "grad_norm": 0.21754887700080872, | |
| "learning_rate": 1.875234575217441e-05, | |
| "loss": 0.3508215665817261, | |
| "memory(GiB)": 137.67, | |
| "step": 520, | |
| "token_acc": 0.872153412139793, | |
| "train_speed(iter/s)": 0.041554 | |
| }, | |
| { | |
| "epoch": 0.6147540983606558, | |
| "grad_norm": 0.18687933683395386, | |
| "learning_rate": 1.8720934946322466e-05, | |
| "loss": 0.3653162240982056, | |
| "memory(GiB)": 137.67, | |
| "step": 525, | |
| "token_acc": 0.8658395285187296, | |
| "train_speed(iter/s)": 0.041556 | |
| }, | |
| { | |
| "epoch": 0.6206088992974239, | |
| "grad_norm": 0.1791500300168991, | |
| "learning_rate": 1.8689160629874622e-05, | |
| "loss": 0.3357256889343262, | |
| "memory(GiB)": 137.67, | |
| "step": 530, | |
| "token_acc": 0.8864503516899346, | |
| "train_speed(iter/s)": 0.041553 | |
| }, | |
| { | |
| "epoch": 0.6264637002341921, | |
| "grad_norm": 0.18553608655929565, | |
| "learning_rate": 1.865702412726465e-05, | |
| "loss": 0.34752044677734373, | |
| "memory(GiB)": 137.67, | |
| "step": 535, | |
| "token_acc": 0.882398003852215, | |
| "train_speed(iter/s)": 0.041558 | |
| }, | |
| { | |
| "epoch": 0.6323185011709602, | |
| "grad_norm": 0.19252535700798035, | |
| "learning_rate": 1.8624526778023142e-05, | |
| "loss": 0.3493391513824463, | |
| "memory(GiB)": 137.67, | |
| "step": 540, | |
| "token_acc": 0.8799156751797872, | |
| "train_speed(iter/s)": 0.04156 | |
| }, | |
| { | |
| "epoch": 0.6381733021077284, | |
| "grad_norm": 0.1979398876428604, | |
| "learning_rate": 1.85916699367217e-05, | |
| "loss": 0.35185253620147705, | |
| "memory(GiB)": 137.67, | |
| "step": 545, | |
| "token_acc": 0.8728044652187243, | |
| "train_speed(iter/s)": 0.041561 | |
| }, | |
| { | |
| "epoch": 0.6440281030444965, | |
| "grad_norm": 0.19005604088306427, | |
| "learning_rate": 1.855845497291646e-05, | |
| "loss": 0.3633576393127441, | |
| "memory(GiB)": 137.67, | |
| "step": 550, | |
| "token_acc": 0.8699871784073149, | |
| "train_speed(iter/s)": 0.041564 | |
| }, | |
| { | |
| "epoch": 0.6498829039812647, | |
| "grad_norm": 0.1815745234489441, | |
| "learning_rate": 1.8524883271091004e-05, | |
| "loss": 0.35262117385864256, | |
| "memory(GiB)": 137.67, | |
| "step": 555, | |
| "token_acc": 0.8783439310264622, | |
| "train_speed(iter/s)": 0.041562 | |
| }, | |
| { | |
| "epoch": 0.6557377049180327, | |
| "grad_norm": 0.17770066857337952, | |
| "learning_rate": 1.8490956230598668e-05, | |
| "loss": 0.3713988780975342, | |
| "memory(GiB)": 137.67, | |
| "step": 560, | |
| "token_acc": 0.8711786567892583, | |
| "train_speed(iter/s)": 0.041563 | |
| }, | |
| { | |
| "epoch": 0.6615925058548009, | |
| "grad_norm": 0.19120706617832184, | |
| "learning_rate": 1.8456675265604183e-05, | |
| "loss": 0.35135421752929685, | |
| "memory(GiB)": 137.67, | |
| "step": 565, | |
| "token_acc": 0.8704644071404868, | |
| "train_speed(iter/s)": 0.041568 | |
| }, | |
| { | |
| "epoch": 0.667447306791569, | |
| "grad_norm": 0.22995422780513763, | |
| "learning_rate": 1.842204180502476e-05, | |
| "loss": 0.3541764974594116, | |
| "memory(GiB)": 137.67, | |
| "step": 570, | |
| "token_acc": 0.8800552885370527, | |
| "train_speed(iter/s)": 0.04157 | |
| }, | |
| { | |
| "epoch": 0.6733021077283372, | |
| "grad_norm": 0.23910608887672424, | |
| "learning_rate": 1.8387057292470517e-05, | |
| "loss": 0.3688697576522827, | |
| "memory(GiB)": 137.67, | |
| "step": 575, | |
| "token_acc": 0.8699386694063074, | |
| "train_speed(iter/s)": 0.041571 | |
| }, | |
| { | |
| "epoch": 0.6791569086651054, | |
| "grad_norm": 0.18881316483020782, | |
| "learning_rate": 1.8351723186184295e-05, | |
| "loss": 0.358310866355896, | |
| "memory(GiB)": 137.67, | |
| "step": 580, | |
| "token_acc": 0.861880756666604, | |
| "train_speed(iter/s)": 0.041574 | |
| }, | |
| { | |
| "epoch": 0.6850117096018735, | |
| "grad_norm": 0.19772037863731384, | |
| "learning_rate": 1.8316040958980896e-05, | |
| "loss": 0.3566863536834717, | |
| "memory(GiB)": 137.67, | |
| "step": 585, | |
| "token_acc": 0.8841636264650852, | |
| "train_speed(iter/s)": 0.041578 | |
| }, | |
| { | |
| "epoch": 0.6908665105386417, | |
| "grad_norm": 0.20680150389671326, | |
| "learning_rate": 1.828001209818567e-05, | |
| "loss": 0.37308592796325685, | |
| "memory(GiB)": 137.67, | |
| "step": 590, | |
| "token_acc": 0.8693373139559628, | |
| "train_speed(iter/s)": 0.041581 | |
| }, | |
| { | |
| "epoch": 0.6967213114754098, | |
| "grad_norm": 0.21996839344501495, | |
| "learning_rate": 1.8243638105572547e-05, | |
| "loss": 0.3568426132202148, | |
| "memory(GiB)": 137.67, | |
| "step": 595, | |
| "token_acc": 0.8781027202445839, | |
| "train_speed(iter/s)": 0.041584 | |
| }, | |
| { | |
| "epoch": 0.702576112412178, | |
| "grad_norm": 0.19068636000156403, | |
| "learning_rate": 1.82069204973014e-05, | |
| "loss": 0.3520241975784302, | |
| "memory(GiB)": 137.67, | |
| "step": 600, | |
| "token_acc": 0.8848490938723728, | |
| "train_speed(iter/s)": 0.041592 | |
| }, | |
| { | |
| "epoch": 0.7084309133489461, | |
| "grad_norm": 0.19711260497570038, | |
| "learning_rate": 1.816986080385489e-05, | |
| "loss": 0.3704382419586182, | |
| "memory(GiB)": 137.67, | |
| "step": 605, | |
| "token_acc": 0.8542210685487001, | |
| "train_speed(iter/s)": 0.041592 | |
| }, | |
| { | |
| "epoch": 0.7142857142857143, | |
| "grad_norm": 0.2009887397289276, | |
| "learning_rate": 1.813246056997465e-05, | |
| "loss": 0.35552153587341306, | |
| "memory(GiB)": 137.67, | |
| "step": 610, | |
| "token_acc": 0.8681636421482087, | |
| "train_speed(iter/s)": 0.041595 | |
| }, | |
| { | |
| "epoch": 0.7201405152224825, | |
| "grad_norm": 0.2012893706560135, | |
| "learning_rate": 1.809472135459688e-05, | |
| "loss": 0.3568307399749756, | |
| "memory(GiB)": 137.67, | |
| "step": 615, | |
| "token_acc": 0.8715069766273564, | |
| "train_speed(iter/s)": 0.041596 | |
| }, | |
| { | |
| "epoch": 0.7259953161592506, | |
| "grad_norm": 0.19377882778644562, | |
| "learning_rate": 1.8056644730787412e-05, | |
| "loss": 0.3658033847808838, | |
| "memory(GiB)": 137.67, | |
| "step": 620, | |
| "token_acc": 0.8766388014057431, | |
| "train_speed(iter/s)": 0.041603 | |
| }, | |
| { | |
| "epoch": 0.7318501170960188, | |
| "grad_norm": 0.21672694385051727, | |
| "learning_rate": 1.8018232285676092e-05, | |
| "loss": 0.34650683403015137, | |
| "memory(GiB)": 137.67, | |
| "step": 625, | |
| "token_acc": 0.8730951833381114, | |
| "train_speed(iter/s)": 0.041609 | |
| }, | |
| { | |
| "epoch": 0.7377049180327869, | |
| "grad_norm": 0.20295600593090057, | |
| "learning_rate": 1.797948562039066e-05, | |
| "loss": 0.36364593505859377, | |
| "memory(GiB)": 137.67, | |
| "step": 630, | |
| "token_acc": 0.8673425158178014, | |
| "train_speed(iter/s)": 0.041604 | |
| }, | |
| { | |
| "epoch": 0.7435597189695551, | |
| "grad_norm": 0.20888152718544006, | |
| "learning_rate": 1.7940406349989987e-05, | |
| "loss": 0.3600362777709961, | |
| "memory(GiB)": 137.67, | |
| "step": 635, | |
| "token_acc": 0.8697917646394914, | |
| "train_speed(iter/s)": 0.04161 | |
| }, | |
| { | |
| "epoch": 0.7494145199063232, | |
| "grad_norm": 0.18725119531154633, | |
| "learning_rate": 1.7900996103396772e-05, | |
| "loss": 0.3525946617126465, | |
| "memory(GiB)": 137.67, | |
| "step": 640, | |
| "token_acc": 0.8778969516256544, | |
| "train_speed(iter/s)": 0.04161 | |
| }, | |
| { | |
| "epoch": 0.7552693208430913, | |
| "grad_norm": 0.2023143470287323, | |
| "learning_rate": 1.7861256523329634e-05, | |
| "loss": 0.35059380531311035, | |
| "memory(GiB)": 137.67, | |
| "step": 645, | |
| "token_acc": 0.867270463741052, | |
| "train_speed(iter/s)": 0.041608 | |
| }, | |
| { | |
| "epoch": 0.7611241217798594, | |
| "grad_norm": 0.18495850265026093, | |
| "learning_rate": 1.7821189266234647e-05, | |
| "loss": 0.35591151714324953, | |
| "memory(GiB)": 137.67, | |
| "step": 650, | |
| "token_acc": 0.8691064057960171, | |
| "train_speed(iter/s)": 0.041607 | |
| }, | |
| { | |
| "epoch": 0.7669789227166276, | |
| "grad_norm": 0.19239366054534912, | |
| "learning_rate": 1.7780796002216285e-05, | |
| "loss": 0.3489703893661499, | |
| "memory(GiB)": 137.67, | |
| "step": 655, | |
| "token_acc": 0.8661729229440642, | |
| "train_speed(iter/s)": 0.041609 | |
| }, | |
| { | |
| "epoch": 0.7728337236533958, | |
| "grad_norm": 0.19033724069595337, | |
| "learning_rate": 1.7740078414967817e-05, | |
| "loss": 0.35645670890808107, | |
| "memory(GiB)": 137.67, | |
| "step": 660, | |
| "token_acc": 0.8801652115008279, | |
| "train_speed(iter/s)": 0.041611 | |
| }, | |
| { | |
| "epoch": 0.7786885245901639, | |
| "grad_norm": 0.1858055591583252, | |
| "learning_rate": 1.7699038201701132e-05, | |
| "loss": 0.3495974063873291, | |
| "memory(GiB)": 137.67, | |
| "step": 665, | |
| "token_acc": 0.86732774248516, | |
| "train_speed(iter/s)": 0.041614 | |
| }, | |
| { | |
| "epoch": 0.7845433255269321, | |
| "grad_norm": 0.19249401986598969, | |
| "learning_rate": 1.7657677073075968e-05, | |
| "loss": 0.35628108978271483, | |
| "memory(GiB)": 137.67, | |
| "step": 670, | |
| "token_acc": 0.8711122587710429, | |
| "train_speed(iter/s)": 0.041616 | |
| }, | |
| { | |
| "epoch": 0.7903981264637002, | |
| "grad_norm": 0.1897304505109787, | |
| "learning_rate": 1.761599675312864e-05, | |
| "loss": 0.3588160514831543, | |
| "memory(GiB)": 137.67, | |
| "step": 675, | |
| "token_acc": 0.8833087010138474, | |
| "train_speed(iter/s)": 0.041616 | |
| }, | |
| { | |
| "epoch": 0.7962529274004684, | |
| "grad_norm": 0.19034340977668762, | |
| "learning_rate": 1.7573998979200163e-05, | |
| "loss": 0.3528533935546875, | |
| "memory(GiB)": 137.67, | |
| "step": 680, | |
| "token_acc": 0.873974659902577, | |
| "train_speed(iter/s)": 0.04162 | |
| }, | |
| { | |
| "epoch": 0.8021077283372365, | |
| "grad_norm": 0.17828524112701416, | |
| "learning_rate": 1.753168550186383e-05, | |
| "loss": 0.36130833625793457, | |
| "memory(GiB)": 137.67, | |
| "step": 685, | |
| "token_acc": 0.8767166579575643, | |
| "train_speed(iter/s)": 0.041622 | |
| }, | |
| { | |
| "epoch": 0.8079625292740047, | |
| "grad_norm": 0.18225735425949097, | |
| "learning_rate": 1.7489058084852247e-05, | |
| "loss": 0.3559986114501953, | |
| "memory(GiB)": 137.67, | |
| "step": 690, | |
| "token_acc": 0.8664611837818874, | |
| "train_speed(iter/s)": 0.041619 | |
| }, | |
| { | |
| "epoch": 0.8138173302107728, | |
| "grad_norm": 0.17824020981788635, | |
| "learning_rate": 1.744611850498383e-05, | |
| "loss": 0.3519934415817261, | |
| "memory(GiB)": 137.67, | |
| "step": 695, | |
| "token_acc": 0.8767726421318924, | |
| "train_speed(iter/s)": 0.04162 | |
| }, | |
| { | |
| "epoch": 0.819672131147541, | |
| "grad_norm": 0.19619260728359222, | |
| "learning_rate": 1.7402868552088724e-05, | |
| "loss": 0.34758720397949217, | |
| "memory(GiB)": 137.67, | |
| "step": 700, | |
| "token_acc": 0.8710738168196693, | |
| "train_speed(iter/s)": 0.041621 | |
| }, | |
| { | |
| "epoch": 0.8255269320843092, | |
| "grad_norm": 0.20193175971508026, | |
| "learning_rate": 1.73593100289342e-05, | |
| "loss": 0.3554750919342041, | |
| "memory(GiB)": 137.67, | |
| "step": 705, | |
| "token_acc": 0.8680475894967122, | |
| "train_speed(iter/s)": 0.041625 | |
| }, | |
| { | |
| "epoch": 0.8313817330210773, | |
| "grad_norm": 0.17672231793403625, | |
| "learning_rate": 1.7315444751149533e-05, | |
| "loss": 0.3531287670135498, | |
| "memory(GiB)": 137.67, | |
| "step": 710, | |
| "token_acc": 0.8739113086739942, | |
| "train_speed(iter/s)": 0.041629 | |
| }, | |
| { | |
| "epoch": 0.8372365339578455, | |
| "grad_norm": 0.18640753626823425, | |
| "learning_rate": 1.727127454715029e-05, | |
| "loss": 0.3531001329421997, | |
| "memory(GiB)": 137.67, | |
| "step": 715, | |
| "token_acc": 0.8807271048387348, | |
| "train_speed(iter/s)": 0.041632 | |
| }, | |
| { | |
| "epoch": 0.8430913348946136, | |
| "grad_norm": 0.18654407560825348, | |
| "learning_rate": 1.722680125806214e-05, | |
| "loss": 0.3535622119903564, | |
| "memory(GiB)": 137.67, | |
| "step": 720, | |
| "token_acc": 0.8664340845361018, | |
| "train_speed(iter/s)": 0.041633 | |
| }, | |
| { | |
| "epoch": 0.8489461358313818, | |
| "grad_norm": 0.19616912305355072, | |
| "learning_rate": 1.71820267376441e-05, | |
| "loss": 0.357543420791626, | |
| "memory(GiB)": 137.67, | |
| "step": 725, | |
| "token_acc": 0.8723300758960031, | |
| "train_speed(iter/s)": 0.041635 | |
| }, | |
| { | |
| "epoch": 0.8548009367681498, | |
| "grad_norm": 0.1865251064300537, | |
| "learning_rate": 1.7136952852211274e-05, | |
| "loss": 0.36123013496398926, | |
| "memory(GiB)": 137.67, | |
| "step": 730, | |
| "token_acc": 0.8610691821941981, | |
| "train_speed(iter/s)": 0.041638 | |
| }, | |
| { | |
| "epoch": 0.860655737704918, | |
| "grad_norm": 0.1886809915304184, | |
| "learning_rate": 1.7091581480557057e-05, | |
| "loss": 0.34960460662841797, | |
| "memory(GiB)": 137.67, | |
| "step": 735, | |
| "token_acc": 0.8703787498166635, | |
| "train_speed(iter/s)": 0.041639 | |
| }, | |
| { | |
| "epoch": 0.8665105386416861, | |
| "grad_norm": 0.19691921770572662, | |
| "learning_rate": 1.7045914513874815e-05, | |
| "loss": 0.3618565320968628, | |
| "memory(GiB)": 137.67, | |
| "step": 740, | |
| "token_acc": 0.8702042368549021, | |
| "train_speed(iter/s)": 0.041645 | |
| }, | |
| { | |
| "epoch": 0.8723653395784543, | |
| "grad_norm": 0.18920762836933136, | |
| "learning_rate": 1.699995385567907e-05, | |
| "loss": 0.3643482685089111, | |
| "memory(GiB)": 137.67, | |
| "step": 745, | |
| "token_acc": 0.8619865320910651, | |
| "train_speed(iter/s)": 0.041651 | |
| }, | |
| { | |
| "epoch": 0.8782201405152225, | |
| "grad_norm": 0.19481435418128967, | |
| "learning_rate": 1.695370142172614e-05, | |
| "loss": 0.3560521602630615, | |
| "memory(GiB)": 137.67, | |
| "step": 750, | |
| "token_acc": 0.8686031511447322, | |
| "train_speed(iter/s)": 0.041651 | |
| }, | |
| { | |
| "epoch": 0.8840749414519906, | |
| "grad_norm": 0.19207534193992615, | |
| "learning_rate": 1.690715913993429e-05, | |
| "loss": 0.3591322422027588, | |
| "memory(GiB)": 137.67, | |
| "step": 755, | |
| "token_acc": 0.8719703155846309, | |
| "train_speed(iter/s)": 0.041652 | |
| }, | |
| { | |
| "epoch": 0.8899297423887588, | |
| "grad_norm": 0.20057600736618042, | |
| "learning_rate": 1.6860328950303392e-05, | |
| "loss": 0.3394715070724487, | |
| "memory(GiB)": 137.67, | |
| "step": 760, | |
| "token_acc": 0.8781381296322522, | |
| "train_speed(iter/s)": 0.041655 | |
| }, | |
| { | |
| "epoch": 0.8957845433255269, | |
| "grad_norm": 0.19081991910934448, | |
| "learning_rate": 1.6813212804834033e-05, | |
| "loss": 0.3552083015441895, | |
| "memory(GiB)": 137.67, | |
| "step": 765, | |
| "token_acc": 0.8649747738343772, | |
| "train_speed(iter/s)": 0.041656 | |
| }, | |
| { | |
| "epoch": 0.9016393442622951, | |
| "grad_norm": 0.17996545135974884, | |
| "learning_rate": 1.676581266744615e-05, | |
| "loss": 0.3466797828674316, | |
| "memory(GiB)": 137.67, | |
| "step": 770, | |
| "token_acc": 0.8719778029670782, | |
| "train_speed(iter/s)": 0.041659 | |
| }, | |
| { | |
| "epoch": 0.9074941451990632, | |
| "grad_norm": 0.18470925092697144, | |
| "learning_rate": 1.6718130513897207e-05, | |
| "loss": 0.34652736186981203, | |
| "memory(GiB)": 137.67, | |
| "step": 775, | |
| "token_acc": 0.8761688115825458, | |
| "train_speed(iter/s)": 0.041661 | |
| }, | |
| { | |
| "epoch": 0.9133489461358314, | |
| "grad_norm": 0.1838730424642563, | |
| "learning_rate": 1.667016833169979e-05, | |
| "loss": 0.3616307258605957, | |
| "memory(GiB)": 137.67, | |
| "step": 780, | |
| "token_acc": 0.8749988214255409, | |
| "train_speed(iter/s)": 0.041664 | |
| }, | |
| { | |
| "epoch": 0.9192037470725996, | |
| "grad_norm": 0.1882750242948532, | |
| "learning_rate": 1.6621928120038806e-05, | |
| "loss": 0.35453338623046876, | |
| "memory(GiB)": 137.67, | |
| "step": 785, | |
| "token_acc": 0.8650788191817312, | |
| "train_speed(iter/s)": 0.041666 | |
| }, | |
| { | |
| "epoch": 0.9250585480093677, | |
| "grad_norm": 0.18011753261089325, | |
| "learning_rate": 1.657341188968811e-05, | |
| "loss": 0.3467398166656494, | |
| "memory(GiB)": 137.67, | |
| "step": 790, | |
| "token_acc": 0.8665571597898215, | |
| "train_speed(iter/s)": 0.041668 | |
| }, | |
| { | |
| "epoch": 0.9309133489461359, | |
| "grad_norm": 0.1889754831790924, | |
| "learning_rate": 1.6524621662926733e-05, | |
| "loss": 0.34622554779052733, | |
| "memory(GiB)": 137.67, | |
| "step": 795, | |
| "token_acc": 0.8836526658483215, | |
| "train_speed(iter/s)": 0.041671 | |
| }, | |
| { | |
| "epoch": 0.936768149882904, | |
| "grad_norm": 0.17811700701713562, | |
| "learning_rate": 1.6475559473454558e-05, | |
| "loss": 0.35440659523010254, | |
| "memory(GiB)": 137.67, | |
| "step": 800, | |
| "token_acc": 0.8802437890929187, | |
| "train_speed(iter/s)": 0.041672 | |
| }, | |
| { | |
| "epoch": 0.9426229508196722, | |
| "grad_norm": 0.19011390209197998, | |
| "learning_rate": 1.6426227366307563e-05, | |
| "loss": 0.3580695629119873, | |
| "memory(GiB)": 137.67, | |
| "step": 805, | |
| "token_acc": 0.8808476204925909, | |
| "train_speed(iter/s)": 0.04167 | |
| }, | |
| { | |
| "epoch": 0.9484777517564403, | |
| "grad_norm": 0.18688787519931793, | |
| "learning_rate": 1.6376627397772576e-05, | |
| "loss": 0.35615901947021483, | |
| "memory(GiB)": 137.67, | |
| "step": 810, | |
| "token_acc": 0.8656951211518713, | |
| "train_speed(iter/s)": 0.04167 | |
| }, | |
| { | |
| "epoch": 0.9543325526932084, | |
| "grad_norm": 0.19855861365795135, | |
| "learning_rate": 1.6326761635301572e-05, | |
| "loss": 0.3505072116851807, | |
| "memory(GiB)": 137.67, | |
| "step": 815, | |
| "token_acc": 0.8734695802546769, | |
| "train_speed(iter/s)": 0.041672 | |
| }, | |
| { | |
| "epoch": 0.9601873536299765, | |
| "grad_norm": 0.18500158190727234, | |
| "learning_rate": 1.6276632157425475e-05, | |
| "loss": 0.35810859203338624, | |
| "memory(GiB)": 137.67, | |
| "step": 820, | |
| "token_acc": 0.8688002942074786, | |
| "train_speed(iter/s)": 0.041672 | |
| }, | |
| { | |
| "epoch": 0.9660421545667447, | |
| "grad_norm": 0.2135351300239563, | |
| "learning_rate": 1.6226241053667536e-05, | |
| "loss": 0.3624737739562988, | |
| "memory(GiB)": 137.67, | |
| "step": 825, | |
| "token_acc": 0.8650754688071645, | |
| "train_speed(iter/s)": 0.041674 | |
| }, | |
| { | |
| "epoch": 0.9718969555035128, | |
| "grad_norm": 0.188192680478096, | |
| "learning_rate": 1.617559042445625e-05, | |
| "loss": 0.3624725818634033, | |
| "memory(GiB)": 137.67, | |
| "step": 830, | |
| "token_acc": 0.8755614748176581, | |
| "train_speed(iter/s)": 0.041674 | |
| }, | |
| { | |
| "epoch": 0.977751756440281, | |
| "grad_norm": 0.34307366609573364, | |
| "learning_rate": 1.6124682381037767e-05, | |
| "loss": 0.34985201358795165, | |
| "memory(GiB)": 137.67, | |
| "step": 835, | |
| "token_acc": 0.8732973013596538, | |
| "train_speed(iter/s)": 0.041675 | |
| }, | |
| { | |
| "epoch": 0.9836065573770492, | |
| "grad_norm": 0.19902247190475464, | |
| "learning_rate": 1.607351904538792e-05, | |
| "loss": 0.3641986846923828, | |
| "memory(GiB)": 137.67, | |
| "step": 840, | |
| "token_acc": 0.8725000467718097, | |
| "train_speed(iter/s)": 0.041673 | |
| }, | |
| { | |
| "epoch": 0.9894613583138173, | |
| "grad_norm": 0.18375855684280396, | |
| "learning_rate": 1.6022102550123775e-05, | |
| "loss": 0.3507267951965332, | |
| "memory(GiB)": 137.67, | |
| "step": 845, | |
| "token_acc": 0.868225976538805, | |
| "train_speed(iter/s)": 0.041674 | |
| }, | |
| { | |
| "epoch": 0.9953161592505855, | |
| "grad_norm": 0.19543269276618958, | |
| "learning_rate": 1.597043503841471e-05, | |
| "loss": 0.3511422395706177, | |
| "memory(GiB)": 137.67, | |
| "step": 850, | |
| "token_acc": 0.8818226402481499, | |
| "train_speed(iter/s)": 0.041674 | |
| }, | |
| { | |
| "epoch": 1.0011709601873535, | |
| "grad_norm": 0.2594313323497772, | |
| "learning_rate": 1.5918518663893124e-05, | |
| "loss": 0.3436767339706421, | |
| "memory(GiB)": 137.67, | |
| "step": 855, | |
| "token_acc": 0.8783253667380914, | |
| "train_speed(iter/s)": 0.041472 | |
| }, | |
| { | |
| "epoch": 1.0070257611241218, | |
| "grad_norm": 0.21433798968791962, | |
| "learning_rate": 1.5866355590564637e-05, | |
| "loss": 0.31752333641052244, | |
| "memory(GiB)": 137.67, | |
| "step": 860, | |
| "token_acc": 0.8950932956103179, | |
| "train_speed(iter/s)": 0.041464 | |
| }, | |
| { | |
| "epoch": 1.0128805620608898, | |
| "grad_norm": 0.20641100406646729, | |
| "learning_rate": 1.5813947992717894e-05, | |
| "loss": 0.3059502601623535, | |
| "memory(GiB)": 137.67, | |
| "step": 865, | |
| "token_acc": 0.8851299275012688, | |
| "train_speed(iter/s)": 0.041456 | |
| }, | |
| { | |
| "epoch": 1.018735362997658, | |
| "grad_norm": 0.2776026427745819, | |
| "learning_rate": 1.5761298054833947e-05, | |
| "loss": 0.31491961479187014, | |
| "memory(GiB)": 137.67, | |
| "step": 870, | |
| "token_acc": 0.8871431849329935, | |
| "train_speed(iter/s)": 0.041446 | |
| }, | |
| { | |
| "epoch": 1.0245901639344261, | |
| "grad_norm": 0.2104882299900055, | |
| "learning_rate": 1.5708407971495195e-05, | |
| "loss": 0.3215550422668457, | |
| "memory(GiB)": 137.67, | |
| "step": 875, | |
| "token_acc": 0.8840142068123856, | |
| "train_speed(iter/s)": 0.041441 | |
| }, | |
| { | |
| "epoch": 1.0304449648711944, | |
| "grad_norm": 0.2141922563314438, | |
| "learning_rate": 1.565527994729389e-05, | |
| "loss": 0.31157307624816893, | |
| "memory(GiB)": 137.67, | |
| "step": 880, | |
| "token_acc": 0.8925077955478237, | |
| "train_speed(iter/s)": 0.041435 | |
| }, | |
| { | |
| "epoch": 1.0362997658079625, | |
| "grad_norm": 0.19829437136650085, | |
| "learning_rate": 1.5601916196740283e-05, | |
| "loss": 0.30809755325317384, | |
| "memory(GiB)": 137.67, | |
| "step": 885, | |
| "token_acc": 0.890301896874165, | |
| "train_speed(iter/s)": 0.04143 | |
| }, | |
| { | |
| "epoch": 1.0421545667447307, | |
| "grad_norm": 0.1938631683588028, | |
| "learning_rate": 1.5548318944170276e-05, | |
| "loss": 0.30415992736816405, | |
| "memory(GiB)": 137.67, | |
| "step": 890, | |
| "token_acc": 0.8950597362393585, | |
| "train_speed(iter/s)": 0.041423 | |
| }, | |
| { | |
| "epoch": 1.0480093676814988, | |
| "grad_norm": 0.18822869658470154, | |
| "learning_rate": 1.5494490423652732e-05, | |
| "loss": 0.30409889221191405, | |
| "memory(GiB)": 137.67, | |
| "step": 895, | |
| "token_acc": 0.8878764647902749, | |
| "train_speed(iter/s)": 0.041414 | |
| }, | |
| { | |
| "epoch": 1.053864168618267, | |
| "grad_norm": 0.18639546632766724, | |
| "learning_rate": 1.544043287889635e-05, | |
| "loss": 0.29631519317626953, | |
| "memory(GiB)": 137.67, | |
| "step": 900, | |
| "token_acc": 0.8972942289498581, | |
| "train_speed(iter/s)": 0.041408 | |
| }, | |
| { | |
| "epoch": 1.059718969555035, | |
| "grad_norm": 0.19313958287239075, | |
| "learning_rate": 1.538614856315614e-05, | |
| "loss": 0.3089482307434082, | |
| "memory(GiB)": 137.67, | |
| "step": 905, | |
| "token_acc": 0.8947345206627453, | |
| "train_speed(iter/s)": 0.041403 | |
| }, | |
| { | |
| "epoch": 1.0655737704918034, | |
| "grad_norm": 0.1918047070503235, | |
| "learning_rate": 1.5331639739139477e-05, | |
| "loss": 0.30376482009887695, | |
| "memory(GiB)": 137.67, | |
| "step": 910, | |
| "token_acc": 0.878863108904361, | |
| "train_speed(iter/s)": 0.041394 | |
| }, | |
| { | |
| "epoch": 1.0714285714285714, | |
| "grad_norm": 0.17692717909812927, | |
| "learning_rate": 1.5276908678911837e-05, | |
| "loss": 0.3011662006378174, | |
| "memory(GiB)": 137.67, | |
| "step": 915, | |
| "token_acc": 0.8932026746024828, | |
| "train_speed(iter/s)": 0.041388 | |
| }, | |
| { | |
| "epoch": 1.0772833723653397, | |
| "grad_norm": 0.1763262152671814, | |
| "learning_rate": 1.5221957663802043e-05, | |
| "loss": 0.31141071319580077, | |
| "memory(GiB)": 137.67, | |
| "step": 920, | |
| "token_acc": 0.8920435427389305, | |
| "train_speed(iter/s)": 0.041376 | |
| }, | |
| { | |
| "epoch": 1.0831381733021077, | |
| "grad_norm": 0.1730634868144989, | |
| "learning_rate": 1.5166788984307204e-05, | |
| "loss": 0.3161822557449341, | |
| "memory(GiB)": 137.67, | |
| "step": 925, | |
| "token_acc": 0.8866250173014735, | |
| "train_speed(iter/s)": 0.041367 | |
| }, | |
| { | |
| "epoch": 1.088992974238876, | |
| "grad_norm": 0.20834501087665558, | |
| "learning_rate": 1.5111404939997227e-05, | |
| "loss": 0.3130020618438721, | |
| "memory(GiB)": 137.67, | |
| "step": 930, | |
| "token_acc": 0.8872231505297611, | |
| "train_speed(iter/s)": 0.04136 | |
| }, | |
| { | |
| "epoch": 1.094847775175644, | |
| "grad_norm": 0.20543096959590912, | |
| "learning_rate": 1.5055807839418966e-05, | |
| "loss": 0.29431891441345215, | |
| "memory(GiB)": 137.67, | |
| "step": 935, | |
| "token_acc": 0.8923718607539866, | |
| "train_speed(iter/s)": 0.041352 | |
| }, | |
| { | |
| "epoch": 1.100702576112412, | |
| "grad_norm": 0.1818283647298813, | |
| "learning_rate": 1.5000000000000002e-05, | |
| "loss": 0.31560554504394533, | |
| "memory(GiB)": 137.67, | |
| "step": 940, | |
| "token_acc": 0.8944428660187143, | |
| "train_speed(iter/s)": 0.041347 | |
| }, | |
| { | |
| "epoch": 1.1065573770491803, | |
| "grad_norm": 0.18734754621982574, | |
| "learning_rate": 1.494398374795204e-05, | |
| "loss": 0.30426225662231443, | |
| "memory(GiB)": 137.67, | |
| "step": 945, | |
| "token_acc": 0.8848180693302514, | |
| "train_speed(iter/s)": 0.041343 | |
| }, | |
| { | |
| "epoch": 1.1124121779859484, | |
| "grad_norm": 0.19308467209339142, | |
| "learning_rate": 1.4887761418173947e-05, | |
| "loss": 0.32167963981628417, | |
| "memory(GiB)": 137.67, | |
| "step": 950, | |
| "token_acc": 0.8939139882185966, | |
| "train_speed(iter/s)": 0.041337 | |
| }, | |
| { | |
| "epoch": 1.1182669789227166, | |
| "grad_norm": 0.2532450258731842, | |
| "learning_rate": 1.4831335354154444e-05, | |
| "loss": 0.30830209255218505, | |
| "memory(GiB)": 137.67, | |
| "step": 955, | |
| "token_acc": 0.887962551140468, | |
| "train_speed(iter/s)": 0.041333 | |
| }, | |
| { | |
| "epoch": 1.1241217798594847, | |
| "grad_norm": 0.18927785754203796, | |
| "learning_rate": 1.4774707907874392e-05, | |
| "loss": 0.30596270561218264, | |
| "memory(GiB)": 137.67, | |
| "step": 960, | |
| "token_acc": 0.8945483075403462, | |
| "train_speed(iter/s)": 0.041324 | |
| }, | |
| { | |
| "epoch": 1.129976580796253, | |
| "grad_norm": 0.18746164441108704, | |
| "learning_rate": 1.4717881439708786e-05, | |
| "loss": 0.3073431491851807, | |
| "memory(GiB)": 137.67, | |
| "step": 965, | |
| "token_acc": 0.8779535897835228, | |
| "train_speed(iter/s)": 0.041318 | |
| }, | |
| { | |
| "epoch": 1.135831381733021, | |
| "grad_norm": 0.19065742194652557, | |
| "learning_rate": 1.4660858318328348e-05, | |
| "loss": 0.30925755500793456, | |
| "memory(GiB)": 137.67, | |
| "step": 970, | |
| "token_acc": 0.8771556147038887, | |
| "train_speed(iter/s)": 0.041311 | |
| }, | |
| { | |
| "epoch": 1.1416861826697893, | |
| "grad_norm": 0.19082236289978027, | |
| "learning_rate": 1.4603640920600813e-05, | |
| "loss": 0.31507372856140137, | |
| "memory(GiB)": 137.67, | |
| "step": 975, | |
| "token_acc": 0.8741312286488396, | |
| "train_speed(iter/s)": 0.041305 | |
| }, | |
| { | |
| "epoch": 1.1475409836065573, | |
| "grad_norm": 0.18480531871318817, | |
| "learning_rate": 1.4546231631491827e-05, | |
| "loss": 0.3110131025314331, | |
| "memory(GiB)": 137.67, | |
| "step": 980, | |
| "token_acc": 0.8829417142215302, | |
| "train_speed(iter/s)": 0.041296 | |
| }, | |
| { | |
| "epoch": 1.1533957845433256, | |
| "grad_norm": 0.17675240337848663, | |
| "learning_rate": 1.4488632843965573e-05, | |
| "loss": 0.3039939641952515, | |
| "memory(GiB)": 137.67, | |
| "step": 985, | |
| "token_acc": 0.8738143036386449, | |
| "train_speed(iter/s)": 0.041289 | |
| }, | |
| { | |
| "epoch": 1.1592505854800936, | |
| "grad_norm": 0.19089390337467194, | |
| "learning_rate": 1.4430846958884995e-05, | |
| "loss": 0.31295793056488036, | |
| "memory(GiB)": 137.67, | |
| "step": 990, | |
| "token_acc": 0.8817706633869632, | |
| "train_speed(iter/s)": 0.041282 | |
| }, | |
| { | |
| "epoch": 1.165105386416862, | |
| "grad_norm": 0.18563120067119598, | |
| "learning_rate": 1.4372876384911741e-05, | |
| "loss": 0.313909912109375, | |
| "memory(GiB)": 137.67, | |
| "step": 995, | |
| "token_acc": 0.8830196916072904, | |
| "train_speed(iter/s)": 0.041276 | |
| }, | |
| { | |
| "epoch": 1.17096018735363, | |
| "grad_norm": 0.21534429490566254, | |
| "learning_rate": 1.4314723538405752e-05, | |
| "loss": 0.3197300910949707, | |
| "memory(GiB)": 137.67, | |
| "step": 1000, | |
| "token_acc": 0.8747241787695568, | |
| "train_speed(iter/s)": 0.041271 | |
| }, | |
| { | |
| "epoch": 1.1768149882903982, | |
| "grad_norm": 0.19970309734344482, | |
| "learning_rate": 1.4256390843324556e-05, | |
| "loss": 0.3151378154754639, | |
| "memory(GiB)": 137.67, | |
| "step": 1005, | |
| "token_acc": 0.8791438877655459, | |
| "train_speed(iter/s)": 0.041267 | |
| }, | |
| { | |
| "epoch": 1.1826697892271663, | |
| "grad_norm": 0.1895560324192047, | |
| "learning_rate": 1.4197880731122221e-05, | |
| "loss": 0.312138032913208, | |
| "memory(GiB)": 137.67, | |
| "step": 1010, | |
| "token_acc": 0.8795711581097576, | |
| "train_speed(iter/s)": 0.041265 | |
| }, | |
| { | |
| "epoch": 1.1885245901639343, | |
| "grad_norm": 0.19073544442653656, | |
| "learning_rate": 1.4139195640648008e-05, | |
| "loss": 0.315081787109375, | |
| "memory(GiB)": 137.67, | |
| "step": 1015, | |
| "token_acc": 0.8921242173646963, | |
| "train_speed(iter/s)": 0.041259 | |
| }, | |
| { | |
| "epoch": 1.1943793911007026, | |
| "grad_norm": 0.17704617977142334, | |
| "learning_rate": 1.4080338018044712e-05, | |
| "loss": 0.319437837600708, | |
| "memory(GiB)": 137.67, | |
| "step": 1020, | |
| "token_acc": 0.8815218951006631, | |
| "train_speed(iter/s)": 0.041255 | |
| }, | |
| { | |
| "epoch": 1.2002341920374708, | |
| "grad_norm": 0.19636361300945282, | |
| "learning_rate": 1.4021310316646708e-05, | |
| "loss": 0.3087984561920166, | |
| "memory(GiB)": 137.67, | |
| "step": 1025, | |
| "token_acc": 0.8875915980726762, | |
| "train_speed(iter/s)": 0.041249 | |
| }, | |
| { | |
| "epoch": 1.2060889929742389, | |
| "grad_norm": 0.185128852725029, | |
| "learning_rate": 1.3962114996877685e-05, | |
| "loss": 0.29653804302215575, | |
| "memory(GiB)": 137.67, | |
| "step": 1030, | |
| "token_acc": 0.894042061938463, | |
| "train_speed(iter/s)": 0.041243 | |
| }, | |
| { | |
| "epoch": 1.211943793911007, | |
| "grad_norm": 0.18740731477737427, | |
| "learning_rate": 1.390275452614808e-05, | |
| "loss": 0.2996367454528809, | |
| "memory(GiB)": 137.67, | |
| "step": 1035, | |
| "token_acc": 0.8867371770872332, | |
| "train_speed(iter/s)": 0.041239 | |
| }, | |
| { | |
| "epoch": 1.2177985948477752, | |
| "grad_norm": 0.19739095866680145, | |
| "learning_rate": 1.3843231378752252e-05, | |
| "loss": 0.3056778907775879, | |
| "memory(GiB)": 137.67, | |
| "step": 1040, | |
| "token_acc": 0.8844194070047138, | |
| "train_speed(iter/s)": 0.041232 | |
| }, | |
| { | |
| "epoch": 1.2236533957845432, | |
| "grad_norm": 0.18625736236572266, | |
| "learning_rate": 1.3783548035765327e-05, | |
| "loss": 0.3101504802703857, | |
| "memory(GiB)": 137.67, | |
| "step": 1045, | |
| "token_acc": 0.8895319577252139, | |
| "train_speed(iter/s)": 0.041228 | |
| }, | |
| { | |
| "epoch": 1.2295081967213115, | |
| "grad_norm": 0.19391782581806183, | |
| "learning_rate": 1.3723706984939783e-05, | |
| "loss": 0.2983381271362305, | |
| "memory(GiB)": 137.67, | |
| "step": 1050, | |
| "token_acc": 0.8835933444611258, | |
| "train_speed(iter/s)": 0.041224 | |
| }, | |
| { | |
| "epoch": 1.2353629976580796, | |
| "grad_norm": 0.18108582496643066, | |
| "learning_rate": 1.366371072060177e-05, | |
| "loss": 0.3086691379547119, | |
| "memory(GiB)": 137.67, | |
| "step": 1055, | |
| "token_acc": 0.8736720857877966, | |
| "train_speed(iter/s)": 0.041218 | |
| }, | |
| { | |
| "epoch": 1.2412177985948478, | |
| "grad_norm": 0.18043167889118195, | |
| "learning_rate": 1.3603561743547125e-05, | |
| "loss": 0.30459914207458494, | |
| "memory(GiB)": 137.67, | |
| "step": 1060, | |
| "token_acc": 0.8805453249562779, | |
| "train_speed(iter/s)": 0.041215 | |
| }, | |
| { | |
| "epoch": 1.2470725995316159, | |
| "grad_norm": 0.2246876060962677, | |
| "learning_rate": 1.3543262560937135e-05, | |
| "loss": 0.3085703134536743, | |
| "memory(GiB)": 137.67, | |
| "step": 1065, | |
| "token_acc": 0.8846350880261892, | |
| "train_speed(iter/s)": 0.041212 | |
| }, | |
| { | |
| "epoch": 1.2529274004683841, | |
| "grad_norm": 0.19236041605472565, | |
| "learning_rate": 1.3482815686194033e-05, | |
| "loss": 0.2960092306137085, | |
| "memory(GiB)": 137.67, | |
| "step": 1070, | |
| "token_acc": 0.8907122097565549, | |
| "train_speed(iter/s)": 0.041208 | |
| }, | |
| { | |
| "epoch": 1.2587822014051522, | |
| "grad_norm": 0.1928793489933014, | |
| "learning_rate": 1.3422223638896235e-05, | |
| "loss": 0.3040574073791504, | |
| "memory(GiB)": 137.67, | |
| "step": 1075, | |
| "token_acc": 0.886298144007927, | |
| "train_speed(iter/s)": 0.041204 | |
| }, | |
| { | |
| "epoch": 1.2646370023419204, | |
| "grad_norm": 0.20902785658836365, | |
| "learning_rate": 1.3361488944673315e-05, | |
| "loss": 0.31267333030700684, | |
| "memory(GiB)": 137.67, | |
| "step": 1080, | |
| "token_acc": 0.8800496737817911, | |
| "train_speed(iter/s)": 0.041199 | |
| }, | |
| { | |
| "epoch": 1.2704918032786885, | |
| "grad_norm": 0.18985559046268463, | |
| "learning_rate": 1.3300614135100736e-05, | |
| "loss": 0.3105930805206299, | |
| "memory(GiB)": 137.67, | |
| "step": 1085, | |
| "token_acc": 0.8869882389382489, | |
| "train_speed(iter/s)": 0.041194 | |
| }, | |
| { | |
| "epoch": 1.2763466042154565, | |
| "grad_norm": 0.17671886086463928, | |
| "learning_rate": 1.3239601747594319e-05, | |
| "loss": 0.310105037689209, | |
| "memory(GiB)": 137.67, | |
| "step": 1090, | |
| "token_acc": 0.8870674524554854, | |
| "train_speed(iter/s)": 0.041187 | |
| }, | |
| { | |
| "epoch": 1.2822014051522248, | |
| "grad_norm": 0.17825712263584137, | |
| "learning_rate": 1.3178454325304472e-05, | |
| "loss": 0.31207849979400637, | |
| "memory(GiB)": 137.67, | |
| "step": 1095, | |
| "token_acc": 0.876942551728449, | |
| "train_speed(iter/s)": 0.041183 | |
| }, | |
| { | |
| "epoch": 1.288056206088993, | |
| "grad_norm": 0.1821722686290741, | |
| "learning_rate": 1.3117174417010213e-05, | |
| "loss": 0.2980069637298584, | |
| "memory(GiB)": 137.67, | |
| "step": 1100, | |
| "token_acc": 0.8805069421513594, | |
| "train_speed(iter/s)": 0.041179 | |
| }, | |
| { | |
| "epoch": 1.2939110070257611, | |
| "grad_norm": 0.18626025319099426, | |
| "learning_rate": 1.3055764577012892e-05, | |
| "loss": 0.3255163669586182, | |
| "memory(GiB)": 137.67, | |
| "step": 1105, | |
| "token_acc": 0.8920352101893313, | |
| "train_speed(iter/s)": 0.041176 | |
| }, | |
| { | |
| "epoch": 1.2997658079625292, | |
| "grad_norm": 0.18716710805892944, | |
| "learning_rate": 1.2994227365029752e-05, | |
| "loss": 0.30793008804321287, | |
| "memory(GiB)": 137.67, | |
| "step": 1110, | |
| "token_acc": 0.8887493130250451, | |
| "train_speed(iter/s)": 0.041173 | |
| }, | |
| { | |
| "epoch": 1.3056206088992974, | |
| "grad_norm": 0.19421324133872986, | |
| "learning_rate": 1.2932565346087218e-05, | |
| "loss": 0.3134599208831787, | |
| "memory(GiB)": 137.67, | |
| "step": 1115, | |
| "token_acc": 0.8847875557218118, | |
| "train_speed(iter/s)": 0.041168 | |
| }, | |
| { | |
| "epoch": 1.3114754098360657, | |
| "grad_norm": 0.18218953907489777, | |
| "learning_rate": 1.2870781090413991e-05, | |
| "loss": 0.3120888710021973, | |
| "memory(GiB)": 137.67, | |
| "step": 1120, | |
| "token_acc": 0.8869988305263882, | |
| "train_speed(iter/s)": 0.041162 | |
| }, | |
| { | |
| "epoch": 1.3173302107728337, | |
| "grad_norm": 0.19175498187541962, | |
| "learning_rate": 1.2808877173333896e-05, | |
| "loss": 0.30698199272155763, | |
| "memory(GiB)": 137.67, | |
| "step": 1125, | |
| "token_acc": 0.8941062176165803, | |
| "train_speed(iter/s)": 0.041159 | |
| }, | |
| { | |
| "epoch": 1.3231850117096018, | |
| "grad_norm": 0.18965595960617065, | |
| "learning_rate": 1.2746856175158556e-05, | |
| "loss": 0.31497323513031006, | |
| "memory(GiB)": 137.67, | |
| "step": 1130, | |
| "token_acc": 0.8871100459606847, | |
| "train_speed(iter/s)": 0.041157 | |
| }, | |
| { | |
| "epoch": 1.32903981264637, | |
| "grad_norm": 0.18627162277698517, | |
| "learning_rate": 1.2684720681079825e-05, | |
| "loss": 0.31060152053833007, | |
| "memory(GiB)": 137.67, | |
| "step": 1135, | |
| "token_acc": 0.871316468541155, | |
| "train_speed(iter/s)": 0.041153 | |
| }, | |
| { | |
| "epoch": 1.334894613583138, | |
| "grad_norm": 0.18565431237220764, | |
| "learning_rate": 1.2622473281062042e-05, | |
| "loss": 0.31475396156311036, | |
| "memory(GiB)": 137.67, | |
| "step": 1140, | |
| "token_acc": 0.8868342272670575, | |
| "train_speed(iter/s)": 0.04115 | |
| }, | |
| { | |
| "epoch": 1.3407494145199064, | |
| "grad_norm": 0.20739679038524628, | |
| "learning_rate": 1.256011656973406e-05, | |
| "loss": 0.32018194198608396, | |
| "memory(GiB)": 137.67, | |
| "step": 1145, | |
| "token_acc": 0.8872068230277186, | |
| "train_speed(iter/s)": 0.041147 | |
| }, | |
| { | |
| "epoch": 1.3466042154566744, | |
| "grad_norm": 0.1901317983865738, | |
| "learning_rate": 1.2497653146281113e-05, | |
| "loss": 0.3108601331710815, | |
| "memory(GiB)": 137.67, | |
| "step": 1150, | |
| "token_acc": 0.8855189570357069, | |
| "train_speed(iter/s)": 0.041141 | |
| }, | |
| { | |
| "epoch": 1.3524590163934427, | |
| "grad_norm": 0.16836309432983398, | |
| "learning_rate": 1.2435085614336459e-05, | |
| "loss": 0.315748405456543, | |
| "memory(GiB)": 137.67, | |
| "step": 1155, | |
| "token_acc": 0.8928414676966292, | |
| "train_speed(iter/s)": 0.041138 | |
| }, | |
| { | |
| "epoch": 1.3583138173302107, | |
| "grad_norm": 0.18492159247398376, | |
| "learning_rate": 1.2372416581872857e-05, | |
| "loss": 0.3051302909851074, | |
| "memory(GiB)": 137.67, | |
| "step": 1160, | |
| "token_acc": 0.8906577988281189, | |
| "train_speed(iter/s)": 0.041133 | |
| }, | |
| { | |
| "epoch": 1.364168618266979, | |
| "grad_norm": 0.17753958702087402, | |
| "learning_rate": 1.2309648661093878e-05, | |
| "loss": 0.3092564582824707, | |
| "memory(GiB)": 137.67, | |
| "step": 1165, | |
| "token_acc": 0.8921087343363074, | |
| "train_speed(iter/s)": 0.041129 | |
| }, | |
| { | |
| "epoch": 1.370023419203747, | |
| "grad_norm": 0.18764352798461914, | |
| "learning_rate": 1.2246784468324993e-05, | |
| "loss": 0.3163435935974121, | |
| "memory(GiB)": 137.67, | |
| "step": 1170, | |
| "token_acc": 0.8760536792329402, | |
| "train_speed(iter/s)": 0.041124 | |
| }, | |
| { | |
| "epoch": 1.3758782201405153, | |
| "grad_norm": 0.19416891038417816, | |
| "learning_rate": 1.218382662390454e-05, | |
| "loss": 0.3042860507965088, | |
| "memory(GiB)": 137.67, | |
| "step": 1175, | |
| "token_acc": 0.875018486527648, | |
| "train_speed(iter/s)": 0.041121 | |
| }, | |
| { | |
| "epoch": 1.3817330210772834, | |
| "grad_norm": 0.18030278384685516, | |
| "learning_rate": 1.2120777752074492e-05, | |
| "loss": 0.3132922172546387, | |
| "memory(GiB)": 137.67, | |
| "step": 1180, | |
| "token_acc": 0.8838601600050099, | |
| "train_speed(iter/s)": 0.041116 | |
| }, | |
| { | |
| "epoch": 1.3875878220140514, | |
| "grad_norm": 0.2763387858867645, | |
| "learning_rate": 1.2057640480871084e-05, | |
| "loss": 0.3143471240997314, | |
| "memory(GiB)": 137.67, | |
| "step": 1185, | |
| "token_acc": 0.8852224576271186, | |
| "train_speed(iter/s)": 0.041114 | |
| }, | |
| { | |
| "epoch": 1.3934426229508197, | |
| "grad_norm": 0.17999497056007385, | |
| "learning_rate": 1.1994417442015243e-05, | |
| "loss": 0.31265532970428467, | |
| "memory(GiB)": 137.67, | |
| "step": 1190, | |
| "token_acc": 0.8907372436335803, | |
| "train_speed(iter/s)": 0.041112 | |
| }, | |
| { | |
| "epoch": 1.399297423887588, | |
| "grad_norm": 0.18372628092765808, | |
| "learning_rate": 1.193111127080292e-05, | |
| "loss": 0.30383052825927737, | |
| "memory(GiB)": 137.67, | |
| "step": 1195, | |
| "token_acc": 0.8938835107946411, | |
| "train_speed(iter/s)": 0.041109 | |
| }, | |
| { | |
| "epoch": 1.405152224824356, | |
| "grad_norm": 0.1798890382051468, | |
| "learning_rate": 1.186772460599523e-05, | |
| "loss": 0.30336918830871584, | |
| "memory(GiB)": 137.67, | |
| "step": 1200, | |
| "token_acc": 0.891896889446055, | |
| "train_speed(iter/s)": 0.041105 | |
| }, | |
| { | |
| "epoch": 1.411007025761124, | |
| "grad_norm": 0.1862761676311493, | |
| "learning_rate": 1.1804260089708464e-05, | |
| "loss": 0.3127150535583496, | |
| "memory(GiB)": 137.67, | |
| "step": 1205, | |
| "token_acc": 0.8781827694454133, | |
| "train_speed(iter/s)": 0.041099 | |
| }, | |
| { | |
| "epoch": 1.4168618266978923, | |
| "grad_norm": 0.1872834414243698, | |
| "learning_rate": 1.1740720367303958e-05, | |
| "loss": 0.3076412916183472, | |
| "memory(GiB)": 137.67, | |
| "step": 1210, | |
| "token_acc": 0.8865224656924374, | |
| "train_speed(iter/s)": 0.041096 | |
| }, | |
| { | |
| "epoch": 1.4227166276346606, | |
| "grad_norm": 0.1868448704481125, | |
| "learning_rate": 1.1677108087277835e-05, | |
| "loss": 0.3139200210571289, | |
| "memory(GiB)": 137.67, | |
| "step": 1215, | |
| "token_acc": 0.8866469436643504, | |
| "train_speed(iter/s)": 0.041092 | |
| }, | |
| { | |
| "epoch": 1.4285714285714286, | |
| "grad_norm": 0.1959424465894699, | |
| "learning_rate": 1.1613425901150595e-05, | |
| "loss": 0.3134448051452637, | |
| "memory(GiB)": 137.67, | |
| "step": 1220, | |
| "token_acc": 0.8883061552452257, | |
| "train_speed(iter/s)": 0.041088 | |
| }, | |
| { | |
| "epoch": 1.4344262295081966, | |
| "grad_norm": 0.1766284704208374, | |
| "learning_rate": 1.15496764633566e-05, | |
| "loss": 0.3212412357330322, | |
| "memory(GiB)": 137.67, | |
| "step": 1225, | |
| "token_acc": 0.8780539320458743, | |
| "train_speed(iter/s)": 0.041084 | |
| }, | |
| { | |
| "epoch": 1.440281030444965, | |
| "grad_norm": 0.17711302638053894, | |
| "learning_rate": 1.1485862431133445e-05, | |
| "loss": 0.3123058795928955, | |
| "memory(GiB)": 137.67, | |
| "step": 1230, | |
| "token_acc": 0.8900835233492141, | |
| "train_speed(iter/s)": 0.041082 | |
| }, | |
| { | |
| "epoch": 1.446135831381733, | |
| "grad_norm": 0.1747256964445114, | |
| "learning_rate": 1.1421986464411169e-05, | |
| "loss": 0.31295697689056395, | |
| "memory(GiB)": 137.67, | |
| "step": 1235, | |
| "token_acc": 0.8767080016888458, | |
| "train_speed(iter/s)": 0.041075 | |
| }, | |
| { | |
| "epoch": 1.4519906323185012, | |
| "grad_norm": 0.18440908193588257, | |
| "learning_rate": 1.1358051225701404e-05, | |
| "loss": 0.30406386852264405, | |
| "memory(GiB)": 137.67, | |
| "step": 1240, | |
| "token_acc": 0.8795020947920581, | |
| "train_speed(iter/s)": 0.041071 | |
| }, | |
| { | |
| "epoch": 1.4578454332552693, | |
| "grad_norm": 0.17828240990638733, | |
| "learning_rate": 1.1294059379986384e-05, | |
| "loss": 0.3121625900268555, | |
| "memory(GiB)": 137.67, | |
| "step": 1245, | |
| "token_acc": 0.880069535801541, | |
| "train_speed(iter/s)": 0.041066 | |
| }, | |
| { | |
| "epoch": 1.4637002341920375, | |
| "grad_norm": 0.19148212671279907, | |
| "learning_rate": 1.1230013594607874e-05, | |
| "loss": 0.31345176696777344, | |
| "memory(GiB)": 137.67, | |
| "step": 1250, | |
| "token_acc": 0.8839757074137398, | |
| "train_speed(iter/s)": 0.041062 | |
| }, | |
| { | |
| "epoch": 1.4695550351288056, | |
| "grad_norm": 0.1828489750623703, | |
| "learning_rate": 1.1165916539155968e-05, | |
| "loss": 0.3104730129241943, | |
| "memory(GiB)": 137.67, | |
| "step": 1255, | |
| "token_acc": 0.8880499764055864, | |
| "train_speed(iter/s)": 0.04106 | |
| }, | |
| { | |
| "epoch": 1.4754098360655736, | |
| "grad_norm": 0.17934924364089966, | |
| "learning_rate": 1.1101770885357843e-05, | |
| "loss": 0.3066437244415283, | |
| "memory(GiB)": 137.67, | |
| "step": 1260, | |
| "token_acc": 0.8892594538641362, | |
| "train_speed(iter/s)": 0.041058 | |
| }, | |
| { | |
| "epoch": 1.481264637002342, | |
| "grad_norm": 0.16536173224449158, | |
| "learning_rate": 1.1037579306966365e-05, | |
| "loss": 0.3071906566619873, | |
| "memory(GiB)": 137.67, | |
| "step": 1265, | |
| "token_acc": 0.8958809106175363, | |
| "train_speed(iter/s)": 0.041054 | |
| }, | |
| { | |
| "epoch": 1.4871194379391102, | |
| "grad_norm": 0.18694446980953217, | |
| "learning_rate": 1.0973344479648652e-05, | |
| "loss": 0.3013455867767334, | |
| "memory(GiB)": 137.67, | |
| "step": 1270, | |
| "token_acc": 0.8899813852868301, | |
| "train_speed(iter/s)": 0.04105 | |
| }, | |
| { | |
| "epoch": 1.4929742388758782, | |
| "grad_norm": 0.17580904066562653, | |
| "learning_rate": 1.0909069080874556e-05, | |
| "loss": 0.30318174362182615, | |
| "memory(GiB)": 137.67, | |
| "step": 1275, | |
| "token_acc": 0.8817699648607147, | |
| "train_speed(iter/s)": 0.041047 | |
| }, | |
| { | |
| "epoch": 1.4988290398126463, | |
| "grad_norm": 0.18754124641418457, | |
| "learning_rate": 1.0844755789805042e-05, | |
| "loss": 0.31064305305480955, | |
| "memory(GiB)": 137.67, | |
| "step": 1280, | |
| "token_acc": 0.8804021416788542, | |
| "train_speed(iter/s)": 0.041044 | |
| }, | |
| { | |
| "epoch": 1.5046838407494145, | |
| "grad_norm": 0.19590285420417786, | |
| "learning_rate": 1.0780407287180526e-05, | |
| "loss": 0.3148102045059204, | |
| "memory(GiB)": 137.67, | |
| "step": 1285, | |
| "token_acc": 0.8805457351989244, | |
| "train_speed(iter/s)": 0.041039 | |
| }, | |
| { | |
| "epoch": 1.5105386416861828, | |
| "grad_norm": 0.19473980367183685, | |
| "learning_rate": 1.0716026255209124e-05, | |
| "loss": 0.3106101036071777, | |
| "memory(GiB)": 137.67, | |
| "step": 1290, | |
| "token_acc": 0.879328668153049, | |
| "train_speed(iter/s)": 0.041037 | |
| }, | |
| { | |
| "epoch": 1.5163934426229508, | |
| "grad_norm": 0.18378229439258575, | |
| "learning_rate": 1.0651615377454872e-05, | |
| "loss": 0.3110929250717163, | |
| "memory(GiB)": 137.67, | |
| "step": 1295, | |
| "token_acc": 0.8856033818930429, | |
| "train_speed(iter/s)": 0.041033 | |
| }, | |
| { | |
| "epoch": 1.5222482435597189, | |
| "grad_norm": 0.18482638895511627, | |
| "learning_rate": 1.0587177338725834e-05, | |
| "loss": 0.3163102626800537, | |
| "memory(GiB)": 137.67, | |
| "step": 1300, | |
| "token_acc": 0.8870778115329991, | |
| "train_speed(iter/s)": 0.04103 | |
| }, | |
| { | |
| "epoch": 1.5281030444964872, | |
| "grad_norm": 0.17333081364631653, | |
| "learning_rate": 1.0522714824962228e-05, | |
| "loss": 0.30377721786499023, | |
| "memory(GiB)": 137.67, | |
| "step": 1305, | |
| "token_acc": 0.8980077050082553, | |
| "train_speed(iter/s)": 0.041028 | |
| }, | |
| { | |
| "epoch": 1.5339578454332554, | |
| "grad_norm": 0.1912304162979126, | |
| "learning_rate": 1.0458230523124443e-05, | |
| "loss": 0.3162518501281738, | |
| "memory(GiB)": 137.67, | |
| "step": 1310, | |
| "token_acc": 0.8886457770855507, | |
| "train_speed(iter/s)": 0.041024 | |
| }, | |
| { | |
| "epoch": 1.5398126463700235, | |
| "grad_norm": 0.1846192628145218, | |
| "learning_rate": 1.0393727121081057e-05, | |
| "loss": 0.3126535892486572, | |
| "memory(GiB)": 137.67, | |
| "step": 1315, | |
| "token_acc": 0.8860128586991429, | |
| "train_speed(iter/s)": 0.041023 | |
| }, | |
| { | |
| "epoch": 1.5456674473067915, | |
| "grad_norm": 0.17747725546360016, | |
| "learning_rate": 1.0329207307496785e-05, | |
| "loss": 0.30208649635314944, | |
| "memory(GiB)": 137.67, | |
| "step": 1320, | |
| "token_acc": 0.8879456759093934, | |
| "train_speed(iter/s)": 0.04102 | |
| }, | |
| { | |
| "epoch": 1.5515222482435598, | |
| "grad_norm": 0.18443572521209717, | |
| "learning_rate": 1.0264673771720429e-05, | |
| "loss": 0.3092689037322998, | |
| "memory(GiB)": 137.67, | |
| "step": 1325, | |
| "token_acc": 0.892488839320581, | |
| "train_speed(iter/s)": 0.041016 | |
| }, | |
| { | |
| "epoch": 1.5573770491803278, | |
| "grad_norm": 0.18431353569030762, | |
| "learning_rate": 1.0200129203672754e-05, | |
| "loss": 0.3100308656692505, | |
| "memory(GiB)": 137.67, | |
| "step": 1330, | |
| "token_acc": 0.8782463261547713, | |
| "train_speed(iter/s)": 0.041012 | |
| }, | |
| { | |
| "epoch": 1.5632318501170959, | |
| "grad_norm": 0.1662471741437912, | |
| "learning_rate": 1.0135576293734381e-05, | |
| "loss": 0.30292906761169436, | |
| "memory(GiB)": 137.67, | |
| "step": 1335, | |
| "token_acc": 0.8942868271402976, | |
| "train_speed(iter/s)": 0.04101 | |
| }, | |
| { | |
| "epoch": 1.5690866510538641, | |
| "grad_norm": 0.1806328445672989, | |
| "learning_rate": 1.007101773263365e-05, | |
| "loss": 0.31366329193115233, | |
| "memory(GiB)": 137.67, | |
| "step": 1340, | |
| "token_acc": 0.8866166119192868, | |
| "train_speed(iter/s)": 0.041006 | |
| }, | |
| { | |
| "epoch": 1.5749414519906324, | |
| "grad_norm": 0.16915848851203918, | |
| "learning_rate": 1.0006456211334445e-05, | |
| "loss": 0.30766754150390624, | |
| "memory(GiB)": 137.67, | |
| "step": 1345, | |
| "token_acc": 0.8863719744503918, | |
| "train_speed(iter/s)": 0.041006 | |
| }, | |
| { | |
| "epoch": 1.5807962529274004, | |
| "grad_norm": 0.16690009832382202, | |
| "learning_rate": 9.941894420924044e-06, | |
| "loss": 0.3059431314468384, | |
| "memory(GiB)": 137.67, | |
| "step": 1350, | |
| "token_acc": 0.8971780549005762, | |
| "train_speed(iter/s)": 0.041001 | |
| }, | |
| { | |
| "epoch": 1.5866510538641685, | |
| "grad_norm": 0.17337647080421448, | |
| "learning_rate": 9.87733505250094e-06, | |
| "loss": 0.3098172664642334, | |
| "memory(GiB)": 137.67, | |
| "step": 1355, | |
| "token_acc": 0.8863237006126697, | |
| "train_speed(iter/s)": 0.040998 | |
| }, | |
| { | |
| "epoch": 1.5925058548009368, | |
| "grad_norm": 0.17512920498847961, | |
| "learning_rate": 9.812780797062678e-06, | |
| "loss": 0.30655522346496583, | |
| "memory(GiB)": 137.67, | |
| "step": 1360, | |
| "token_acc": 0.8899597184053006, | |
| "train_speed(iter/s)": 0.040993 | |
| }, | |
| { | |
| "epoch": 1.598360655737705, | |
| "grad_norm": 0.1765688955783844, | |
| "learning_rate": 9.748234345393672e-06, | |
| "loss": 0.3023026466369629, | |
| "memory(GiB)": 137.67, | |
| "step": 1365, | |
| "token_acc": 0.8879338667133921, | |
| "train_speed(iter/s)": 0.040989 | |
| }, | |
| { | |
| "epoch": 1.604215456674473, | |
| "grad_norm": 0.18416614830493927, | |
| "learning_rate": 9.68369838795306e-06, | |
| "loss": 0.30958683490753175, | |
| "memory(GiB)": 137.67, | |
| "step": 1370, | |
| "token_acc": 0.8849809108691687, | |
| "train_speed(iter/s)": 0.040984 | |
| }, | |
| { | |
| "epoch": 1.6100702576112411, | |
| "grad_norm": 0.17386697232723236, | |
| "learning_rate": 9.61917561476255e-06, | |
| "loss": 0.30420713424682616, | |
| "memory(GiB)": 137.67, | |
| "step": 1375, | |
| "token_acc": 0.8786233528080887, | |
| "train_speed(iter/s)": 0.040981 | |
| }, | |
| { | |
| "epoch": 1.6159250585480094, | |
| "grad_norm": 0.18169918656349182, | |
| "learning_rate": 9.554668715294305e-06, | |
| "loss": 0.31483819484710696, | |
| "memory(GiB)": 137.67, | |
| "step": 1380, | |
| "token_acc": 0.8864194675551166, | |
| "train_speed(iter/s)": 0.040979 | |
| }, | |
| { | |
| "epoch": 1.6217798594847777, | |
| "grad_norm": 0.1892368197441101, | |
| "learning_rate": 9.490180378358826e-06, | |
| "loss": 0.3172303676605225, | |
| "memory(GiB)": 137.67, | |
| "step": 1385, | |
| "token_acc": 0.8828729942067092, | |
| "train_speed(iter/s)": 0.040977 | |
| }, | |
| { | |
| "epoch": 1.6276346604215457, | |
| "grad_norm": 0.1751379817724228, | |
| "learning_rate": 9.425713291992878e-06, | |
| "loss": 0.30653929710388184, | |
| "memory(GiB)": 137.67, | |
| "step": 1390, | |
| "token_acc": 0.8895787320550146, | |
| "train_speed(iter/s)": 0.040974 | |
| }, | |
| { | |
| "epoch": 1.6334894613583137, | |
| "grad_norm": 0.18914154171943665, | |
| "learning_rate": 9.361270143347452e-06, | |
| "loss": 0.31959149837493894, | |
| "memory(GiB)": 137.67, | |
| "step": 1395, | |
| "token_acc": 0.8822264278089348, | |
| "train_speed(iter/s)": 0.040972 | |
| }, | |
| { | |
| "epoch": 1.639344262295082, | |
| "grad_norm": 0.16736507415771484, | |
| "learning_rate": 9.296853618575753e-06, | |
| "loss": 0.30730547904968264, | |
| "memory(GiB)": 137.67, | |
| "step": 1400, | |
| "token_acc": 0.8928722715040367, | |
| "train_speed(iter/s)": 0.04097 | |
| }, | |
| { | |
| "epoch": 1.6451990632318503, | |
| "grad_norm": 0.1708020716905594, | |
| "learning_rate": 9.232466402721241e-06, | |
| "loss": 0.31717801094055176, | |
| "memory(GiB)": 137.67, | |
| "step": 1405, | |
| "token_acc": 0.886989175916414, | |
| "train_speed(iter/s)": 0.040969 | |
| }, | |
| { | |
| "epoch": 1.651053864168618, | |
| "grad_norm": 0.17622792720794678, | |
| "learning_rate": 9.1681111796057e-06, | |
| "loss": 0.3083082675933838, | |
| "memory(GiB)": 137.67, | |
| "step": 1410, | |
| "token_acc": 0.8884494066990437, | |
| "train_speed(iter/s)": 0.040968 | |
| }, | |
| { | |
| "epoch": 1.6569086651053864, | |
| "grad_norm": 0.1885053962469101, | |
| "learning_rate": 9.103790631717375e-06, | |
| "loss": 0.32230064868927, | |
| "memory(GiB)": 137.67, | |
| "step": 1415, | |
| "token_acc": 0.878518037454961, | |
| "train_speed(iter/s)": 0.040965 | |
| }, | |
| { | |
| "epoch": 1.6627634660421546, | |
| "grad_norm": 0.17244482040405273, | |
| "learning_rate": 9.039507440099164e-06, | |
| "loss": 0.30806798934936525, | |
| "memory(GiB)": 137.67, | |
| "step": 1420, | |
| "token_acc": 0.8929606011942812, | |
| "train_speed(iter/s)": 0.040962 | |
| }, | |
| { | |
| "epoch": 1.6686182669789227, | |
| "grad_norm": 0.18172700703144073, | |
| "learning_rate": 8.975264284236866e-06, | |
| "loss": 0.30987024307250977, | |
| "memory(GiB)": 137.67, | |
| "step": 1425, | |
| "token_acc": 0.8885019605876434, | |
| "train_speed(iter/s)": 0.040961 | |
| }, | |
| { | |
| "epoch": 1.6744730679156907, | |
| "grad_norm": 0.18555694818496704, | |
| "learning_rate": 8.911063841947476e-06, | |
| "loss": 0.31224822998046875, | |
| "memory(GiB)": 137.67, | |
| "step": 1430, | |
| "token_acc": 0.8862099925232826, | |
| "train_speed(iter/s)": 0.040958 | |
| }, | |
| { | |
| "epoch": 1.680327868852459, | |
| "grad_norm": 0.18322236835956573, | |
| "learning_rate": 8.846908789267589e-06, | |
| "loss": 0.31196701526641846, | |
| "memory(GiB)": 137.67, | |
| "step": 1435, | |
| "token_acc": 0.8887980814742356, | |
| "train_speed(iter/s)": 0.040958 | |
| }, | |
| { | |
| "epoch": 1.6861826697892273, | |
| "grad_norm": 0.17747406661510468, | |
| "learning_rate": 8.78280180034184e-06, | |
| "loss": 0.3032996654510498, | |
| "memory(GiB)": 137.67, | |
| "step": 1440, | |
| "token_acc": 0.8822490977332802, | |
| "train_speed(iter/s)": 0.040955 | |
| }, | |
| { | |
| "epoch": 1.6920374707259953, | |
| "grad_norm": 0.18120799958705902, | |
| "learning_rate": 8.718745547311458e-06, | |
| "loss": 0.3137194633483887, | |
| "memory(GiB)": 137.67, | |
| "step": 1445, | |
| "token_acc": 0.8828540900663084, | |
| "train_speed(iter/s)": 0.040951 | |
| }, | |
| { | |
| "epoch": 1.6978922716627634, | |
| "grad_norm": 0.17743031680583954, | |
| "learning_rate": 8.654742700202849e-06, | |
| "loss": 0.31336297988891604, | |
| "memory(GiB)": 137.67, | |
| "step": 1450, | |
| "token_acc": 0.8851623130427727, | |
| "train_speed(iter/s)": 0.040949 | |
| }, | |
| { | |
| "epoch": 1.7037470725995316, | |
| "grad_norm": 0.1702745109796524, | |
| "learning_rate": 8.590795926816348e-06, | |
| "loss": 0.3027879953384399, | |
| "memory(GiB)": 137.67, | |
| "step": 1455, | |
| "token_acc": 0.8840805588371897, | |
| "train_speed(iter/s)": 0.040947 | |
| }, | |
| { | |
| "epoch": 1.7096018735362999, | |
| "grad_norm": 0.17240740358829498, | |
| "learning_rate": 8.526907892614986e-06, | |
| "loss": 0.3072841167449951, | |
| "memory(GiB)": 137.67, | |
| "step": 1460, | |
| "token_acc": 0.88948632592922, | |
| "train_speed(iter/s)": 0.040943 | |
| }, | |
| { | |
| "epoch": 1.715456674473068, | |
| "grad_norm": 0.17982088029384613, | |
| "learning_rate": 8.463081260613391e-06, | |
| "loss": 0.30924406051635744, | |
| "memory(GiB)": 137.67, | |
| "step": 1465, | |
| "token_acc": 0.8940978807037782, | |
| "train_speed(iter/s)": 0.04094 | |
| }, | |
| { | |
| "epoch": 1.721311475409836, | |
| "grad_norm": 0.19751447439193726, | |
| "learning_rate": 8.399318691266806e-06, | |
| "loss": 0.3119847774505615, | |
| "memory(GiB)": 137.67, | |
| "step": 1470, | |
| "token_acc": 0.8852366571009662, | |
| "train_speed(iter/s)": 0.040936 | |
| }, | |
| { | |
| "epoch": 1.7271662763466042, | |
| "grad_norm": 0.18603962659835815, | |
| "learning_rate": 8.335622842360168e-06, | |
| "loss": 0.3066195011138916, | |
| "memory(GiB)": 137.67, | |
| "step": 1475, | |
| "token_acc": 0.8890113777789009, | |
| "train_speed(iter/s)": 0.040933 | |
| }, | |
| { | |
| "epoch": 1.7330210772833725, | |
| "grad_norm": 0.2541693449020386, | |
| "learning_rate": 8.271996368897345e-06, | |
| "loss": 0.3128560781478882, | |
| "memory(GiB)": 137.67, | |
| "step": 1480, | |
| "token_acc": 0.8902386961489684, | |
| "train_speed(iter/s)": 0.040929 | |
| }, | |
| { | |
| "epoch": 1.7388758782201406, | |
| "grad_norm": 0.16992934048175812, | |
| "learning_rate": 8.208441922990454e-06, | |
| "loss": 0.3037855863571167, | |
| "memory(GiB)": 137.67, | |
| "step": 1485, | |
| "token_acc": 0.8849534643226473, | |
| "train_speed(iter/s)": 0.040926 | |
| }, | |
| { | |
| "epoch": 1.7447306791569086, | |
| "grad_norm": 0.17065441608428955, | |
| "learning_rate": 8.144962153749331e-06, | |
| "loss": 0.30540289878845217, | |
| "memory(GiB)": 137.67, | |
| "step": 1490, | |
| "token_acc": 0.8819315749736371, | |
| "train_speed(iter/s)": 0.040924 | |
| }, | |
| { | |
| "epoch": 1.7505854800936769, | |
| "grad_norm": 0.1787635236978531, | |
| "learning_rate": 8.081559707171094e-06, | |
| "loss": 0.31698925495147706, | |
| "memory(GiB)": 137.67, | |
| "step": 1495, | |
| "token_acc": 0.8824724072862914, | |
| "train_speed(iter/s)": 0.040923 | |
| }, | |
| { | |
| "epoch": 1.756440281030445, | |
| "grad_norm": 0.1751013845205307, | |
| "learning_rate": 8.01823722602986e-06, | |
| "loss": 0.30347585678100586, | |
| "memory(GiB)": 137.67, | |
| "step": 1500, | |
| "token_acc": 0.893298859486769, | |
| "train_speed(iter/s)": 0.040922 | |
| }, | |
| { | |
| "epoch": 1.762295081967213, | |
| "grad_norm": 0.17399156093597412, | |
| "learning_rate": 7.954997349766576e-06, | |
| "loss": 0.3116060972213745, | |
| "memory(GiB)": 137.67, | |
| "step": 1505, | |
| "token_acc": 0.8889070320988275, | |
| "train_speed(iter/s)": 0.040921 | |
| }, | |
| { | |
| "epoch": 1.7681498829039812, | |
| "grad_norm": 0.18837633728981018, | |
| "learning_rate": 7.891842714379027e-06, | |
| "loss": 0.29880785942077637, | |
| "memory(GiB)": 137.67, | |
| "step": 1510, | |
| "token_acc": 0.893647204719971, | |
| "train_speed(iter/s)": 0.040918 | |
| }, | |
| { | |
| "epoch": 1.7740046838407495, | |
| "grad_norm": 0.1845746487379074, | |
| "learning_rate": 7.828775952311921e-06, | |
| "loss": 0.30261945724487305, | |
| "memory(GiB)": 137.67, | |
| "step": 1515, | |
| "token_acc": 0.8851783808483535, | |
| "train_speed(iter/s)": 0.040914 | |
| }, | |
| { | |
| "epoch": 1.7798594847775175, | |
| "grad_norm": 0.16885152459144592, | |
| "learning_rate": 7.765799692347201e-06, | |
| "loss": 0.3042313575744629, | |
| "memory(GiB)": 137.67, | |
| "step": 1520, | |
| "token_acc": 0.8835214994418757, | |
| "train_speed(iter/s)": 0.040911 | |
| }, | |
| { | |
| "epoch": 1.7857142857142856, | |
| "grad_norm": 0.1790182739496231, | |
| "learning_rate": 7.702916559494444e-06, | |
| "loss": 0.31259956359863283, | |
| "memory(GiB)": 137.67, | |
| "step": 1525, | |
| "token_acc": 0.8878653758934018, | |
| "train_speed(iter/s)": 0.040909 | |
| }, | |
| { | |
| "epoch": 1.7915690866510539, | |
| "grad_norm": 0.17695166170597076, | |
| "learning_rate": 7.64012917488146e-06, | |
| "loss": 0.29359025955200196, | |
| "memory(GiB)": 137.67, | |
| "step": 1530, | |
| "token_acc": 0.9000399023492115, | |
| "train_speed(iter/s)": 0.040908 | |
| }, | |
| { | |
| "epoch": 1.7974238875878221, | |
| "grad_norm": 0.18347503244876862, | |
| "learning_rate": 7.577440155645028e-06, | |
| "loss": 0.30249216556549074, | |
| "memory(GiB)": 137.67, | |
| "step": 1535, | |
| "token_acc": 0.8902694639046774, | |
| "train_speed(iter/s)": 0.040904 | |
| }, | |
| { | |
| "epoch": 1.8032786885245902, | |
| "grad_norm": 0.1697729527950287, | |
| "learning_rate": 7.514852114821811e-06, | |
| "loss": 0.31291751861572265, | |
| "memory(GiB)": 137.67, | |
| "step": 1540, | |
| "token_acc": 0.8868685350765146, | |
| "train_speed(iter/s)": 0.040902 | |
| }, | |
| { | |
| "epoch": 1.8091334894613582, | |
| "grad_norm": 0.16477090120315552, | |
| "learning_rate": 7.452367661239433e-06, | |
| "loss": 0.29220216274261473, | |
| "memory(GiB)": 137.67, | |
| "step": 1545, | |
| "token_acc": 0.8877543630965312, | |
| "train_speed(iter/s)": 0.040899 | |
| }, | |
| { | |
| "epoch": 1.8149882903981265, | |
| "grad_norm": 0.19079044461250305, | |
| "learning_rate": 7.389989399407741e-06, | |
| "loss": 0.3156083106994629, | |
| "memory(GiB)": 137.67, | |
| "step": 1550, | |
| "token_acc": 0.8873283112245697, | |
| "train_speed(iter/s)": 0.040896 | |
| }, | |
| { | |
| "epoch": 1.8208430913348947, | |
| "grad_norm": 0.1723940074443817, | |
| "learning_rate": 7.3277199294102485e-06, | |
| "loss": 0.30045547485351565, | |
| "memory(GiB)": 137.67, | |
| "step": 1555, | |
| "token_acc": 0.8850201501823112, | |
| "train_speed(iter/s)": 0.040894 | |
| }, | |
| { | |
| "epoch": 1.8266978922716628, | |
| "grad_norm": 0.18594853579998016, | |
| "learning_rate": 7.265561846795741e-06, | |
| "loss": 0.3101131677627563, | |
| "memory(GiB)": 137.67, | |
| "step": 1560, | |
| "token_acc": 0.8868083283139077, | |
| "train_speed(iter/s)": 0.040889 | |
| }, | |
| { | |
| "epoch": 1.8325526932084308, | |
| "grad_norm": 0.1757504642009735, | |
| "learning_rate": 7.203517742470101e-06, | |
| "loss": 0.30873966217041016, | |
| "memory(GiB)": 137.67, | |
| "step": 1565, | |
| "token_acc": 0.8949954641669187, | |
| "train_speed(iter/s)": 0.040886 | |
| }, | |
| { | |
| "epoch": 1.838407494145199, | |
| "grad_norm": 0.2077726572751999, | |
| "learning_rate": 7.141590202588312e-06, | |
| "loss": 0.3127377986907959, | |
| "memory(GiB)": 137.67, | |
| "step": 1570, | |
| "token_acc": 0.888584743745537, | |
| "train_speed(iter/s)": 0.040885 | |
| }, | |
| { | |
| "epoch": 1.8442622950819674, | |
| "grad_norm": 0.17814461886882782, | |
| "learning_rate": 7.079781808446648e-06, | |
| "loss": 0.31596999168395995, | |
| "memory(GiB)": 137.67, | |
| "step": 1575, | |
| "token_acc": 0.8755756783669405, | |
| "train_speed(iter/s)": 0.040882 | |
| }, | |
| { | |
| "epoch": 1.8501170960187352, | |
| "grad_norm": 0.16512958705425262, | |
| "learning_rate": 7.018095136375089e-06, | |
| "loss": 0.3012762308120728, | |
| "memory(GiB)": 137.67, | |
| "step": 1580, | |
| "token_acc": 0.8862999993707803, | |
| "train_speed(iter/s)": 0.04088 | |
| }, | |
| { | |
| "epoch": 1.8559718969555035, | |
| "grad_norm": 0.18698780238628387, | |
| "learning_rate": 6.956532757629945e-06, | |
| "loss": 0.3080646514892578, | |
| "memory(GiB)": 137.67, | |
| "step": 1585, | |
| "token_acc": 0.8861714900322669, | |
| "train_speed(iter/s)": 0.040877 | |
| }, | |
| { | |
| "epoch": 1.8618266978922717, | |
| "grad_norm": 0.17041386663913727, | |
| "learning_rate": 6.89509723828665e-06, | |
| "loss": 0.3119032382965088, | |
| "memory(GiB)": 137.67, | |
| "step": 1590, | |
| "token_acc": 0.8861256952099799, | |
| "train_speed(iter/s)": 0.040875 | |
| }, | |
| { | |
| "epoch": 1.8676814988290398, | |
| "grad_norm": 0.18812042474746704, | |
| "learning_rate": 6.833791139132824e-06, | |
| "loss": 0.2984042167663574, | |
| "memory(GiB)": 137.67, | |
| "step": 1595, | |
| "token_acc": 0.8881694299555838, | |
| "train_speed(iter/s)": 0.040871 | |
| }, | |
| { | |
| "epoch": 1.8735362997658078, | |
| "grad_norm": 0.16663610935211182, | |
| "learning_rate": 6.772617015561529e-06, | |
| "loss": 0.3069270610809326, | |
| "memory(GiB)": 137.67, | |
| "step": 1600, | |
| "token_acc": 0.8785419403265153, | |
| "train_speed(iter/s)": 0.040869 | |
| }, | |
| { | |
| "epoch": 1.879391100702576, | |
| "grad_norm": 0.16731353104114532, | |
| "learning_rate": 6.7115774174647475e-06, | |
| "loss": 0.29993810653686526, | |
| "memory(GiB)": 137.67, | |
| "step": 1605, | |
| "token_acc": 0.8944355407195264, | |
| "train_speed(iter/s)": 0.040868 | |
| }, | |
| { | |
| "epoch": 1.8852459016393444, | |
| "grad_norm": 0.18671032786369324, | |
| "learning_rate": 6.6506748891271045e-06, | |
| "loss": 0.3104290723800659, | |
| "memory(GiB)": 137.67, | |
| "step": 1610, | |
| "token_acc": 0.893398089707724, | |
| "train_speed(iter/s)": 0.040866 | |
| }, | |
| { | |
| "epoch": 1.8911007025761124, | |
| "grad_norm": 0.17069920897483826, | |
| "learning_rate": 6.5899119691198025e-06, | |
| "loss": 0.30440511703491213, | |
| "memory(GiB)": 137.67, | |
| "step": 1615, | |
| "token_acc": 0.8883004841907675, | |
| "train_speed(iter/s)": 0.040865 | |
| }, | |
| { | |
| "epoch": 1.8969555035128804, | |
| "grad_norm": 0.1704709678888321, | |
| "learning_rate": 6.529291190194829e-06, | |
| "loss": 0.3084626436233521, | |
| "memory(GiB)": 137.67, | |
| "step": 1620, | |
| "token_acc": 0.887373335138147, | |
| "train_speed(iter/s)": 0.040864 | |
| }, | |
| { | |
| "epoch": 1.9028103044496487, | |
| "grad_norm": 0.1708633005619049, | |
| "learning_rate": 6.468815079179364e-06, | |
| "loss": 0.30423784255981445, | |
| "memory(GiB)": 137.67, | |
| "step": 1625, | |
| "token_acc": 0.8923868074324853, | |
| "train_speed(iter/s)": 0.040862 | |
| }, | |
| { | |
| "epoch": 1.908665105386417, | |
| "grad_norm": 0.17672830820083618, | |
| "learning_rate": 6.408486156870466e-06, | |
| "loss": 0.31655054092407225, | |
| "memory(GiB)": 137.67, | |
| "step": 1630, | |
| "token_acc": 0.8692423282788768, | |
| "train_speed(iter/s)": 0.04086 | |
| }, | |
| { | |
| "epoch": 1.914519906323185, | |
| "grad_norm": 0.1735108494758606, | |
| "learning_rate": 6.348306937929991e-06, | |
| "loss": 0.31425652503967283, | |
| "memory(GiB)": 137.67, | |
| "step": 1635, | |
| "token_acc": 0.882395514622517, | |
| "train_speed(iter/s)": 0.04086 | |
| }, | |
| { | |
| "epoch": 1.920374707259953, | |
| "grad_norm": 0.15910685062408447, | |
| "learning_rate": 6.288279930779789e-06, | |
| "loss": 0.29740355014801023, | |
| "memory(GiB)": 137.67, | |
| "step": 1640, | |
| "token_acc": 0.8963298424379659, | |
| "train_speed(iter/s)": 0.040858 | |
| }, | |
| { | |
| "epoch": 1.9262295081967213, | |
| "grad_norm": 0.17650458216667175, | |
| "learning_rate": 6.228407637497131e-06, | |
| "loss": 0.30800676345825195, | |
| "memory(GiB)": 137.67, | |
| "step": 1645, | |
| "token_acc": 0.8754677877967858, | |
| "train_speed(iter/s)": 0.040855 | |
| }, | |
| { | |
| "epoch": 1.9320843091334896, | |
| "grad_norm": 0.16745297610759735, | |
| "learning_rate": 6.1686925537104306e-06, | |
| "loss": 0.2977410316467285, | |
| "memory(GiB)": 137.67, | |
| "step": 1650, | |
| "token_acc": 0.8798736234089867, | |
| "train_speed(iter/s)": 0.040852 | |
| }, | |
| { | |
| "epoch": 1.9379391100702577, | |
| "grad_norm": 0.1728445142507553, | |
| "learning_rate": 6.109137168495205e-06, | |
| "loss": 0.304546856880188, | |
| "memory(GiB)": 137.67, | |
| "step": 1655, | |
| "token_acc": 0.9005831398969597, | |
| "train_speed(iter/s)": 0.040851 | |
| }, | |
| { | |
| "epoch": 1.9437939110070257, | |
| "grad_norm": 0.1682547777891159, | |
| "learning_rate": 6.049743964270336e-06, | |
| "loss": 0.3136142730712891, | |
| "memory(GiB)": 137.67, | |
| "step": 1660, | |
| "token_acc": 0.8856946741131322, | |
| "train_speed(iter/s)": 0.040848 | |
| }, | |
| { | |
| "epoch": 1.949648711943794, | |
| "grad_norm": 0.18915309011936188, | |
| "learning_rate": 5.990515416694591e-06, | |
| "loss": 0.3113490104675293, | |
| "memory(GiB)": 137.67, | |
| "step": 1665, | |
| "token_acc": 0.8886227731406503, | |
| "train_speed(iter/s)": 0.040845 | |
| }, | |
| { | |
| "epoch": 1.955503512880562, | |
| "grad_norm": 0.18081413209438324, | |
| "learning_rate": 5.931453994563434e-06, | |
| "loss": 0.30602524280548093, | |
| "memory(GiB)": 137.67, | |
| "step": 1670, | |
| "token_acc": 0.8937767328555647, | |
| "train_speed(iter/s)": 0.040844 | |
| }, | |
| { | |
| "epoch": 1.96135831381733, | |
| "grad_norm": 0.2595233917236328, | |
| "learning_rate": 5.872562159706116e-06, | |
| "loss": 0.309699273109436, | |
| "memory(GiB)": 137.67, | |
| "step": 1675, | |
| "token_acc": 0.883843976093111, | |
| "train_speed(iter/s)": 0.040842 | |
| }, | |
| { | |
| "epoch": 1.9672131147540983, | |
| "grad_norm": 0.17678314447402954, | |
| "learning_rate": 5.8138423668830605e-06, | |
| "loss": 0.30298714637756347, | |
| "memory(GiB)": 137.67, | |
| "step": 1680, | |
| "token_acc": 0.8865513684995878, | |
| "train_speed(iter/s)": 0.040842 | |
| }, | |
| { | |
| "epoch": 1.9730679156908666, | |
| "grad_norm": 0.1795545518398285, | |
| "learning_rate": 5.755297063683551e-06, | |
| "loss": 0.30653939247131345, | |
| "memory(GiB)": 137.67, | |
| "step": 1685, | |
| "token_acc": 0.8907540567138181, | |
| "train_speed(iter/s)": 0.040841 | |
| }, | |
| { | |
| "epoch": 1.9789227166276346, | |
| "grad_norm": 0.17241141200065613, | |
| "learning_rate": 5.696928690423693e-06, | |
| "loss": 0.30241034030914304, | |
| "memory(GiB)": 137.67, | |
| "step": 1690, | |
| "token_acc": 0.8856109987263056, | |
| "train_speed(iter/s)": 0.040841 | |
| }, | |
| { | |
| "epoch": 1.9847775175644027, | |
| "grad_norm": 0.1767030656337738, | |
| "learning_rate": 5.638739680044718e-06, | |
| "loss": 0.3159188270568848, | |
| "memory(GiB)": 137.67, | |
| "step": 1695, | |
| "token_acc": 0.8789045280418222, | |
| "train_speed(iter/s)": 0.040839 | |
| }, | |
| { | |
| "epoch": 1.990632318501171, | |
| "grad_norm": 0.1798180490732193, | |
| "learning_rate": 5.580732458011544e-06, | |
| "loss": 0.3054344654083252, | |
| "memory(GiB)": 137.67, | |
| "step": 1700, | |
| "token_acc": 0.8914613695909465, | |
| "train_speed(iter/s)": 0.040837 | |
| }, | |
| { | |
| "epoch": 1.9964871194379392, | |
| "grad_norm": 0.1673898547887802, | |
| "learning_rate": 5.522909442211708e-06, | |
| "loss": 0.3050167798995972, | |
| "memory(GiB)": 137.67, | |
| "step": 1705, | |
| "token_acc": 0.8836358249226172, | |
| "train_speed(iter/s)": 0.040834 | |
| }, | |
| { | |
| "epoch": 2.002341920374707, | |
| "grad_norm": 0.24459093809127808, | |
| "learning_rate": 5.465273042854551e-06, | |
| "loss": 0.2896696090698242, | |
| "memory(GiB)": 137.67, | |
| "step": 1710, | |
| "token_acc": 0.8956877534575909, | |
| "train_speed(iter/s)": 0.040723 | |
| }, | |
| { | |
| "epoch": 2.0081967213114753, | |
| "grad_norm": 0.19826985895633698, | |
| "learning_rate": 5.407825662370778e-06, | |
| "loss": 0.2708754301071167, | |
| "memory(GiB)": 137.67, | |
| "step": 1715, | |
| "token_acc": 0.8993573677984775, | |
| "train_speed(iter/s)": 0.040721 | |
| }, | |
| { | |
| "epoch": 2.0140515222482436, | |
| "grad_norm": 0.20230858027935028, | |
| "learning_rate": 5.350569695312313e-06, | |
| "loss": 0.27931761741638184, | |
| "memory(GiB)": 137.67, | |
| "step": 1720, | |
| "token_acc": 0.8964727026237073, | |
| "train_speed(iter/s)": 0.040718 | |
| }, | |
| { | |
| "epoch": 2.019906323185012, | |
| "grad_norm": 0.17940187454223633, | |
| "learning_rate": 5.293507528252474e-06, | |
| "loss": 0.2833970308303833, | |
| "memory(GiB)": 137.67, | |
| "step": 1725, | |
| "token_acc": 0.8971622665586578, | |
| "train_speed(iter/s)": 0.040716 | |
| }, | |
| { | |
| "epoch": 2.0257611241217797, | |
| "grad_norm": 0.2274295687675476, | |
| "learning_rate": 5.236641539686518e-06, | |
| "loss": 0.2709039211273193, | |
| "memory(GiB)": 137.67, | |
| "step": 1730, | |
| "token_acc": 0.8940215607642851, | |
| "train_speed(iter/s)": 0.040716 | |
| }, | |
| { | |
| "epoch": 2.031615925058548, | |
| "grad_norm": 0.17937658727169037, | |
| "learning_rate": 5.179974099932472e-06, | |
| "loss": 0.2649374961853027, | |
| "memory(GiB)": 137.67, | |
| "step": 1735, | |
| "token_acc": 0.8949033413934375, | |
| "train_speed(iter/s)": 0.040713 | |
| }, | |
| { | |
| "epoch": 2.037470725995316, | |
| "grad_norm": 0.1847214251756668, | |
| "learning_rate": 5.12350757103236e-06, | |
| "loss": 0.26505355834960936, | |
| "memory(GiB)": 137.67, | |
| "step": 1740, | |
| "token_acc": 0.8981974914281606, | |
| "train_speed(iter/s)": 0.040712 | |
| }, | |
| { | |
| "epoch": 2.0433255269320845, | |
| "grad_norm": 0.1737840622663498, | |
| "learning_rate": 5.067244306653736e-06, | |
| "loss": 0.27186686992645265, | |
| "memory(GiB)": 137.67, | |
| "step": 1745, | |
| "token_acc": 0.9053836113307479, | |
| "train_speed(iter/s)": 0.040711 | |
| }, | |
| { | |
| "epoch": 2.0491803278688523, | |
| "grad_norm": 0.1807735711336136, | |
| "learning_rate": 5.0111866519915575e-06, | |
| "loss": 0.2668013334274292, | |
| "memory(GiB)": 137.67, | |
| "step": 1750, | |
| "token_acc": 0.8954151927308955, | |
| "train_speed(iter/s)": 0.040709 | |
| }, | |
| { | |
| "epoch": 2.0550351288056206, | |
| "grad_norm": 0.17946134507656097, | |
| "learning_rate": 4.95533694367047e-06, | |
| "loss": 0.26618137359619143, | |
| "memory(GiB)": 137.67, | |
| "step": 1755, | |
| "token_acc": 0.8999696707241193, | |
| "train_speed(iter/s)": 0.040708 | |
| }, | |
| { | |
| "epoch": 2.060889929742389, | |
| "grad_norm": 0.17995508015155792, | |
| "learning_rate": 4.899697509647379e-06, | |
| "loss": 0.27054500579833984, | |
| "memory(GiB)": 137.67, | |
| "step": 1760, | |
| "token_acc": 0.8920381030958765, | |
| "train_speed(iter/s)": 0.040707 | |
| }, | |
| { | |
| "epoch": 2.066744730679157, | |
| "grad_norm": 0.22271017730236053, | |
| "learning_rate": 4.844270669114424e-06, | |
| "loss": 0.2727907657623291, | |
| "memory(GiB)": 137.67, | |
| "step": 1765, | |
| "token_acc": 0.9031526316777533, | |
| "train_speed(iter/s)": 0.040706 | |
| }, | |
| { | |
| "epoch": 2.072599531615925, | |
| "grad_norm": 0.18377523124217987, | |
| "learning_rate": 4.789058732402319e-06, | |
| "loss": 0.26617846488952634, | |
| "memory(GiB)": 137.67, | |
| "step": 1770, | |
| "token_acc": 0.8968159437280188, | |
| "train_speed(iter/s)": 0.040704 | |
| }, | |
| { | |
| "epoch": 2.078454332552693, | |
| "grad_norm": 0.18358266353607178, | |
| "learning_rate": 4.734064000884044e-06, | |
| "loss": 0.2815399646759033, | |
| "memory(GiB)": 137.67, | |
| "step": 1775, | |
| "token_acc": 0.8860162596527972, | |
| "train_speed(iter/s)": 0.040703 | |
| }, | |
| { | |
| "epoch": 2.0843091334894615, | |
| "grad_norm": 0.17939767241477966, | |
| "learning_rate": 4.679288766878908e-06, | |
| "loss": 0.2770793914794922, | |
| "memory(GiB)": 137.67, | |
| "step": 1780, | |
| "token_acc": 0.8990350010749907, | |
| "train_speed(iter/s)": 0.0407 | |
| }, | |
| { | |
| "epoch": 2.0901639344262297, | |
| "grad_norm": 0.18252268433570862, | |
| "learning_rate": 4.624735313557019e-06, | |
| "loss": 0.27314205169677735, | |
| "memory(GiB)": 137.67, | |
| "step": 1785, | |
| "token_acc": 0.9036665729722977, | |
| "train_speed(iter/s)": 0.040699 | |
| }, | |
| { | |
| "epoch": 2.0960187353629975, | |
| "grad_norm": 0.17692163586616516, | |
| "learning_rate": 4.570405914844105e-06, | |
| "loss": 0.26518521308898924, | |
| "memory(GiB)": 137.67, | |
| "step": 1790, | |
| "token_acc": 0.9007013796506218, | |
| "train_speed(iter/s)": 0.040696 | |
| }, | |
| { | |
| "epoch": 2.101873536299766, | |
| "grad_norm": 0.1812998205423355, | |
| "learning_rate": 4.516302835326723e-06, | |
| "loss": 0.27246594429016113, | |
| "memory(GiB)": 137.67, | |
| "step": 1795, | |
| "token_acc": 0.9057411329497284, | |
| "train_speed(iter/s)": 0.040694 | |
| }, | |
| { | |
| "epoch": 2.107728337236534, | |
| "grad_norm": 0.17790301144123077, | |
| "learning_rate": 4.462428330157886e-06, | |
| "loss": 0.2635958671569824, | |
| "memory(GiB)": 137.67, | |
| "step": 1800, | |
| "token_acc": 0.9060071718018364, | |
| "train_speed(iter/s)": 0.040692 | |
| }, | |
| { | |
| "epoch": 2.113583138173302, | |
| "grad_norm": 0.1772291511297226, | |
| "learning_rate": 4.4087846449630475e-06, | |
| "loss": 0.2673187732696533, | |
| "memory(GiB)": 137.67, | |
| "step": 1805, | |
| "token_acc": 0.902466497498459, | |
| "train_speed(iter/s)": 0.040691 | |
| }, | |
| { | |
| "epoch": 2.11943793911007, | |
| "grad_norm": 0.1833985149860382, | |
| "learning_rate": 4.355374015746493e-06, | |
| "loss": 0.26436376571655273, | |
| "memory(GiB)": 137.67, | |
| "step": 1810, | |
| "token_acc": 0.8990824248093747, | |
| "train_speed(iter/s)": 0.040688 | |
| }, | |
| { | |
| "epoch": 2.1252927400468384, | |
| "grad_norm": 0.1888750046491623, | |
| "learning_rate": 4.302198668798159e-06, | |
| "loss": 0.2690884113311768, | |
| "memory(GiB)": 137.67, | |
| "step": 1815, | |
| "token_acc": 0.8948256326325066, | |
| "train_speed(iter/s)": 0.040688 | |
| }, | |
| { | |
| "epoch": 2.1311475409836067, | |
| "grad_norm": 0.1726667881011963, | |
| "learning_rate": 4.249260820600813e-06, | |
| "loss": 0.2568142175674438, | |
| "memory(GiB)": 137.67, | |
| "step": 1820, | |
| "token_acc": 0.9027062619756462, | |
| "train_speed(iter/s)": 0.040686 | |
| }, | |
| { | |
| "epoch": 2.1370023419203745, | |
| "grad_norm": 0.18242421746253967, | |
| "learning_rate": 4.1965626777376766e-06, | |
| "loss": 0.26575822830200196, | |
| "memory(GiB)": 137.67, | |
| "step": 1825, | |
| "token_acc": 0.9058191422116245, | |
| "train_speed(iter/s)": 0.040685 | |
| }, | |
| { | |
| "epoch": 2.142857142857143, | |
| "grad_norm": 0.17865152657032013, | |
| "learning_rate": 4.144106436800453e-06, | |
| "loss": 0.2705830097198486, | |
| "memory(GiB)": 137.67, | |
| "step": 1830, | |
| "token_acc": 0.9064275903781455, | |
| "train_speed(iter/s)": 0.040686 | |
| }, | |
| { | |
| "epoch": 2.148711943793911, | |
| "grad_norm": 0.1739743947982788, | |
| "learning_rate": 4.091894284297758e-06, | |
| "loss": 0.262749981880188, | |
| "memory(GiB)": 137.67, | |
| "step": 1835, | |
| "token_acc": 0.8932282627390278, | |
| "train_speed(iter/s)": 0.040684 | |
| }, | |
| { | |
| "epoch": 2.1545667447306793, | |
| "grad_norm": 0.18693114817142487, | |
| "learning_rate": 4.039928396563983e-06, | |
| "loss": 0.27836999893188474, | |
| "memory(GiB)": 137.67, | |
| "step": 1840, | |
| "token_acc": 0.8999278596166879, | |
| "train_speed(iter/s)": 0.040683 | |
| }, | |
| { | |
| "epoch": 2.160421545667447, | |
| "grad_norm": 0.18225987255573273, | |
| "learning_rate": 3.9882109396685845e-06, | |
| "loss": 0.25630941390991213, | |
| "memory(GiB)": 137.67, | |
| "step": 1845, | |
| "token_acc": 0.8964322481719588, | |
| "train_speed(iter/s)": 0.04068 | |
| }, | |
| { | |
| "epoch": 2.1662763466042154, | |
| "grad_norm": 0.1680818498134613, | |
| "learning_rate": 3.936744069325797e-06, | |
| "loss": 0.25788373947143556, | |
| "memory(GiB)": 137.67, | |
| "step": 1850, | |
| "token_acc": 0.9047133964952628, | |
| "train_speed(iter/s)": 0.040677 | |
| }, | |
| { | |
| "epoch": 2.1721311475409837, | |
| "grad_norm": 0.17563344538211823, | |
| "learning_rate": 3.885529930804768e-06, | |
| "loss": 0.2534646987915039, | |
| "memory(GiB)": 137.67, | |
| "step": 1855, | |
| "token_acc": 0.895904841548197, | |
| "train_speed(iter/s)": 0.040675 | |
| }, | |
| { | |
| "epoch": 2.177985948477752, | |
| "grad_norm": 0.2031351625919342, | |
| "learning_rate": 3.834570658840152e-06, | |
| "loss": 0.2712204933166504, | |
| "memory(GiB)": 137.67, | |
| "step": 1860, | |
| "token_acc": 0.8943131411791787, | |
| "train_speed(iter/s)": 0.040674 | |
| }, | |
| { | |
| "epoch": 2.1838407494145198, | |
| "grad_norm": 0.1767955720424652, | |
| "learning_rate": 3.7838683775431106e-06, | |
| "loss": 0.26442804336547854, | |
| "memory(GiB)": 137.67, | |
| "step": 1865, | |
| "token_acc": 0.9006802168952266, | |
| "train_speed(iter/s)": 0.040673 | |
| }, | |
| { | |
| "epoch": 2.189695550351288, | |
| "grad_norm": 0.17129677534103394, | |
| "learning_rate": 3.733425200312797e-06, | |
| "loss": 0.2669063091278076, | |
| "memory(GiB)": 137.67, | |
| "step": 1870, | |
| "token_acc": 0.8917139826542709, | |
| "train_speed(iter/s)": 0.040672 | |
| }, | |
| { | |
| "epoch": 2.1955503512880563, | |
| "grad_norm": 0.17820899188518524, | |
| "learning_rate": 3.683243229748249e-06, | |
| "loss": 0.2608784198760986, | |
| "memory(GiB)": 137.67, | |
| "step": 1875, | |
| "token_acc": 0.8967133346325762, | |
| "train_speed(iter/s)": 0.04067 | |
| }, | |
| { | |
| "epoch": 2.201405152224824, | |
| "grad_norm": 0.18119502067565918, | |
| "learning_rate": 3.633324557560747e-06, | |
| "loss": 0.265275239944458, | |
| "memory(GiB)": 137.67, | |
| "step": 1880, | |
| "token_acc": 0.9029575814389501, | |
| "train_speed(iter/s)": 0.040669 | |
| }, | |
| { | |
| "epoch": 2.2072599531615924, | |
| "grad_norm": 0.17707428336143494, | |
| "learning_rate": 3.5836712644866277e-06, | |
| "loss": 0.2611743450164795, | |
| "memory(GiB)": 137.67, | |
| "step": 1885, | |
| "token_acc": 0.8965409189329774, | |
| "train_speed(iter/s)": 0.040668 | |
| }, | |
| { | |
| "epoch": 2.2131147540983607, | |
| "grad_norm": 0.1768161803483963, | |
| "learning_rate": 3.5342854202005696e-06, | |
| "loss": 0.26110024452209474, | |
| "memory(GiB)": 137.67, | |
| "step": 1890, | |
| "token_acc": 0.9035024093649873, | |
| "train_speed(iter/s)": 0.040667 | |
| }, | |
| { | |
| "epoch": 2.218969555035129, | |
| "grad_norm": 0.17210449278354645, | |
| "learning_rate": 3.485169083229293e-06, | |
| "loss": 0.26915616989135743, | |
| "memory(GiB)": 137.67, | |
| "step": 1895, | |
| "token_acc": 0.9061759392893929, | |
| "train_speed(iter/s)": 0.040667 | |
| }, | |
| { | |
| "epoch": 2.2248243559718968, | |
| "grad_norm": 0.16969619691371918, | |
| "learning_rate": 3.4363243008657842e-06, | |
| "loss": 0.2634119987487793, | |
| "memory(GiB)": 137.67, | |
| "step": 1900, | |
| "token_acc": 0.8916742749773309, | |
| "train_speed(iter/s)": 0.040664 | |
| }, | |
| { | |
| "epoch": 2.230679156908665, | |
| "grad_norm": 0.17764930427074432, | |
| "learning_rate": 3.3877531090839478e-06, | |
| "loss": 0.2685534000396729, | |
| "memory(GiB)": 137.67, | |
| "step": 1905, | |
| "token_acc": 0.8940042290704804, | |
| "train_speed(iter/s)": 0.040663 | |
| }, | |
| { | |
| "epoch": 2.2365339578454333, | |
| "grad_norm": 0.17651669681072235, | |
| "learning_rate": 3.3394575324537327e-06, | |
| "loss": 0.27190165519714354, | |
| "memory(GiB)": 137.67, | |
| "step": 1910, | |
| "token_acc": 0.8928626982497402, | |
| "train_speed(iter/s)": 0.04066 | |
| }, | |
| { | |
| "epoch": 2.2423887587822016, | |
| "grad_norm": 0.16508856415748596, | |
| "learning_rate": 3.2914395840567605e-06, | |
| "loss": 0.2606737852096558, | |
| "memory(GiB)": 137.67, | |
| "step": 1915, | |
| "token_acc": 0.9028335241642236, | |
| "train_speed(iter/s)": 0.040658 | |
| }, | |
| { | |
| "epoch": 2.2482435597189694, | |
| "grad_norm": 0.16644766926765442, | |
| "learning_rate": 3.2437012654024057e-06, | |
| "loss": 0.2660099983215332, | |
| "memory(GiB)": 137.67, | |
| "step": 1920, | |
| "token_acc": 0.9046304613618784, | |
| "train_speed(iter/s)": 0.040656 | |
| }, | |
| { | |
| "epoch": 2.2540983606557377, | |
| "grad_norm": 0.16391952335834503, | |
| "learning_rate": 3.1962445663443643e-06, | |
| "loss": 0.2678091287612915, | |
| "memory(GiB)": 137.67, | |
| "step": 1925, | |
| "token_acc": 0.8979980130091664, | |
| "train_speed(iter/s)": 0.040653 | |
| }, | |
| { | |
| "epoch": 2.259953161592506, | |
| "grad_norm": 0.1803101897239685, | |
| "learning_rate": 3.1490714649977196e-06, | |
| "loss": 0.27110137939453127, | |
| "memory(GiB)": 137.67, | |
| "step": 1930, | |
| "token_acc": 0.905863734174048, | |
| "train_speed(iter/s)": 0.04065 | |
| }, | |
| { | |
| "epoch": 2.265807962529274, | |
| "grad_norm": 0.17323030531406403, | |
| "learning_rate": 3.102183927656488e-06, | |
| "loss": 0.26174540519714357, | |
| "memory(GiB)": 137.67, | |
| "step": 1935, | |
| "token_acc": 0.8902694797112273, | |
| "train_speed(iter/s)": 0.040649 | |
| }, | |
| { | |
| "epoch": 2.271662763466042, | |
| "grad_norm": 0.18379603326320648, | |
| "learning_rate": 3.0555839087116547e-06, | |
| "loss": 0.27245678901672366, | |
| "memory(GiB)": 137.67, | |
| "step": 1940, | |
| "token_acc": 0.90194375, | |
| "train_speed(iter/s)": 0.040648 | |
| }, | |
| { | |
| "epoch": 2.2775175644028103, | |
| "grad_norm": 0.1765807718038559, | |
| "learning_rate": 3.009273350569705e-06, | |
| "loss": 0.2700004816055298, | |
| "memory(GiB)": 137.67, | |
| "step": 1945, | |
| "token_acc": 0.9060629034421867, | |
| "train_speed(iter/s)": 0.040648 | |
| }, | |
| { | |
| "epoch": 2.2833723653395785, | |
| "grad_norm": 0.17609137296676636, | |
| "learning_rate": 2.963254183571682e-06, | |
| "loss": 0.2663255214691162, | |
| "memory(GiB)": 137.67, | |
| "step": 1950, | |
| "token_acc": 0.9028553183442811, | |
| "train_speed(iter/s)": 0.040646 | |
| }, | |
| { | |
| "epoch": 2.289227166276347, | |
| "grad_norm": 0.1761084645986557, | |
| "learning_rate": 2.9175283259126943e-06, | |
| "loss": 0.2662710428237915, | |
| "memory(GiB)": 137.67, | |
| "step": 1955, | |
| "token_acc": 0.9068832885430957, | |
| "train_speed(iter/s)": 0.040645 | |
| }, | |
| { | |
| "epoch": 2.2950819672131146, | |
| "grad_norm": 0.16875940561294556, | |
| "learning_rate": 2.872097683561986e-06, | |
| "loss": 0.2650928497314453, | |
| "memory(GiB)": 137.67, | |
| "step": 1960, | |
| "token_acc": 0.9107070141504632, | |
| "train_speed(iter/s)": 0.040644 | |
| }, | |
| { | |
| "epoch": 2.300936768149883, | |
| "grad_norm": 0.18349847197532654, | |
| "learning_rate": 2.8269641501834834e-06, | |
| "loss": 0.2731610298156738, | |
| "memory(GiB)": 137.67, | |
| "step": 1965, | |
| "token_acc": 0.8929668563025367, | |
| "train_speed(iter/s)": 0.040644 | |
| }, | |
| { | |
| "epoch": 2.306791569086651, | |
| "grad_norm": 0.17049305140972137, | |
| "learning_rate": 2.782129607056848e-06, | |
| "loss": 0.2668560028076172, | |
| "memory(GiB)": 137.67, | |
| "step": 1970, | |
| "token_acc": 0.8946301039908395, | |
| "train_speed(iter/s)": 0.040643 | |
| }, | |
| { | |
| "epoch": 2.312646370023419, | |
| "grad_norm": 0.17511935532093048, | |
| "learning_rate": 2.7375959229990856e-06, | |
| "loss": 0.25858211517333984, | |
| "memory(GiB)": 137.67, | |
| "step": 1975, | |
| "token_acc": 0.9011111249984377, | |
| "train_speed(iter/s)": 0.040641 | |
| }, | |
| { | |
| "epoch": 2.3185011709601873, | |
| "grad_norm": 0.16913901269435883, | |
| "learning_rate": 2.6933649542866326e-06, | |
| "loss": 0.2623398780822754, | |
| "memory(GiB)": 137.67, | |
| "step": 1980, | |
| "token_acc": 0.8980817363368075, | |
| "train_speed(iter/s)": 0.04064 | |
| }, | |
| { | |
| "epoch": 2.3243559718969555, | |
| "grad_norm": 0.16392305493354797, | |
| "learning_rate": 2.649438544577977e-06, | |
| "loss": 0.25210521221160886, | |
| "memory(GiB)": 137.67, | |
| "step": 1985, | |
| "token_acc": 0.9006790772077851, | |
| "train_speed(iter/s)": 0.040639 | |
| }, | |
| { | |
| "epoch": 2.330210772833724, | |
| "grad_norm": 0.16555212438106537, | |
| "learning_rate": 2.6058185248368317e-06, | |
| "loss": 0.26413559913635254, | |
| "memory(GiB)": 137.67, | |
| "step": 1990, | |
| "token_acc": 0.9057566877776727, | |
| "train_speed(iter/s)": 0.040637 | |
| }, | |
| { | |
| "epoch": 2.3360655737704916, | |
| "grad_norm": 0.17122185230255127, | |
| "learning_rate": 2.562506713255789e-06, | |
| "loss": 0.2596926689147949, | |
| "memory(GiB)": 137.67, | |
| "step": 1995, | |
| "token_acc": 0.9047409789878514, | |
| "train_speed(iter/s)": 0.040636 | |
| }, | |
| { | |
| "epoch": 2.34192037470726, | |
| "grad_norm": 0.17818881571292877, | |
| "learning_rate": 2.519504915180555e-06, | |
| "loss": 0.2623495101928711, | |
| "memory(GiB)": 137.67, | |
| "step": 2000, | |
| "token_acc": 0.9031698814490531, | |
| "train_speed(iter/s)": 0.040635 | |
| }, | |
| { | |
| "epoch": 2.347775175644028, | |
| "grad_norm": 0.17120912671089172, | |
| "learning_rate": 2.4768149230346917e-06, | |
| "loss": 0.2763922929763794, | |
| "memory(GiB)": 137.67, | |
| "step": 2005, | |
| "token_acc": 0.90147262555157, | |
| "train_speed(iter/s)": 0.040633 | |
| }, | |
| { | |
| "epoch": 2.3536299765807964, | |
| "grad_norm": 0.1725643426179886, | |
| "learning_rate": 2.4344385162448924e-06, | |
| "loss": 0.26347975730895995, | |
| "memory(GiB)": 137.67, | |
| "step": 2010, | |
| "token_acc": 0.9056239470479484, | |
| "train_speed(iter/s)": 0.040632 | |
| }, | |
| { | |
| "epoch": 2.3594847775175642, | |
| "grad_norm": 0.17098568379878998, | |
| "learning_rate": 2.392377461166826e-06, | |
| "loss": 0.26201567649841306, | |
| "memory(GiB)": 137.67, | |
| "step": 2015, | |
| "token_acc": 0.9030459083951856, | |
| "train_speed(iter/s)": 0.040631 | |
| }, | |
| { | |
| "epoch": 2.3653395784543325, | |
| "grad_norm": 0.17561163008213043, | |
| "learning_rate": 2.350633511011511e-06, | |
| "loss": 0.26811957359313965, | |
| "memory(GiB)": 137.67, | |
| "step": 2020, | |
| "token_acc": 0.8995977151723318, | |
| "train_speed(iter/s)": 0.040628 | |
| }, | |
| { | |
| "epoch": 2.371194379391101, | |
| "grad_norm": 0.1689569056034088, | |
| "learning_rate": 2.309208405772221e-06, | |
| "loss": 0.2759255409240723, | |
| "memory(GiB)": 137.67, | |
| "step": 2025, | |
| "token_acc": 0.9044138910892334, | |
| "train_speed(iter/s)": 0.040628 | |
| }, | |
| { | |
| "epoch": 2.3770491803278686, | |
| "grad_norm": 0.26568159461021423, | |
| "learning_rate": 2.2681038721519768e-06, | |
| "loss": 0.2785911560058594, | |
| "memory(GiB)": 137.67, | |
| "step": 2030, | |
| "token_acc": 0.8982950398323113, | |
| "train_speed(iter/s)": 0.040625 | |
| }, | |
| { | |
| "epoch": 2.382903981264637, | |
| "grad_norm": 0.18388140201568604, | |
| "learning_rate": 2.227321623491563e-06, | |
| "loss": 0.26940011978149414, | |
| "memory(GiB)": 137.67, | |
| "step": 2035, | |
| "token_acc": 0.8968315203642803, | |
| "train_speed(iter/s)": 0.040624 | |
| }, | |
| { | |
| "epoch": 2.388758782201405, | |
| "grad_norm": 0.16938382387161255, | |
| "learning_rate": 2.186863359698108e-06, | |
| "loss": 0.26633501052856445, | |
| "memory(GiB)": 137.67, | |
| "step": 2040, | |
| "token_acc": 0.9180211235459854, | |
| "train_speed(iter/s)": 0.040622 | |
| }, | |
| { | |
| "epoch": 2.3946135831381734, | |
| "grad_norm": 0.17878937721252441, | |
| "learning_rate": 2.1467307671742377e-06, | |
| "loss": 0.2687513828277588, | |
| "memory(GiB)": 137.67, | |
| "step": 2045, | |
| "token_acc": 0.8974434682640148, | |
| "train_speed(iter/s)": 0.040621 | |
| }, | |
| { | |
| "epoch": 2.4004683840749417, | |
| "grad_norm": 0.1779458373785019, | |
| "learning_rate": 2.106925518747779e-06, | |
| "loss": 0.26202917098999023, | |
| "memory(GiB)": 137.67, | |
| "step": 2050, | |
| "token_acc": 0.9011938413047829, | |
| "train_speed(iter/s)": 0.04062 | |
| }, | |
| { | |
| "epoch": 2.4063231850117095, | |
| "grad_norm": 0.17342902719974518, | |
| "learning_rate": 2.06744927360202e-06, | |
| "loss": 0.26468615531921386, | |
| "memory(GiB)": 137.67, | |
| "step": 2055, | |
| "token_acc": 0.8999491938022672, | |
| "train_speed(iter/s)": 0.040617 | |
| }, | |
| { | |
| "epoch": 2.4121779859484778, | |
| "grad_norm": 0.17159196734428406, | |
| "learning_rate": 2.0283036772065712e-06, | |
| "loss": 0.26631085872650145, | |
| "memory(GiB)": 137.67, | |
| "step": 2060, | |
| "token_acc": 0.904679059271446, | |
| "train_speed(iter/s)": 0.040615 | |
| }, | |
| { | |
| "epoch": 2.418032786885246, | |
| "grad_norm": 0.19288575649261475, | |
| "learning_rate": 1.9894903612487683e-06, | |
| "loss": 0.2730381488800049, | |
| "memory(GiB)": 137.67, | |
| "step": 2065, | |
| "token_acc": 0.8923981017844846, | |
| "train_speed(iter/s)": 0.040614 | |
| }, | |
| { | |
| "epoch": 2.423887587822014, | |
| "grad_norm": 0.17374974489212036, | |
| "learning_rate": 1.9510109435656457e-06, | |
| "loss": 0.27329106330871583, | |
| "memory(GiB)": 137.67, | |
| "step": 2070, | |
| "token_acc": 0.9024526900268184, | |
| "train_speed(iter/s)": 0.040613 | |
| }, | |
| { | |
| "epoch": 2.429742388758782, | |
| "grad_norm": 0.1817113608121872, | |
| "learning_rate": 1.9128670280765283e-06, | |
| "loss": 0.27490620613098143, | |
| "memory(GiB)": 137.67, | |
| "step": 2075, | |
| "token_acc": 0.8959030374086766, | |
| "train_speed(iter/s)": 0.040611 | |
| }, | |
| { | |
| "epoch": 2.4355971896955504, | |
| "grad_norm": 0.17148195207118988, | |
| "learning_rate": 1.8750602047161603e-06, | |
| "loss": 0.26430578231811525, | |
| "memory(GiB)": 137.67, | |
| "step": 2080, | |
| "token_acc": 0.9074351491670378, | |
| "train_speed(iter/s)": 0.040609 | |
| }, | |
| { | |
| "epoch": 2.4414519906323187, | |
| "grad_norm": 0.1715674251317978, | |
| "learning_rate": 1.8375920493684264e-06, | |
| "loss": 0.2722649574279785, | |
| "memory(GiB)": 137.67, | |
| "step": 2085, | |
| "token_acc": 0.8960112888052681, | |
| "train_speed(iter/s)": 0.040609 | |
| }, | |
| { | |
| "epoch": 2.4473067915690865, | |
| "grad_norm": 0.1820991337299347, | |
| "learning_rate": 1.8004641238006815e-06, | |
| "loss": 0.2675884485244751, | |
| "memory(GiB)": 137.67, | |
| "step": 2090, | |
| "token_acc": 0.9040590405904059, | |
| "train_speed(iter/s)": 0.040607 | |
| }, | |
| { | |
| "epoch": 2.4531615925058547, | |
| "grad_norm": 0.1691906452178955, | |
| "learning_rate": 1.7636779755986443e-06, | |
| "loss": 0.2732096195220947, | |
| "memory(GiB)": 137.67, | |
| "step": 2095, | |
| "token_acc": 0.8958253626778894, | |
| "train_speed(iter/s)": 0.040605 | |
| }, | |
| { | |
| "epoch": 2.459016393442623, | |
| "grad_norm": 0.17061816155910492, | |
| "learning_rate": 1.7272351381018792e-06, | |
| "loss": 0.2712996482849121, | |
| "memory(GiB)": 137.67, | |
| "step": 2100, | |
| "token_acc": 0.8880485387880261, | |
| "train_speed(iter/s)": 0.040603 | |
| }, | |
| { | |
| "epoch": 2.4648711943793913, | |
| "grad_norm": 0.17594653367996216, | |
| "learning_rate": 1.6911371303399048e-06, | |
| "loss": 0.2586531162261963, | |
| "memory(GiB)": 137.67, | |
| "step": 2105, | |
| "token_acc": 0.9022650028060307, | |
| "train_speed(iter/s)": 0.0406 | |
| }, | |
| { | |
| "epoch": 2.470725995316159, | |
| "grad_norm": 0.18380020558834076, | |
| "learning_rate": 1.6553854569688632e-06, | |
| "loss": 0.2727813720703125, | |
| "memory(GiB)": 137.67, | |
| "step": 2110, | |
| "token_acc": 0.8974262645615947, | |
| "train_speed(iter/s)": 0.040598 | |
| }, | |
| { | |
| "epoch": 2.4765807962529274, | |
| "grad_norm": 0.16742826998233795, | |
| "learning_rate": 1.619981608208796e-06, | |
| "loss": 0.2734941244125366, | |
| "memory(GiB)": 137.67, | |
| "step": 2115, | |
| "token_acc": 0.8847918638392509, | |
| "train_speed(iter/s)": 0.040597 | |
| }, | |
| { | |
| "epoch": 2.4824355971896956, | |
| "grad_norm": 0.17516812682151794, | |
| "learning_rate": 1.584927059781548e-06, | |
| "loss": 0.2728161334991455, | |
| "memory(GiB)": 137.67, | |
| "step": 2120, | |
| "token_acc": 0.8936656628114019, | |
| "train_speed(iter/s)": 0.040595 | |
| }, | |
| { | |
| "epoch": 2.4882903981264635, | |
| "grad_norm": 0.17867887020111084, | |
| "learning_rate": 1.5502232728492362e-06, | |
| "loss": 0.264336085319519, | |
| "memory(GiB)": 137.67, | |
| "step": 2125, | |
| "token_acc": 0.9031589138208336, | |
| "train_speed(iter/s)": 0.040594 | |
| }, | |
| { | |
| "epoch": 2.4941451990632317, | |
| "grad_norm": 0.17173421382904053, | |
| "learning_rate": 1.5158716939533524e-06, | |
| "loss": 0.27242002487182615, | |
| "memory(GiB)": 137.67, | |
| "step": 2130, | |
| "token_acc": 0.8990930988723483, | |
| "train_speed(iter/s)": 0.040593 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 0.1708640456199646, | |
| "learning_rate": 1.4818737549544725e-06, | |
| "loss": 0.27319111824035647, | |
| "memory(GiB)": 137.67, | |
| "step": 2135, | |
| "token_acc": 0.8916305799253722, | |
| "train_speed(iter/s)": 0.040593 | |
| }, | |
| { | |
| "epoch": 2.5058548009367683, | |
| "grad_norm": 0.17307148873806, | |
| "learning_rate": 1.448230872972568e-06, | |
| "loss": 0.2695432424545288, | |
| "memory(GiB)": 137.67, | |
| "step": 2140, | |
| "token_acc": 0.905385863209386, | |
| "train_speed(iter/s)": 0.04059 | |
| }, | |
| { | |
| "epoch": 2.5117096018735365, | |
| "grad_norm": 0.17106083035469055, | |
| "learning_rate": 1.4149444503279297e-06, | |
| "loss": 0.27602252960205076, | |
| "memory(GiB)": 137.67, | |
| "step": 2145, | |
| "token_acc": 0.8923752322136868, | |
| "train_speed(iter/s)": 0.040589 | |
| }, | |
| { | |
| "epoch": 2.5175644028103044, | |
| "grad_norm": 0.17844541370868683, | |
| "learning_rate": 1.382015874482735e-06, | |
| "loss": 0.2688480615615845, | |
| "memory(GiB)": 137.67, | |
| "step": 2150, | |
| "token_acc": 0.8990480241183902, | |
| "train_speed(iter/s)": 0.040588 | |
| }, | |
| { | |
| "epoch": 2.5234192037470726, | |
| "grad_norm": 0.17703387141227722, | |
| "learning_rate": 1.3494465179831895e-06, | |
| "loss": 0.26667649745941163, | |
| "memory(GiB)": 137.67, | |
| "step": 2155, | |
| "token_acc": 0.8957748501946923, | |
| "train_speed(iter/s)": 0.040587 | |
| }, | |
| { | |
| "epoch": 2.529274004683841, | |
| "grad_norm": 0.1624777913093567, | |
| "learning_rate": 1.3172377384023393e-06, | |
| "loss": 0.26247563362121584, | |
| "memory(GiB)": 137.67, | |
| "step": 2160, | |
| "token_acc": 0.9005889918174871, | |
| "train_speed(iter/s)": 0.040586 | |
| }, | |
| { | |
| "epoch": 2.5351288056206087, | |
| "grad_norm": 0.17209553718566895, | |
| "learning_rate": 1.2853908782834722e-06, | |
| "loss": 0.2671672821044922, | |
| "memory(GiB)": 137.67, | |
| "step": 2165, | |
| "token_acc": 0.9070119235517494, | |
| "train_speed(iter/s)": 0.040583 | |
| }, | |
| { | |
| "epoch": 2.540983606557377, | |
| "grad_norm": 0.17611093819141388, | |
| "learning_rate": 1.2539072650841523e-06, | |
| "loss": 0.2725430250167847, | |
| "memory(GiB)": 137.67, | |
| "step": 2170, | |
| "token_acc": 0.8966264886593998, | |
| "train_speed(iter/s)": 0.040583 | |
| }, | |
| { | |
| "epoch": 2.5468384074941453, | |
| "grad_norm": 0.1783149093389511, | |
| "learning_rate": 1.2227882111209011e-06, | |
| "loss": 0.27568228244781495, | |
| "memory(GiB)": 137.67, | |
| "step": 2175, | |
| "token_acc": 0.8947381499658321, | |
| "train_speed(iter/s)": 0.040582 | |
| }, | |
| { | |
| "epoch": 2.552693208430913, | |
| "grad_norm": 0.17337878048419952, | |
| "learning_rate": 1.1920350135144898e-06, | |
| "loss": 0.269814133644104, | |
| "memory(GiB)": 137.67, | |
| "step": 2180, | |
| "token_acc": 0.9041164343092762, | |
| "train_speed(iter/s)": 0.040581 | |
| }, | |
| { | |
| "epoch": 2.5585480093676813, | |
| "grad_norm": 0.16845281422138214, | |
| "learning_rate": 1.1616489541358678e-06, | |
| "loss": 0.26679143905639646, | |
| "memory(GiB)": 137.67, | |
| "step": 2185, | |
| "token_acc": 0.8939169722162003, | |
| "train_speed(iter/s)": 0.04058 | |
| }, | |
| { | |
| "epoch": 2.5644028103044496, | |
| "grad_norm": 0.17022623121738434, | |
| "learning_rate": 1.1316312995527424e-06, | |
| "loss": 0.2700947761535645, | |
| "memory(GiB)": 137.67, | |
| "step": 2190, | |
| "token_acc": 0.8979253112033195, | |
| "train_speed(iter/s)": 0.040579 | |
| }, | |
| { | |
| "epoch": 2.570257611241218, | |
| "grad_norm": 0.16687875986099243, | |
| "learning_rate": 1.1019833009767744e-06, | |
| "loss": 0.268681001663208, | |
| "memory(GiB)": 137.67, | |
| "step": 2195, | |
| "token_acc": 0.8966215038230679, | |
| "train_speed(iter/s)": 0.040578 | |
| }, | |
| { | |
| "epoch": 2.576112412177986, | |
| "grad_norm": 0.17770424485206604, | |
| "learning_rate": 1.072706194211426e-06, | |
| "loss": 0.27028694152832033, | |
| "memory(GiB)": 137.67, | |
| "step": 2200, | |
| "token_acc": 0.9029025046417339, | |
| "train_speed(iter/s)": 0.040577 | |
| }, | |
| { | |
| "epoch": 2.581967213114754, | |
| "grad_norm": 0.17755696177482605, | |
| "learning_rate": 1.0438011996004581e-06, | |
| "loss": 0.269865894317627, | |
| "memory(GiB)": 137.67, | |
| "step": 2205, | |
| "token_acc": 0.8967394005666286, | |
| "train_speed(iter/s)": 0.040575 | |
| }, | |
| { | |
| "epoch": 2.5878220140515222, | |
| "grad_norm": 0.17752693593502045, | |
| "learning_rate": 1.0152695219770558e-06, | |
| "loss": 0.257364559173584, | |
| "memory(GiB)": 137.67, | |
| "step": 2210, | |
| "token_acc": 0.9068669110660224, | |
| "train_speed(iter/s)": 0.040573 | |
| }, | |
| { | |
| "epoch": 2.5936768149882905, | |
| "grad_norm": 0.16752499341964722, | |
| "learning_rate": 9.871123506136037e-07, | |
| "loss": 0.2638521194458008, | |
| "memory(GiB)": 137.67, | |
| "step": 2215, | |
| "token_acc": 0.9083980061833554, | |
| "train_speed(iter/s)": 0.040572 | |
| }, | |
| { | |
| "epoch": 2.5995316159250583, | |
| "grad_norm": 0.16032443940639496, | |
| "learning_rate": 9.593308591721274e-07, | |
| "loss": 0.2622210025787354, | |
| "memory(GiB)": 137.67, | |
| "step": 2220, | |
| "token_acc": 0.900316748757648, | |
| "train_speed(iter/s)": 0.040571 | |
| }, | |
| { | |
| "epoch": 2.6053864168618266, | |
| "grad_norm": 0.17415659129619598, | |
| "learning_rate": 9.319262056553602e-07, | |
| "loss": 0.2700244903564453, | |
| "memory(GiB)": 137.67, | |
| "step": 2225, | |
| "token_acc": 0.9051188644286028, | |
| "train_speed(iter/s)": 0.040569 | |
| }, | |
| { | |
| "epoch": 2.611241217798595, | |
| "grad_norm": 0.1722276359796524, | |
| "learning_rate": 9.048995323584764e-07, | |
| "loss": 0.2738530397415161, | |
| "memory(GiB)": 137.67, | |
| "step": 2230, | |
| "token_acc": 0.9079698943901274, | |
| "train_speed(iter/s)": 0.040568 | |
| }, | |
| { | |
| "epoch": 2.617096018735363, | |
| "grad_norm": 0.17455357313156128, | |
| "learning_rate": 8.78251965821485e-07, | |
| "loss": 0.25915350914001467, | |
| "memory(GiB)": 137.67, | |
| "step": 2235, | |
| "token_acc": 0.9004381754945836, | |
| "train_speed(iter/s)": 0.040566 | |
| }, | |
| { | |
| "epoch": 2.6229508196721314, | |
| "grad_norm": 0.17298012971878052, | |
| "learning_rate": 8.519846167822665e-07, | |
| "loss": 0.2638465404510498, | |
| "memory(GiB)": 137.67, | |
| "step": 2240, | |
| "token_acc": 0.9118884831119326, | |
| "train_speed(iter/s)": 0.040565 | |
| }, | |
| { | |
| "epoch": 2.628805620608899, | |
| "grad_norm": 0.1699805110692978, | |
| "learning_rate": 8.260985801302734e-07, | |
| "loss": 0.25593223571777346, | |
| "memory(GiB)": 137.67, | |
| "step": 2245, | |
| "token_acc": 0.8991087959330969, | |
| "train_speed(iter/s)": 0.040565 | |
| }, | |
| { | |
| "epoch": 2.6346604215456675, | |
| "grad_norm": 0.1722072809934616, | |
| "learning_rate": 8.005949348608977e-07, | |
| "loss": 0.2674243927001953, | |
| "memory(GiB)": 137.67, | |
| "step": 2250, | |
| "token_acc": 0.8965253065997911, | |
| "train_speed(iter/s)": 0.040563 | |
| }, | |
| { | |
| "epoch": 2.6405152224824358, | |
| "grad_norm": 0.1668199747800827, | |
| "learning_rate": 7.754747440304911e-07, | |
| "loss": 0.27177164554595945, | |
| "memory(GiB)": 137.67, | |
| "step": 2255, | |
| "token_acc": 0.8954008941320247, | |
| "train_speed(iter/s)": 0.040563 | |
| }, | |
| { | |
| "epoch": 2.6463700234192036, | |
| "grad_norm": 0.16813580691814423, | |
| "learning_rate": 7.507390547120541e-07, | |
| "loss": 0.2651193857192993, | |
| "memory(GiB)": 137.67, | |
| "step": 2260, | |
| "token_acc": 0.8984925665335315, | |
| "train_speed(iter/s)": 0.040562 | |
| }, | |
| { | |
| "epoch": 2.652224824355972, | |
| "grad_norm": 0.17678076028823853, | |
| "learning_rate": 7.263888979515954e-07, | |
| "loss": 0.27275819778442384, | |
| "memory(GiB)": 137.67, | |
| "step": 2265, | |
| "token_acc": 0.8936288874184706, | |
| "train_speed(iter/s)": 0.040562 | |
| }, | |
| { | |
| "epoch": 2.65807962529274, | |
| "grad_norm": 0.16264022886753082, | |
| "learning_rate": 7.024252887251548e-07, | |
| "loss": 0.2669191360473633, | |
| "memory(GiB)": 137.67, | |
| "step": 2270, | |
| "token_acc": 0.8972385552618926, | |
| "train_speed(iter/s)": 0.04056 | |
| }, | |
| { | |
| "epoch": 2.663934426229508, | |
| "grad_norm": 0.1690565049648285, | |
| "learning_rate": 6.788492258964896e-07, | |
| "loss": 0.2695984125137329, | |
| "memory(GiB)": 137.67, | |
| "step": 2275, | |
| "token_acc": 0.8963350061434133, | |
| "train_speed(iter/s)": 0.040559 | |
| }, | |
| { | |
| "epoch": 2.669789227166276, | |
| "grad_norm": 0.1730775386095047, | |
| "learning_rate": 6.556616921754489e-07, | |
| "loss": 0.26709651947021484, | |
| "memory(GiB)": 137.67, | |
| "step": 2280, | |
| "token_acc": 0.9004803898235022, | |
| "train_speed(iter/s)": 0.040558 | |
| }, | |
| { | |
| "epoch": 2.6756440281030445, | |
| "grad_norm": 0.1701081544160843, | |
| "learning_rate": 6.328636540770028e-07, | |
| "loss": 0.26933286190032957, | |
| "memory(GiB)": 137.67, | |
| "step": 2285, | |
| "token_acc": 0.898853457766213, | |
| "train_speed(iter/s)": 0.040557 | |
| }, | |
| { | |
| "epoch": 2.6814988290398127, | |
| "grad_norm": 0.19118832051753998, | |
| "learning_rate": 6.10456061880963e-07, | |
| "loss": 0.2741654396057129, | |
| "memory(GiB)": 137.67, | |
| "step": 2290, | |
| "token_acc": 0.9025216185680262, | |
| "train_speed(iter/s)": 0.040556 | |
| }, | |
| { | |
| "epoch": 2.687353629976581, | |
| "grad_norm": 0.17062994837760925, | |
| "learning_rate": 5.884398495923727e-07, | |
| "loss": 0.2640299558639526, | |
| "memory(GiB)": 137.67, | |
| "step": 2295, | |
| "token_acc": 0.8934425971755339, | |
| "train_speed(iter/s)": 0.040556 | |
| }, | |
| { | |
| "epoch": 2.693208430913349, | |
| "grad_norm": 0.18749327957630157, | |
| "learning_rate": 5.668159349025649e-07, | |
| "loss": 0.2795866966247559, | |
| "memory(GiB)": 137.67, | |
| "step": 2300, | |
| "token_acc": 0.8874596974206349, | |
| "train_speed(iter/s)": 0.040554 | |
| }, | |
| { | |
| "epoch": 2.699063231850117, | |
| "grad_norm": 0.1760568916797638, | |
| "learning_rate": 5.455852191509214e-07, | |
| "loss": 0.27616961002349855, | |
| "memory(GiB)": 137.67, | |
| "step": 2305, | |
| "token_acc": 0.8910418230197176, | |
| "train_speed(iter/s)": 0.040553 | |
| }, | |
| { | |
| "epoch": 2.7049180327868854, | |
| "grad_norm": 0.1760990172624588, | |
| "learning_rate": 5.247485872873026e-07, | |
| "loss": 0.26389687061309813, | |
| "memory(GiB)": 137.67, | |
| "step": 2310, | |
| "token_acc": 0.9032378371322547, | |
| "train_speed(iter/s)": 0.040552 | |
| }, | |
| { | |
| "epoch": 2.710772833723653, | |
| "grad_norm": 0.16184002161026, | |
| "learning_rate": 5.043069078351526e-07, | |
| "loss": 0.2583066463470459, | |
| "memory(GiB)": 137.67, | |
| "step": 2315, | |
| "token_acc": 0.9048499210110584, | |
| "train_speed(iter/s)": 0.040551 | |
| }, | |
| { | |
| "epoch": 2.7166276346604215, | |
| "grad_norm": 0.16953077912330627, | |
| "learning_rate": 4.842610328552999e-07, | |
| "loss": 0.26470949649810793, | |
| "memory(GiB)": 137.67, | |
| "step": 2320, | |
| "token_acc": 0.9023021945368386, | |
| "train_speed(iter/s)": 0.04055 | |
| }, | |
| { | |
| "epoch": 2.7224824355971897, | |
| "grad_norm": 0.16833004355430603, | |
| "learning_rate": 4.6461179791044806e-07, | |
| "loss": 0.26623120307922366, | |
| "memory(GiB)": 137.67, | |
| "step": 2325, | |
| "token_acc": 0.895680773698298, | |
| "train_speed(iter/s)": 0.04055 | |
| }, | |
| { | |
| "epoch": 2.728337236533958, | |
| "grad_norm": 0.1694810837507248, | |
| "learning_rate": 4.453600220303378e-07, | |
| "loss": 0.25267777442932127, | |
| "memory(GiB)": 137.67, | |
| "step": 2330, | |
| "token_acc": 0.8968080577917444, | |
| "train_speed(iter/s)": 0.04055 | |
| }, | |
| { | |
| "epoch": 2.7341920374707263, | |
| "grad_norm": 0.18032941222190857, | |
| "learning_rate": 4.2650650767761535e-07, | |
| "loss": 0.25408167839050294, | |
| "memory(GiB)": 137.67, | |
| "step": 2335, | |
| "token_acc": 0.9085095809749435, | |
| "train_speed(iter/s)": 0.040549 | |
| }, | |
| { | |
| "epoch": 2.740046838407494, | |
| "grad_norm": 0.18011276423931122, | |
| "learning_rate": 4.0805204071437953e-07, | |
| "loss": 0.27644264698028564, | |
| "memory(GiB)": 137.67, | |
| "step": 2340, | |
| "token_acc": 0.8965790537297598, | |
| "train_speed(iter/s)": 0.040547 | |
| }, | |
| { | |
| "epoch": 2.7459016393442623, | |
| "grad_norm": 0.16562311351299286, | |
| "learning_rate": 3.899973903694243e-07, | |
| "loss": 0.26986749172210694, | |
| "memory(GiB)": 137.67, | |
| "step": 2345, | |
| "token_acc": 0.9012060017454879, | |
| "train_speed(iter/s)": 0.040546 | |
| }, | |
| { | |
| "epoch": 2.7517564402810306, | |
| "grad_norm": 0.17436754703521729, | |
| "learning_rate": 3.72343309206179e-07, | |
| "loss": 0.26195201873779295, | |
| "memory(GiB)": 137.67, | |
| "step": 2350, | |
| "token_acc": 0.9009433222876742, | |
| "train_speed(iter/s)": 0.040545 | |
| }, | |
| { | |
| "epoch": 2.7576112412177984, | |
| "grad_norm": 0.1674078106880188, | |
| "learning_rate": 3.55090533091339e-07, | |
| "loss": 0.26260790824890134, | |
| "memory(GiB)": 137.67, | |
| "step": 2355, | |
| "token_acc": 0.9115999937809979, | |
| "train_speed(iter/s)": 0.040543 | |
| }, | |
| { | |
| "epoch": 2.7634660421545667, | |
| "grad_norm": 0.1657068282365799, | |
| "learning_rate": 3.382397811641858e-07, | |
| "loss": 0.25954129695892336, | |
| "memory(GiB)": 137.67, | |
| "step": 2360, | |
| "token_acc": 0.9021908567865544, | |
| "train_speed(iter/s)": 0.040543 | |
| }, | |
| { | |
| "epoch": 2.769320843091335, | |
| "grad_norm": 0.167274609208107, | |
| "learning_rate": 3.217917558066241e-07, | |
| "loss": 0.262769889831543, | |
| "memory(GiB)": 137.67, | |
| "step": 2365, | |
| "token_acc": 0.8952377080453587, | |
| "train_speed(iter/s)": 0.040542 | |
| }, | |
| { | |
| "epoch": 2.775175644028103, | |
| "grad_norm": 0.16418085992336273, | |
| "learning_rate": 3.057471426138958e-07, | |
| "loss": 0.2759857654571533, | |
| "memory(GiB)": 137.67, | |
| "step": 2370, | |
| "token_acc": 0.8904371253200432, | |
| "train_speed(iter/s)": 0.04054 | |
| }, | |
| { | |
| "epoch": 2.781030444964871, | |
| "grad_norm": 0.16312485933303833, | |
| "learning_rate": 2.901066103660033e-07, | |
| "loss": 0.26541569232940676, | |
| "memory(GiB)": 137.67, | |
| "step": 2375, | |
| "token_acc": 0.9018337335217314, | |
| "train_speed(iter/s)": 0.04054 | |
| }, | |
| { | |
| "epoch": 2.7868852459016393, | |
| "grad_norm": 0.17677490413188934, | |
| "learning_rate": 2.7487081099983435e-07, | |
| "loss": 0.27631726264953616, | |
| "memory(GiB)": 137.67, | |
| "step": 2380, | |
| "token_acc": 0.9002755878263168, | |
| "train_speed(iter/s)": 0.040539 | |
| }, | |
| { | |
| "epoch": 2.7927400468384076, | |
| "grad_norm": 0.1672162115573883, | |
| "learning_rate": 2.6004037958199167e-07, | |
| "loss": 0.26006388664245605, | |
| "memory(GiB)": 137.67, | |
| "step": 2385, | |
| "token_acc": 0.910639127168484, | |
| "train_speed(iter/s)": 0.040538 | |
| }, | |
| { | |
| "epoch": 2.798594847775176, | |
| "grad_norm": 0.1678304672241211, | |
| "learning_rate": 2.4561593428231165e-07, | |
| "loss": 0.26682395935058595, | |
| "memory(GiB)": 137.67, | |
| "step": 2390, | |
| "token_acc": 0.91889434727678, | |
| "train_speed(iter/s)": 0.040535 | |
| }, | |
| { | |
| "epoch": 2.8044496487119437, | |
| "grad_norm": 0.16077911853790283, | |
| "learning_rate": 2.3159807634811182e-07, | |
| "loss": 0.2570212364196777, | |
| "memory(GiB)": 137.67, | |
| "step": 2395, | |
| "token_acc": 0.9051587858378934, | |
| "train_speed(iter/s)": 0.040535 | |
| }, | |
| { | |
| "epoch": 2.810304449648712, | |
| "grad_norm": 0.16872599720954895, | |
| "learning_rate": 2.1798739007911517e-07, | |
| "loss": 0.27098655700683594, | |
| "memory(GiB)": 137.67, | |
| "step": 2400, | |
| "token_acc": 0.8959861646097005, | |
| "train_speed(iter/s)": 0.040533 | |
| }, | |
| { | |
| "epoch": 2.8161592505854802, | |
| "grad_norm": 0.16125863790512085, | |
| "learning_rate": 2.0478444280310206e-07, | |
| "loss": 0.26554141044616697, | |
| "memory(GiB)": 137.67, | |
| "step": 2405, | |
| "token_acc": 0.8993798050995196, | |
| "train_speed(iter/s)": 0.040533 | |
| }, | |
| { | |
| "epoch": 2.822014051522248, | |
| "grad_norm": 0.19162511825561523, | |
| "learning_rate": 1.919897848522656e-07, | |
| "loss": 0.26296229362487794, | |
| "memory(GiB)": 137.67, | |
| "step": 2410, | |
| "token_acc": 0.8993982865613145, | |
| "train_speed(iter/s)": 0.040532 | |
| }, | |
| { | |
| "epoch": 2.8278688524590163, | |
| "grad_norm": 0.20407338440418243, | |
| "learning_rate": 1.796039495402646e-07, | |
| "loss": 0.26827549934387207, | |
| "memory(GiB)": 137.67, | |
| "step": 2415, | |
| "token_acc": 0.9050311652650377, | |
| "train_speed(iter/s)": 0.04053 | |
| }, | |
| { | |
| "epoch": 2.8337236533957846, | |
| "grad_norm": 0.17013327777385712, | |
| "learning_rate": 1.6762745313999795e-07, | |
| "loss": 0.2727066516876221, | |
| "memory(GiB)": 137.67, | |
| "step": 2420, | |
| "token_acc": 0.8865242476220178, | |
| "train_speed(iter/s)": 0.040529 | |
| }, | |
| { | |
| "epoch": 2.839578454332553, | |
| "grad_norm": 0.1698453575372696, | |
| "learning_rate": 1.5606079486208846e-07, | |
| "loss": 0.2641671895980835, | |
| "memory(GiB)": 137.67, | |
| "step": 2425, | |
| "token_acc": 0.9000177898735047, | |
| "train_speed(iter/s)": 0.040529 | |
| }, | |
| { | |
| "epoch": 2.845433255269321, | |
| "grad_norm": 0.17142532765865326, | |
| "learning_rate": 1.449044568340663e-07, | |
| "loss": 0.2717731952667236, | |
| "memory(GiB)": 137.67, | |
| "step": 2430, | |
| "token_acc": 0.9031580860350494, | |
| "train_speed(iter/s)": 0.040528 | |
| }, | |
| { | |
| "epoch": 2.851288056206089, | |
| "grad_norm": 0.1803494244813919, | |
| "learning_rate": 1.3415890408027932e-07, | |
| "loss": 0.26016151905059814, | |
| "memory(GiB)": 137.67, | |
| "step": 2435, | |
| "token_acc": 0.9004292620366133, | |
| "train_speed(iter/s)": 0.040526 | |
| }, | |
| { | |
| "epoch": 2.857142857142857, | |
| "grad_norm": 0.17327673733234406, | |
| "learning_rate": 1.2382458450250657e-07, | |
| "loss": 0.2739871025085449, | |
| "memory(GiB)": 137.67, | |
| "step": 2440, | |
| "token_acc": 0.8937226907040563, | |
| "train_speed(iter/s)": 0.040526 | |
| }, | |
| { | |
| "epoch": 2.8629976580796255, | |
| "grad_norm": 0.1648455113172531, | |
| "learning_rate": 1.1390192886129304e-07, | |
| "loss": 0.26163692474365235, | |
| "memory(GiB)": 137.67, | |
| "step": 2445, | |
| "token_acc": 0.9109708459314515, | |
| "train_speed(iter/s)": 0.040525 | |
| }, | |
| { | |
| "epoch": 2.8688524590163933, | |
| "grad_norm": 0.17209313809871674, | |
| "learning_rate": 1.0439135075798634e-07, | |
| "loss": 0.2778321266174316, | |
| "memory(GiB)": 137.67, | |
| "step": 2450, | |
| "token_acc": 0.8971170667512587, | |
| "train_speed(iter/s)": 0.040525 | |
| }, | |
| { | |
| "epoch": 2.8747072599531616, | |
| "grad_norm": 0.16632598638534546, | |
| "learning_rate": 9.529324661750494e-08, | |
| "loss": 0.2714024305343628, | |
| "memory(GiB)": 137.67, | |
| "step": 2455, | |
| "token_acc": 0.8926179928835372, | |
| "train_speed(iter/s)": 0.040524 | |
| }, | |
| { | |
| "epoch": 2.88056206088993, | |
| "grad_norm": 0.17401184141635895, | |
| "learning_rate": 8.6607995671808e-08, | |
| "loss": 0.2663599967956543, | |
| "memory(GiB)": 137.67, | |
| "step": 2460, | |
| "token_acc": 0.8979368591641474, | |
| "train_speed(iter/s)": 0.040523 | |
| }, | |
| { | |
| "epoch": 2.8864168618266977, | |
| "grad_norm": 0.17087528109550476, | |
| "learning_rate": 7.833595994409248e-08, | |
| "loss": 0.2583767414093018, | |
| "memory(GiB)": 137.67, | |
| "step": 2465, | |
| "token_acc": 0.8988238974038161, | |
| "train_speed(iter/s)": 0.040522 | |
| }, | |
| { | |
| "epoch": 2.892271662763466, | |
| "grad_norm": 0.17502275109291077, | |
| "learning_rate": 7.047748423370193e-08, | |
| "loss": 0.27132668495178225, | |
| "memory(GiB)": 137.67, | |
| "step": 2470, | |
| "token_acc": 0.8950027089407572, | |
| "train_speed(iter/s)": 0.040522 | |
| }, | |
| { | |
| "epoch": 2.898126463700234, | |
| "grad_norm": 0.16457100212574005, | |
| "learning_rate": 6.303289610175233e-08, | |
| "loss": 0.262396240234375, | |
| "memory(GiB)": 137.67, | |
| "step": 2475, | |
| "token_acc": 0.9005705329153605, | |
| "train_speed(iter/s)": 0.040522 | |
| }, | |
| { | |
| "epoch": 2.9039812646370025, | |
| "grad_norm": 0.17186148464679718, | |
| "learning_rate": 5.6002505857480906e-08, | |
| "loss": 0.2651688098907471, | |
| "memory(GiB)": 137.67, | |
| "step": 2480, | |
| "token_acc": 0.903142540689707, | |
| "train_speed(iter/s)": 0.040521 | |
| }, | |
| { | |
| "epoch": 2.9098360655737707, | |
| "grad_norm": 0.16921843588352203, | |
| "learning_rate": 4.938660654530969e-08, | |
| "loss": 0.27781147956848146, | |
| "memory(GiB)": 137.67, | |
| "step": 2485, | |
| "token_acc": 0.8947337181986305, | |
| "train_speed(iter/s)": 0.040521 | |
| }, | |
| { | |
| "epoch": 2.9156908665105385, | |
| "grad_norm": 0.17168040573596954, | |
| "learning_rate": 4.318547393263317e-08, | |
| "loss": 0.27856767177581787, | |
| "memory(GiB)": 137.67, | |
| "step": 2490, | |
| "token_acc": 0.8994483098446597, | |
| "train_speed(iter/s)": 0.04052 | |
| }, | |
| { | |
| "epoch": 2.921545667447307, | |
| "grad_norm": 0.17257463932037354, | |
| "learning_rate": 3.739936649832188e-08, | |
| "loss": 0.26465725898742676, | |
| "memory(GiB)": 137.67, | |
| "step": 2495, | |
| "token_acc": 0.9003965374896801, | |
| "train_speed(iter/s)": 0.04052 | |
| }, | |
| { | |
| "epoch": 2.927400468384075, | |
| "grad_norm": 0.17007899284362793, | |
| "learning_rate": 3.2028525421946563e-08, | |
| "loss": 0.26408021450042723, | |
| "memory(GiB)": 137.67, | |
| "step": 2500, | |
| "token_acc": 0.9105243972950552, | |
| "train_speed(iter/s)": 0.04052 | |
| }, | |
| { | |
| "epoch": 2.933255269320843, | |
| "grad_norm": 0.16546528041362762, | |
| "learning_rate": 2.70731745737296e-08, | |
| "loss": 0.26817855834960935, | |
| "memory(GiB)": 137.67, | |
| "step": 2505, | |
| "token_acc": 0.9032225815017886, | |
| "train_speed(iter/s)": 0.040519 | |
| }, | |
| { | |
| "epoch": 2.939110070257611, | |
| "grad_norm": 0.1731211543083191, | |
| "learning_rate": 2.2533520505211294e-08, | |
| "loss": 0.26341302394866944, | |
| "memory(GiB)": 137.67, | |
| "step": 2510, | |
| "token_acc": 0.9048233016983017, | |
| "train_speed(iter/s)": 0.040519 | |
| }, | |
| { | |
| "epoch": 2.9449648711943794, | |
| "grad_norm": 0.16093143820762634, | |
| "learning_rate": 1.8409752440639027e-08, | |
| "loss": 0.25573346614837644, | |
| "memory(GiB)": 137.67, | |
| "step": 2515, | |
| "token_acc": 0.9019553343056392, | |
| "train_speed(iter/s)": 0.040518 | |
| }, | |
| { | |
| "epoch": 2.9508196721311473, | |
| "grad_norm": 0.16452209651470184, | |
| "learning_rate": 1.470204226908134e-08, | |
| "loss": 0.2707658767700195, | |
| "memory(GiB)": 137.67, | |
| "step": 2520, | |
| "token_acc": 0.904132819893002, | |
| "train_speed(iter/s)": 0.040517 | |
| }, | |
| { | |
| "epoch": 2.9566744730679155, | |
| "grad_norm": 0.1768556386232376, | |
| "learning_rate": 1.1410544537263645e-08, | |
| "loss": 0.27701735496520996, | |
| "memory(GiB)": 137.67, | |
| "step": 2525, | |
| "token_acc": 0.903024352910179, | |
| "train_speed(iter/s)": 0.040515 | |
| }, | |
| { | |
| "epoch": 2.962529274004684, | |
| "grad_norm": 0.16568534076213837, | |
| "learning_rate": 8.535396443124511e-09, | |
| "loss": 0.25813367366790774, | |
| "memory(GiB)": 137.67, | |
| "step": 2530, | |
| "token_acc": 0.9017673177727538, | |
| "train_speed(iter/s)": 0.040514 | |
| }, | |
| { | |
| "epoch": 2.968384074941452, | |
| "grad_norm": 0.16622532904148102, | |
| "learning_rate": 6.076717830098e-09, | |
| "loss": 0.260286283493042, | |
| "memory(GiB)": 137.67, | |
| "step": 2535, | |
| "token_acc": 0.9083364106929379, | |
| "train_speed(iter/s)": 0.040513 | |
| }, | |
| { | |
| "epoch": 2.9742388758782203, | |
| "grad_norm": 0.17745059728622437, | |
| "learning_rate": 4.034611182121007e-09, | |
| "loss": 0.26159353256225587, | |
| "memory(GiB)": 137.67, | |
| "step": 2540, | |
| "token_acc": 0.9072020079994492, | |
| "train_speed(iter/s)": 0.040512 | |
| }, | |
| { | |
| "epoch": 2.980093676814988, | |
| "grad_norm": 0.16991080343723297, | |
| "learning_rate": 2.40916161935445e-09, | |
| "loss": 0.26626038551330566, | |
| "memory(GiB)": 137.67, | |
| "step": 2545, | |
| "token_acc": 0.8986437875498561, | |
| "train_speed(iter/s)": 0.040511 | |
| }, | |
| { | |
| "epoch": 2.9859484777517564, | |
| "grad_norm": 0.16490155458450317, | |
| "learning_rate": 1.2004368946427758e-09, | |
| "loss": 0.2636513948440552, | |
| "memory(GiB)": 137.67, | |
| "step": 2550, | |
| "token_acc": 0.9014935708777286, | |
| "train_speed(iter/s)": 0.040511 | |
| }, | |
| { | |
| "epoch": 2.9918032786885247, | |
| "grad_norm": 0.1677451878786087, | |
| "learning_rate": 4.084873906851083e-10, | |
| "loss": 0.26745948791503904, | |
| "memory(GiB)": 137.67, | |
| "step": 2555, | |
| "token_acc": 0.9085500921651726, | |
| "train_speed(iter/s)": 0.04051 | |
| }, | |
| { | |
| "epoch": 2.9976580796252925, | |
| "grad_norm": 0.1645430028438568, | |
| "learning_rate": 3.334611793692766e-11, | |
| "loss": 0.26831555366516113, | |
| "memory(GiB)": 137.67, | |
| "step": 2560, | |
| "token_acc": 0.9117214925099609, | |
| "train_speed(iter/s)": 0.040508 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 2562, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 1.0, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3414295945805824.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |