{
  "best_global_step": 400,
  "best_metric": 0.38200808,
  "best_model_checkpoint": "/home/ubuntu/output/v1-20250507-032200/checkpoint-400",
  "epoch": 4.123711340206185,
  "eval_steps": 50,
  "global_step": 400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.010309278350515464,
      "grad_norm": 2.8573532540412123,
      "learning_rate": 4e-08,
      "loss": 0.4424864649772644,
      "memory(GiB)": 37.04,
      "step": 1,
      "token_acc": 0.8862085383272497,
      "train_speed(iter/s)": 0.083623
    },
    {
      "epoch": 0.05154639175257732,
      "grad_norm": 2.8403080546549457,
      "learning_rate": 2e-07,
      "loss": 0.41638126969337463,
      "memory(GiB)": 37.04,
      "step": 5,
      "token_acc": 0.848388504457328,
      "train_speed(iter/s)": 0.199696
    },
    {
      "epoch": 0.10309278350515463,
      "grad_norm": 2.6629741598617374,
      "learning_rate": 4e-07,
      "loss": 0.4429419994354248,
      "memory(GiB)": 37.04,
      "step": 10,
      "token_acc": 0.8634871926011166,
      "train_speed(iter/s)": 0.246696
    },
    {
      "epoch": 0.15463917525773196,
      "grad_norm": 3.095692053663771,
      "learning_rate": 6e-07,
      "loss": 0.45945310592651367,
      "memory(GiB)": 37.04,
      "step": 15,
      "token_acc": 0.8555051349414855,
      "train_speed(iter/s)": 0.275511
    },
    {
      "epoch": 0.20618556701030927,
      "grad_norm": 2.637907165803134,
      "learning_rate": 8e-07,
      "loss": 0.4365420341491699,
      "memory(GiB)": 37.04,
      "step": 20,
      "token_acc": 0.8420170882894957,
      "train_speed(iter/s)": 0.28272
    },
    {
      "epoch": 0.25773195876288657,
      "grad_norm": 2.476440678075636,
      "learning_rate": 1e-06,
      "loss": 0.4216726779937744,
      "memory(GiB)": 37.04,
      "step": 25,
      "token_acc": 0.8617449014193673,
      "train_speed(iter/s)": 0.284174
    },
    {
      "epoch": 0.30927835051546393,
      "grad_norm": 2.2717829784567276,
      "learning_rate": 9.99708511183087e-07,
      "loss": 0.4195085048675537,
      "memory(GiB)": 37.04,
      "step": 30,
      "token_acc": 0.8650009974067425,
      "train_speed(iter/s)": 0.28951
    },
    {
      "epoch": 0.36082474226804123,
      "grad_norm": 2.077114274728423,
      "learning_rate": 9.988343845952696e-07,
      "loss": 0.3923480987548828,
      "memory(GiB)": 37.04,
      "step": 35,
      "token_acc": 0.85941012498876,
      "train_speed(iter/s)": 0.288231
    },
    {
      "epoch": 0.41237113402061853,
      "grad_norm": 1.734736391920931,
      "learning_rate": 9.973786394290473e-07,
      "loss": 0.3943674564361572,
      "memory(GiB)": 37.04,
      "step": 40,
      "token_acc": 0.8583430377640991,
      "train_speed(iter/s)": 0.28752
    },
    {
      "epoch": 0.4639175257731959,
      "grad_norm": 1.3825734749873666,
      "learning_rate": 9.953429730181652e-07,
      "loss": 0.39246172904968263,
      "memory(GiB)": 37.04,
      "step": 45,
      "token_acc": 0.86418609718893,
      "train_speed(iter/s)": 0.291708
    },
    {
      "epoch": 0.5154639175257731,
      "grad_norm": 0.8751053891908908,
      "learning_rate": 9.927297588585983e-07,
      "loss": 0.39423561096191406,
      "memory(GiB)": 37.04,
      "step": 50,
      "token_acc": 0.8659434116556725,
      "train_speed(iter/s)": 0.294091
    },
    {
      "epoch": 0.5154639175257731,
      "eval_loss": 0.41471877694129944,
      "eval_runtime": 0.9078,
      "eval_samples_per_second": 16.524,
      "eval_steps_per_second": 2.203,
      "eval_token_acc": 0.8572620288991509,
      "step": 50
    },
    {
      "epoch": 0.5670103092783505,
      "grad_norm": 0.8663484008448668,
      "learning_rate": 9.895420438411615e-07,
      "loss": 0.3726518154144287,
      "memory(GiB)": 37.04,
      "step": 55,
      "token_acc": 0.8743935309973045,
      "train_speed(iter/s)": 0.230989
    },
    {
      "epoch": 0.6185567010309279,
      "grad_norm": 0.667816147564415,
      "learning_rate": 9.857835446989707e-07,
      "loss": 0.36932010650634767,
      "memory(GiB)": 37.04,
      "step": 60,
      "token_acc": 0.8602125147579693,
      "train_speed(iter/s)": 0.23604
    },
    {
      "epoch": 0.6701030927835051,
      "grad_norm": 0.8293459879499964,
      "learning_rate": 9.814586436738997e-07,
      "loss": 0.3804448366165161,
      "memory(GiB)": 37.04,
      "step": 65,
      "token_acc": 0.8690644468875975,
      "train_speed(iter/s)": 0.241146
    },
    {
      "epoch": 0.7216494845360825,
      "grad_norm": 0.7787810832421912,
      "learning_rate": 9.765723834070804e-07,
      "loss": 0.36735010147094727,
      "memory(GiB)": 37.04,
      "step": 70,
      "token_acc": 0.8663579855385005,
      "train_speed(iter/s)": 0.243142
    },
    {
      "epoch": 0.7731958762886598,
      "grad_norm": 0.7336453374485993,
      "learning_rate": 9.711304610594102e-07,
      "loss": 0.3681507110595703,
      "memory(GiB)": 37.04,
      "step": 75,
      "token_acc": 0.8693066207544217,
      "train_speed(iter/s)": 0.245585
    },
    {
      "epoch": 0.8247422680412371,
      "grad_norm": 0.8189938563466513,
      "learning_rate": 9.651392216689165e-07,
      "loss": 0.3353311061859131,
      "memory(GiB)": 37.04,
      "step": 80,
      "token_acc": 0.8910798995442284,
      "train_speed(iter/s)": 0.249085
    },
    {
      "epoch": 0.8762886597938144,
      "grad_norm": 0.6779929134307079,
      "learning_rate": 9.586056507527264e-07,
      "loss": 0.34499826431274416,
      "memory(GiB)": 37.04,
      "step": 85,
      "token_acc": 0.8659364094610315,
      "train_speed(iter/s)": 0.252498
    },
    {
      "epoch": 0.9278350515463918,
      "grad_norm": 0.7719151294405667,
      "learning_rate": 9.515373661622663e-07,
      "loss": 0.3747685432434082,
      "memory(GiB)": 37.04,
      "step": 90,
      "token_acc": 0.8549843014128728,
      "train_speed(iter/s)": 0.253844
    },
    {
      "epoch": 0.979381443298969,
      "grad_norm": 0.6973169912038593,
      "learning_rate": 9.439426092011875e-07,
      "loss": 0.36587209701538087,
      "memory(GiB)": 37.04,
      "step": 95,
      "token_acc": 0.8687297229847767,
      "train_speed(iter/s)": 0.257063
    },
    {
      "epoch": 1.0309278350515463,
      "grad_norm": 0.692769567261216,
      "learning_rate": 9.358302350163756e-07,
      "loss": 0.3589401006698608,
      "memory(GiB)": 37.04,
      "step": 100,
      "token_acc": 0.8745442333525235,
      "train_speed(iter/s)": 0.259875
    },
    {
      "epoch": 1.0309278350515463,
      "eval_loss": 0.3933715224266052,
      "eval_runtime": 0.9247,
      "eval_samples_per_second": 16.222,
      "eval_steps_per_second": 2.163,
      "eval_token_acc": 0.862684343810517,
      "step": 100
    },
    {
      "epoch": 1.0824742268041236,
      "grad_norm": 0.6544219439665868,
      "learning_rate": 9.272097022732443e-07,
      "loss": 0.33444135189056395,
      "memory(GiB)": 37.04,
      "step": 105,
      "token_acc": 0.8807363927427961,
      "train_speed(iter/s)": 0.232768
    },
    {
      "epoch": 1.134020618556701,
      "grad_norm": 0.6367223115088674,
      "learning_rate": 9.180910621273555e-07,
      "loss": 0.34373011589050295,
      "memory(GiB)": 37.04,
      "step": 110,
      "token_acc": 0.8613336475023563,
      "train_speed(iter/s)": 0.235371
    },
    {
      "epoch": 1.1855670103092784,
      "grad_norm": 0.7492955478131063,
      "learning_rate": 9.084849465052209e-07,
      "loss": 0.3586832046508789,
      "memory(GiB)": 37.04,
      "step": 115,
      "token_acc": 0.8713512419689381,
      "train_speed(iter/s)": 0.237153
    },
    {
      "epoch": 1.2371134020618557,
      "grad_norm": 0.7071477506615176,
      "learning_rate": 8.984025557079522e-07,
      "loss": 0.3494635343551636,
      "memory(GiB)": 37.04,
      "step": 120,
      "token_acc": 0.8666221703098059,
      "train_speed(iter/s)": 0.239655
    },
    {
      "epoch": 1.2886597938144329,
      "grad_norm": 0.6092307857366024,
      "learning_rate": 8.878556453522099e-07,
      "loss": 0.3500272512435913,
      "memory(GiB)": 37.04,
      "step": 125,
      "token_acc": 0.8708257830701527,
      "train_speed(iter/s)": 0.241223
    },
    {
      "epoch": 1.3402061855670104,
      "grad_norm": 0.6624451351751124,
      "learning_rate": 8.768565126636805e-07,
      "loss": 0.3489999294281006,
      "memory(GiB)": 37.04,
      "step": 130,
      "token_acc": 0.8813361936867913,
      "train_speed(iter/s)": 0.243732
    },
    {
      "epoch": 1.3917525773195876,
      "grad_norm": 0.7828961232174202,
      "learning_rate": 8.654179821390621e-07,
      "loss": 0.3605631113052368,
      "memory(GiB)": 37.04,
      "step": 135,
      "token_acc": 0.8725261932479628,
      "train_speed(iter/s)": 0.245988
    },
    {
      "epoch": 1.443298969072165,
      "grad_norm": 0.7711241338182571,
      "learning_rate": 8.535533905932737e-07,
      "loss": 0.34404377937316893,
      "memory(GiB)": 37.04,
      "step": 140,
      "token_acc": 0.8685788787483703,
      "train_speed(iter/s)": 0.247325
    },
    {
      "epoch": 1.4948453608247423,
      "grad_norm": 0.6706712637235819,
      "learning_rate": 8.41276571609327e-07,
      "loss": 0.34629359245300295,
      "memory(GiB)": 37.04,
      "step": 145,
      "token_acc": 0.8681879534334402,
      "train_speed(iter/s)": 0.248924
    },
    {
      "epoch": 1.5463917525773194,
      "grad_norm": 0.7453364306944348,
      "learning_rate": 8.286018394089863e-07,
      "loss": 0.35725903511047363,
      "memory(GiB)": 37.04,
      "step": 150,
      "token_acc": 0.8822486705495062,
      "train_speed(iter/s)": 0.250797
    },
    {
      "epoch": 1.5463917525773194,
      "eval_loss": 0.38744473457336426,
      "eval_runtime": 0.9547,
      "eval_samples_per_second": 15.712,
      "eval_steps_per_second": 2.095,
      "eval_token_acc": 0.8649188142410249,
      "step": 150
    },
    {
      "epoch": 1.597938144329897,
      "grad_norm": 0.6717542898329211,
      "learning_rate": 8.155439721630264e-07,
      "loss": 0.3512057065963745,
      "memory(GiB)": 37.04,
      "step": 155,
      "token_acc": 0.8510025260498895,
      "train_speed(iter/s)": 0.133083
    },
    {
      "epoch": 1.6494845360824741,
      "grad_norm": 0.6040559170916198,
      "learning_rate": 8.021181947605472e-07,
      "loss": 0.34543561935424805,
      "memory(GiB)": 37.23,
      "step": 160,
      "token_acc": 0.8643211778804999,
      "train_speed(iter/s)": 0.135409
    },
    {
      "epoch": 1.7010309278350515,
      "grad_norm": 0.6664553249603009,
      "learning_rate": 7.883401610574336e-07,
      "loss": 0.34488234519958494,
      "memory(GiB)": 37.23,
      "step": 165,
      "token_acc": 0.8770163214660686,
      "train_speed(iter/s)": 0.137732
    },
    {
      "epoch": 1.7525773195876289,
      "grad_norm": 0.8055880341832813,
      "learning_rate": 7.742259356246594e-07,
      "loss": 0.34050488471984863,
      "memory(GiB)": 37.23,
      "step": 170,
      "token_acc": 0.8739508700102354,
      "train_speed(iter/s)": 0.139961
    },
    {
      "epoch": 1.8041237113402062,
      "grad_norm": 0.7026822100715953,
      "learning_rate": 7.597919750177168e-07,
      "loss": 0.35553760528564454,
      "memory(GiB)": 37.23,
      "step": 175,
      "token_acc": 0.8798612963456922,
      "train_speed(iter/s)": 0.142407
    },
    {
      "epoch": 1.8556701030927836,
      "grad_norm": 0.6296090546523304,
      "learning_rate": 7.450551085890087e-07,
      "loss": 0.3472688913345337,
      "memory(GiB)": 37.23,
      "step": 180,
      "token_acc": 0.8816289399047467,
      "train_speed(iter/s)": 0.144588
    },
    {
      "epoch": 1.9072164948453607,
      "grad_norm": 0.7100452196024024,
      "learning_rate": 7.30032518865576e-07,
      "loss": 0.342409086227417,
      "memory(GiB)": 37.23,
      "step": 185,
      "token_acc": 0.8932488313556275,
      "train_speed(iter/s)": 0.146787
    },
    {
      "epoch": 1.9587628865979383,
      "grad_norm": 0.6751616783204145,
      "learning_rate": 7.14741721515041e-07,
      "loss": 0.3448726892471313,
      "memory(GiB)": 37.23,
      "step": 190,
      "token_acc": 0.87621425082759,
      "train_speed(iter/s)": 0.148964
    },
    {
      "epoch": 2.0103092783505154,
      "grad_norm": 0.683041065571367,
      "learning_rate": 6.992005449231207e-07,
      "loss": 0.3384934663772583,
      "memory(GiB)": 37.23,
      "step": 195,
      "token_acc": 0.8557946771857992,
      "train_speed(iter/s)": 0.15077
    },
    {
      "epoch": 2.0618556701030926,
      "grad_norm": 0.6516412497661351,
      "learning_rate": 6.834271094065282e-07,
      "loss": 0.34560070037841795,
      "memory(GiB)": 37.23,
      "step": 200,
      "token_acc": 0.8638397328881469,
      "train_speed(iter/s)": 0.152746
    },
    {
      "epoch": 2.0618556701030926,
      "eval_loss": 0.38463127613067627,
      "eval_runtime": 0.9665,
      "eval_samples_per_second": 15.52,
      "eval_steps_per_second": 2.069,
      "eval_token_acc": 0.865127364814539,
      "step": 200
    },
    {
      "epoch": 2.11340206185567,
      "grad_norm": 0.7045810824940504,
      "learning_rate": 6.67439806085493e-07,
      "loss": 0.33291826248168943,
      "memory(GiB)": 37.77,
      "step": 205,
      "token_acc": 0.8808652368055048,
      "train_speed(iter/s)": 0.103353
    },
    {
      "epoch": 2.1649484536082473,
      "grad_norm": 0.744342227263682,
      "learning_rate": 6.512572754405379e-07,
      "loss": 0.3307349681854248,
      "memory(GiB)": 37.77,
      "step": 210,
      "token_acc": 0.8755916159567275,
      "train_speed(iter/s)": 0.104962
    },
    {
      "epoch": 2.216494845360825,
      "grad_norm": 0.6323077432320735,
      "learning_rate": 6.348983855785121e-07,
      "loss": 0.3156585216522217,
      "memory(GiB)": 37.77,
      "step": 215,
      "token_acc": 0.8785982478097623,
      "train_speed(iter/s)": 0.106558
    },
    {
      "epoch": 2.268041237113402,
      "grad_norm": 0.6915848622598538,
      "learning_rate": 6.183822102332234e-07,
      "loss": 0.31838016510009765,
      "memory(GiB)": 37.77,
      "step": 220,
      "token_acc": 0.8778223495702006,
      "train_speed(iter/s)": 0.108175
    },
    {
      "epoch": 2.319587628865979,
      "grad_norm": 0.5308281735282047,
      "learning_rate": 6.01728006526317e-07,
      "loss": 0.3378974437713623,
      "memory(GiB)": 37.77,
      "step": 225,
      "token_acc": 0.871986195961386,
      "train_speed(iter/s)": 0.109623
    },
    {
      "epoch": 2.3711340206185567,
      "grad_norm": 0.6489926882668993,
      "learning_rate": 5.849551925143333e-07,
      "loss": 0.32499127388000487,
      "memory(GiB)": 37.77,
      "step": 230,
      "token_acc": 0.8704572098475967,
      "train_speed(iter/s)": 0.111249
    },
    {
      "epoch": 2.422680412371134,
      "grad_norm": 0.5977786295969187,
      "learning_rate": 5.680833245481234e-07,
      "loss": 0.3224591493606567,
      "memory(GiB)": 37.77,
      "step": 235,
      "token_acc": 0.8655716346377553,
      "train_speed(iter/s)": 0.112868
    },
    {
      "epoch": 2.4742268041237114,
      "grad_norm": 0.6356644743549772,
      "learning_rate": 5.51132074471017e-07,
      "loss": 0.3337638139724731,
      "memory(GiB)": 37.77,
      "step": 240,
      "token_acc": 0.8858976428802067,
      "train_speed(iter/s)": 0.114243
    },
    {
      "epoch": 2.5257731958762886,
      "grad_norm": 0.6955635518347525,
      "learning_rate": 5.341212066823355e-07,
      "loss": 0.34955778121948244,
      "memory(GiB)": 37.77,
      "step": 245,
      "token_acc": 0.875490146152671,
      "train_speed(iter/s)": 0.115731
    },
    {
      "epoch": 2.5773195876288657,
      "grad_norm": 0.6678415617901132,
      "learning_rate": 5.170705550929839e-07,
      "loss": 0.33729376792907717,
      "memory(GiB)": 37.77,
      "step": 250,
      "token_acc": 0.875747248449634,
      "train_speed(iter/s)": 0.1172
    },
    {
      "epoch": 2.5773195876288657,
      "eval_loss": 0.38304245471954346,
      "eval_runtime": 0.9688,
      "eval_samples_per_second": 15.482,
      "eval_steps_per_second": 2.064,
      "eval_token_acc": 0.8655742589006405,
      "step": 250
    },
    {
      "epoch": 2.6288659793814433,
      "grad_norm": 0.7100665104948269,
      "learning_rate": 5e-07,
      "loss": 0.3353480339050293,
      "memory(GiB)": 37.77,
      "step": 255,
      "token_acc": 0.877640879069947,
      "train_speed(iter/s)": 0.09114
    },
    {
      "epoch": 2.680412371134021,
      "grad_norm": 0.718021243798035,
      "learning_rate": 4.829294449070161e-07,
      "loss": 0.35503473281860354,
      "memory(GiB)": 37.77,
      "step": 260,
      "token_acc": 0.8858998533417138,
      "train_speed(iter/s)": 0.092445
    },
    {
      "epoch": 2.731958762886598,
      "grad_norm": 0.7452764083029736,
      "learning_rate": 4.6587879331766457e-07,
      "loss": 0.3445273876190186,
      "memory(GiB)": 37.77,
      "step": 265,
      "token_acc": 0.88517933679224,
      "train_speed(iter/s)": 0.093659
    },
    {
      "epoch": 2.783505154639175,
      "grad_norm": 0.7104117036082537,
      "learning_rate": 4.4886792552898283e-07,
      "loss": 0.33231029510498045,
      "memory(GiB)": 37.77,
      "step": 270,
      "token_acc": 0.8740161938735066,
      "train_speed(iter/s)": 0.094846
    },
    {
      "epoch": 2.8350515463917527,
      "grad_norm": 0.6272461565144517,
      "learning_rate": 4.3191667545187675e-07,
      "loss": 0.3508894920349121,
      "memory(GiB)": 37.77,
      "step": 275,
      "token_acc": 0.8654118758859662,
      "train_speed(iter/s)": 0.0961
    },
    {
      "epoch": 2.88659793814433,
      "grad_norm": 0.6361538625329014,
      "learning_rate": 4.150448074856667e-07,
      "loss": 0.3445762634277344,
      "memory(GiB)": 37.77,
      "step": 280,
      "token_acc": 0.8735829231875242,
      "train_speed(iter/s)": 0.097284
    },
    {
      "epoch": 2.9381443298969074,
      "grad_norm": 0.6794447157660283,
      "learning_rate": 3.9827199347368317e-07,
      "loss": 0.3385981321334839,
      "memory(GiB)": 37.77,
      "step": 285,
      "token_acc": 0.8693530615457264,
      "train_speed(iter/s)": 0.098521
    },
    {
      "epoch": 2.9896907216494846,
      "grad_norm": 0.5858240811392458,
      "learning_rate": 3.816177897667766e-07,
      "loss": 0.3372650623321533,
      "memory(GiB)": 37.77,
      "step": 290,
      "token_acc": 0.8661933904528764,
      "train_speed(iter/s)": 0.099708
    },
    {
      "epoch": 3.0412371134020617,
      "grad_norm": 0.6492571076999712,
      "learning_rate": 3.651016144214878e-07,
      "loss": 0.3159039974212646,
      "memory(GiB)": 37.77,
      "step": 295,
      "token_acc": 0.8897466689540802,
      "train_speed(iter/s)": 0.100872
    },
    {
      "epoch": 3.0927835051546393,
      "grad_norm": 0.5938135529575547,
      "learning_rate": 3.4874272455946216e-07,
      "loss": 0.3142085075378418,
      "memory(GiB)": 37.77,
      "step": 300,
      "token_acc": 0.898450120061122,
      "train_speed(iter/s)": 0.102031
    },
    {
      "epoch": 3.0927835051546393,
      "eval_loss": 0.3824613094329834,
      "eval_runtime": 0.9688,
      "eval_samples_per_second": 15.484,
      "eval_steps_per_second": 2.064,
      "eval_token_acc": 0.8658126024132281,
      "step": 300
    },
    {
      "epoch": 3.1443298969072164,
      "grad_norm": 0.6425324377834628,
      "learning_rate": 3.325601939145069e-07,
      "loss": 0.331402063369751,
      "memory(GiB)": 37.77,
      "step": 305,
      "token_acc": 0.8719620011787955,
      "train_speed(iter/s)": 0.084419
    },
    {
      "epoch": 3.195876288659794,
      "grad_norm": 0.6991897942082559,
      "learning_rate": 3.165728905934718e-07,
      "loss": 0.3265227794647217,
      "memory(GiB)": 37.77,
      "step": 310,
      "token_acc": 0.8806500761808025,
      "train_speed(iter/s)": 0.085442
    },
    {
      "epoch": 3.247422680412371,
      "grad_norm": 0.6735292235062921,
      "learning_rate": 3.007994550768793e-07,
      "loss": 0.3452199935913086,
      "memory(GiB)": 37.77,
      "step": 315,
      "token_acc": 0.8813902802737735,
      "train_speed(iter/s)": 0.086446
    },
    {
      "epoch": 3.2989690721649483,
      "grad_norm": 0.6808191589842796,
      "learning_rate": 2.852582784849591e-07,
      "loss": 0.32866055965423585,
      "memory(GiB)": 37.77,
      "step": 320,
      "token_acc": 0.882211663969091,
      "train_speed(iter/s)": 0.087454
    },
    {
      "epoch": 3.350515463917526,
      "grad_norm": 0.7081778457873291,
      "learning_rate": 2.699674811344239e-07,
      "loss": 0.32704184055328367,
      "memory(GiB)": 37.77,
      "step": 325,
      "token_acc": 0.8746191346739792,
      "train_speed(iter/s)": 0.08836
    },
    {
      "epoch": 3.402061855670103,
      "grad_norm": 0.6412966995516326,
      "learning_rate": 2.549448914109915e-07,
      "loss": 0.3270967960357666,
      "memory(GiB)": 37.77,
      "step": 330,
      "token_acc": 0.8627403990174346,
      "train_speed(iter/s)": 0.08934
    },
    {
      "epoch": 3.4536082474226806,
      "grad_norm": 0.5539028709876479,
      "learning_rate": 2.4020802498228334e-07,
      "loss": 0.3313950538635254,
      "memory(GiB)": 37.77,
      "step": 335,
      "token_acc": 0.8844105805446123,
      "train_speed(iter/s)": 0.090289
    },
    {
      "epoch": 3.5051546391752577,
      "grad_norm": 0.691681066452575,
      "learning_rate": 2.257740643753405e-07,
      "loss": 0.32365880012512205,
      "memory(GiB)": 37.77,
      "step": 340,
      "token_acc": 0.8830835117773019,
      "train_speed(iter/s)": 0.091196
    },
    {
      "epoch": 3.556701030927835,
      "grad_norm": 0.6193458212688764,
      "learning_rate": 2.1165983894256646e-07,
      "loss": 0.3258372783660889,
      "memory(GiB)": 37.77,
      "step": 345,
      "token_acc": 0.9008801723823089,
      "train_speed(iter/s)": 0.092053
    },
    {
      "epoch": 3.6082474226804124,
      "grad_norm": 0.6415244001554736,
      "learning_rate": 1.9788180523945275e-07,
      "loss": 0.33638935089111327,
      "memory(GiB)": 37.77,
      "step": 350,
      "token_acc": 0.8890733326823552,
      "train_speed(iter/s)": 0.093029
    },
    {
      "epoch": 3.6082474226804124,
      "eval_loss": 0.3821544945240021,
      "eval_runtime": 0.9819,
      "eval_samples_per_second": 15.277,
      "eval_steps_per_second": 2.037,
      "eval_token_acc": 0.8655742589006405,
      "step": 350
    },
    {
      "epoch": 3.6597938144329896,
      "grad_norm": 0.648833766542735,
      "learning_rate": 1.8445602783697373e-07,
      "loss": 0.31353211402893066,
      "memory(GiB)": 37.77,
      "step": 355,
      "token_acc": 0.8827717297390704,
      "train_speed(iter/s)": 0.080129
    },
    {
      "epoch": 3.711340206185567,
      "grad_norm": 0.6717087103762708,
      "learning_rate": 1.713981605910137e-07,
      "loss": 0.31485791206359864,
      "memory(GiB)": 37.77,
      "step": 360,
      "token_acc": 0.8871092587079488,
      "train_speed(iter/s)": 0.081028
    },
    {
      "epoch": 3.7628865979381443,
      "grad_norm": 0.6281626069775872,
      "learning_rate": 1.5872342839067304e-07,
      "loss": 0.34305672645568847,
      "memory(GiB)": 37.77,
      "step": 365,
      "token_acc": 0.874572343008498,
      "train_speed(iter/s)": 0.081796
    },
    {
      "epoch": 3.8144329896907214,
      "grad_norm": 0.6620673803452289,
      "learning_rate": 1.4644660940672627e-07,
      "loss": 0.33494589328765867,
      "memory(GiB)": 37.77,
      "step": 370,
      "token_acc": 0.889901752253621,
      "train_speed(iter/s)": 0.082621
    },
    {
      "epoch": 3.865979381443299,
      "grad_norm": 0.693656406725637,
      "learning_rate": 1.3458201786093794e-07,
      "loss": 0.3408153533935547,
      "memory(GiB)": 37.77,
      "step": 375,
      "token_acc": 0.876736032056828,
      "train_speed(iter/s)": 0.083425
    },
    {
      "epoch": 3.917525773195876,
      "grad_norm": 0.6371484991381379,
      "learning_rate": 1.2314348733631957e-07,
      "loss": 0.3388832092285156,
      "memory(GiB)": 37.77,
      "step": 380,
      "token_acc": 0.8669541981154189,
      "train_speed(iter/s)": 0.084189
    },
    {
      "epoch": 3.9690721649484537,
      "grad_norm": 0.6261422020208081,
      "learning_rate": 1.1214435464779003e-07,
      "loss": 0.32546391487121584,
      "memory(GiB)": 37.77,
      "step": 385,
      "token_acc": 0.877057210031348,
      "train_speed(iter/s)": 0.085015
    },
    {
      "epoch": 4.020618556701031,
      "grad_norm": 0.6597131788548194,
      "learning_rate": 1.0159744429204775e-07,
      "loss": 0.3102993965148926,
      "memory(GiB)": 37.77,
      "step": 390,
      "token_acc": 0.8759420669768662,
      "train_speed(iter/s)": 0.085822
    },
    {
      "epoch": 4.072164948453608,
      "grad_norm": 0.6279013012460978,
      "learning_rate": 9.1515053494779e-08,
      "loss": 0.33462517261505126,
      "memory(GiB)": 37.77,
      "step": 395,
      "token_acc": 0.8786680275580505,
      "train_speed(iter/s)": 0.086608
    },
    {
      "epoch": 4.123711340206185,
      "grad_norm": 0.7265548614470879,
      "learning_rate": 8.190893787264469e-08,
      "loss": 0.3322861433029175,
      "memory(GiB)": 37.77,
      "step": 400,
      "token_acc": 0.8789391575663027,
      "train_speed(iter/s)": 0.087381
    },
    {
      "epoch": 4.123711340206185,
      "eval_loss": 0.38200807571411133,
      "eval_runtime": 0.9767,
      "eval_samples_per_second": 15.358,
      "eval_steps_per_second": 2.048,
      "eval_token_acc": 0.8655444659615671,
      "step": 400
    }
  ],
  "logging_steps": 5,
  "max_steps": 485,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 29414501638144.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}