| { | |
| "best_global_step": 100, | |
| "best_metric": 2.52767062, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 100, | |
| "global_step": 521, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0019193857965451055, | |
| "grad_norm": 50.499073662885124, | |
| "learning_rate": 3.773584905660378e-07, | |
| "loss": 1.6497690677642822, | |
| "memory(GiB)": 5.28, | |
| "step": 1, | |
| "token_acc": 0.5833333333333334, | |
| "train_speed(iter/s)": 0.09523 | |
| }, | |
| { | |
| "epoch": 0.009596928982725527, | |
| "grad_norm": 62.15822717412118, | |
| "learning_rate": 1.8867924528301889e-06, | |
| "loss": 1.7728731632232666, | |
| "memory(GiB)": 8.51, | |
| "step": 5, | |
| "token_acc": 0.4583333333333333, | |
| "train_speed(iter/s)": 0.298833 | |
| }, | |
| { | |
| "epoch": 0.019193857965451054, | |
| "grad_norm": 50.6949450155208, | |
| "learning_rate": 3.7735849056603777e-06, | |
| "loss": 1.5503947257995605, | |
| "memory(GiB)": 8.51, | |
| "step": 10, | |
| "token_acc": 0.5833333333333334, | |
| "train_speed(iter/s)": 0.486352 | |
| }, | |
| { | |
| "epoch": 0.028790786948176585, | |
| "grad_norm": 14.839276462104923, | |
| "learning_rate": 5.660377358490566e-06, | |
| "loss": 1.230274486541748, | |
| "memory(GiB)": 8.51, | |
| "step": 15, | |
| "token_acc": 0.6666666666666666, | |
| "train_speed(iter/s)": 0.629079 | |
| }, | |
| { | |
| "epoch": 0.03838771593090211, | |
| "grad_norm": 13.392088231370584, | |
| "learning_rate": 7.5471698113207555e-06, | |
| "loss": 1.1847952842712401, | |
| "memory(GiB)": 8.51, | |
| "step": 20, | |
| "token_acc": 0.6166666666666667, | |
| "train_speed(iter/s)": 0.726565 | |
| }, | |
| { | |
| "epoch": 0.04798464491362764, | |
| "grad_norm": 10.717873377537414, | |
| "learning_rate": 9.433962264150944e-06, | |
| "loss": 1.0945304870605468, | |
| "memory(GiB)": 8.51, | |
| "step": 25, | |
| "token_acc": 0.6833333333333333, | |
| "train_speed(iter/s)": 0.810848 | |
| }, | |
| { | |
| "epoch": 0.05758157389635317, | |
| "grad_norm": 8.889002212294471, | |
| "learning_rate": 1.1320754716981132e-05, | |
| "loss": 0.9990409851074219, | |
| "memory(GiB)": 8.51, | |
| "step": 30, | |
| "token_acc": 0.6833333333333333, | |
| "train_speed(iter/s)": 0.869855 | |
| }, | |
| { | |
| "epoch": 0.0671785028790787, | |
| "grad_norm": 8.012070786994876, | |
| "learning_rate": 1.320754716981132e-05, | |
| "loss": 0.9144926071166992, | |
| "memory(GiB)": 8.51, | |
| "step": 35, | |
| "token_acc": 0.7333333333333333, | |
| "train_speed(iter/s)": 0.928161 | |
| }, | |
| { | |
| "epoch": 0.07677543186180422, | |
| "grad_norm": 7.202447694086144, | |
| "learning_rate": 1.5094339622641511e-05, | |
| "loss": 0.9043998718261719, | |
| "memory(GiB)": 8.51, | |
| "step": 40, | |
| "token_acc": 0.7, | |
| "train_speed(iter/s)": 0.97586 | |
| }, | |
| { | |
| "epoch": 0.08637236084452975, | |
| "grad_norm": 8.57492132801927, | |
| "learning_rate": 1.69811320754717e-05, | |
| "loss": 0.8435896873474121, | |
| "memory(GiB)": 8.51, | |
| "step": 45, | |
| "token_acc": 0.75, | |
| "train_speed(iter/s)": 1.018993 | |
| }, | |
| { | |
| "epoch": 0.09596928982725528, | |
| "grad_norm": 8.638236742778556, | |
| "learning_rate": 1.8867924528301888e-05, | |
| "loss": 0.8246500015258789, | |
| "memory(GiB)": 8.51, | |
| "step": 50, | |
| "token_acc": 0.7833333333333333, | |
| "train_speed(iter/s)": 1.055415 | |
| }, | |
| { | |
| "epoch": 0.10556621880998081, | |
| "grad_norm": 13.004082007332865, | |
| "learning_rate": 1.999909877856721e-05, | |
| "loss": 0.7747729301452637, | |
| "memory(GiB)": 8.51, | |
| "step": 55, | |
| "token_acc": 0.7833333333333333, | |
| "train_speed(iter/s)": 1.085874 | |
| }, | |
| { | |
| "epoch": 0.11516314779270634, | |
| "grad_norm": 6.247101033734483, | |
| "learning_rate": 1.9988961902877895e-05, | |
| "loss": 0.7058096885681152, | |
| "memory(GiB)": 8.51, | |
| "step": 60, | |
| "token_acc": 0.8333333333333334, | |
| "train_speed(iter/s)": 1.11476 | |
| }, | |
| { | |
| "epoch": 0.12476007677543186, | |
| "grad_norm": 9.747466310041144, | |
| "learning_rate": 1.9967573081342103e-05, | |
| "loss": 0.7115418434143066, | |
| "memory(GiB)": 8.51, | |
| "step": 65, | |
| "token_acc": 0.8833333333333333, | |
| "train_speed(iter/s)": 1.139879 | |
| }, | |
| { | |
| "epoch": 0.1343570057581574, | |
| "grad_norm": 7.98629623252791, | |
| "learning_rate": 1.9934956407140285e-05, | |
| "loss": 0.6644338130950928, | |
| "memory(GiB)": 8.51, | |
| "step": 70, | |
| "token_acc": 0.85, | |
| "train_speed(iter/s)": 1.16312 | |
| }, | |
| { | |
| "epoch": 0.14395393474088292, | |
| "grad_norm": 8.205621057385066, | |
| "learning_rate": 1.989114862093232e-05, | |
| "loss": 0.6379920959472656, | |
| "memory(GiB)": 8.51, | |
| "step": 75, | |
| "token_acc": 0.8833333333333333, | |
| "train_speed(iter/s)": 1.184238 | |
| }, | |
| { | |
| "epoch": 0.15355086372360843, | |
| "grad_norm": 6.0726795344748306, | |
| "learning_rate": 1.983619906947144e-05, | |
| "loss": 0.5962705135345459, | |
| "memory(GiB)": 8.51, | |
| "step": 80, | |
| "token_acc": 0.85, | |
| "train_speed(iter/s)": 1.203586 | |
| }, | |
| { | |
| "epoch": 0.16314779270633398, | |
| "grad_norm": 7.461834141040474, | |
| "learning_rate": 1.977016965001817e-05, | |
| "loss": 0.6223911285400391, | |
| "memory(GiB)": 8.51, | |
| "step": 85, | |
| "token_acc": 0.8, | |
| "train_speed(iter/s)": 1.220982 | |
| }, | |
| { | |
| "epoch": 0.1727447216890595, | |
| "grad_norm": 8.225152822596419, | |
| "learning_rate": 1.9693134740616924e-05, | |
| "loss": 0.6050288677215576, | |
| "memory(GiB)": 8.51, | |
| "step": 90, | |
| "token_acc": 0.8166666666666667, | |
| "train_speed(iter/s)": 1.234884 | |
| }, | |
| { | |
| "epoch": 0.18234165067178504, | |
| "grad_norm": 8.58498910714931, | |
| "learning_rate": 1.9605181116313725e-05, | |
| "loss": 0.6168498516082763, | |
| "memory(GiB)": 8.51, | |
| "step": 95, | |
| "token_acc": 0.7666666666666667, | |
| "train_speed(iter/s)": 1.249099 | |
| }, | |
| { | |
| "epoch": 0.19193857965451055, | |
| "grad_norm": 5.4177042583550525, | |
| "learning_rate": 1.950640785140951e-05, | |
| "loss": 0.5574191093444825, | |
| "memory(GiB)": 8.51, | |
| "step": 100, | |
| "token_acc": 0.8166666666666667, | |
| "train_speed(iter/s)": 1.262519 | |
| }, | |
| { | |
| "epoch": 0.19193857965451055, | |
| "eval_loss": 2.5276706218719482, | |
| "eval_runtime": 8.6199, | |
| "eval_samples_per_second": 118.796, | |
| "eval_steps_per_second": 2.552, | |
| "eval_token_acc": 0.7803310391900347, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.20153550863723607, | |
| "grad_norm": 6.179146882493892, | |
| "learning_rate": 1.9396926207859085e-05, | |
| "loss": 0.5534902572631836, | |
| "memory(GiB)": 14.21, | |
| "step": 105, | |
| "token_acc": 0.7369402985074627, | |
| "train_speed(iter/s)": 1.143897 | |
| }, | |
| { | |
| "epoch": 0.21113243761996162, | |
| "grad_norm": 5.641170797612031, | |
| "learning_rate": 1.927685950994143e-05, | |
| "loss": 0.5470232963562012, | |
| "memory(GiB)": 14.21, | |
| "step": 110, | |
| "token_acc": 0.75, | |
| "train_speed(iter/s)": 1.158894 | |
| }, | |
| { | |
| "epoch": 0.22072936660268713, | |
| "grad_norm": 7.710301646225508, | |
| "learning_rate": 1.9146343005342546e-05, | |
| "loss": 0.5334534645080566, | |
| "memory(GiB)": 14.21, | |
| "step": 115, | |
| "token_acc": 0.8135593220338984, | |
| "train_speed(iter/s)": 1.172 | |
| }, | |
| { | |
| "epoch": 0.23032629558541268, | |
| "grad_norm": 6.26207731899911, | |
| "learning_rate": 1.9005523712807335e-05, | |
| "loss": 0.5158659934997558, | |
| "memory(GiB)": 14.21, | |
| "step": 120, | |
| "token_acc": 0.8333333333333334, | |
| "train_speed(iter/s)": 1.184167 | |
| }, | |
| { | |
| "epoch": 0.2399232245681382, | |
| "grad_norm": 6.172362578282768, | |
| "learning_rate": 1.8854560256532098e-05, | |
| "loss": 0.5012799263000488, | |
| "memory(GiB)": 14.21, | |
| "step": 125, | |
| "token_acc": 0.8666666666666667, | |
| "train_speed(iter/s)": 1.19569 | |
| }, | |
| { | |
| "epoch": 0.2495201535508637, | |
| "grad_norm": 6.712616277483349, | |
| "learning_rate": 1.869362268748423e-05, | |
| "loss": 0.5169489860534668, | |
| "memory(GiB)": 14.21, | |
| "step": 130, | |
| "token_acc": 0.8333333333333334, | |
| "train_speed(iter/s)": 1.207346 | |
| }, | |
| { | |
| "epoch": 0.2591170825335892, | |
| "grad_norm": 7.148207284239342, | |
| "learning_rate": 1.8522892291850335e-05, | |
| "loss": 0.4680202007293701, | |
| "memory(GiB)": 14.21, | |
| "step": 135, | |
| "token_acc": 0.8666666666666667, | |
| "train_speed(iter/s)": 1.218262 | |
| }, | |
| { | |
| "epoch": 0.2687140115163148, | |
| "grad_norm": 5.828450923300192, | |
| "learning_rate": 1.8342561386828613e-05, | |
| "loss": 0.4936178207397461, | |
| "memory(GiB)": 14.21, | |
| "step": 140, | |
| "token_acc": 0.8333333333333334, | |
| "train_speed(iter/s)": 1.226985 | |
| }, | |
| { | |
| "epoch": 0.2783109404990403, | |
| "grad_norm": 6.652489935810153, | |
| "learning_rate": 1.8152833103995443e-05, | |
| "loss": 0.4828921318054199, | |
| "memory(GiB)": 14.21, | |
| "step": 145, | |
| "token_acc": 0.8166666666666667, | |
| "train_speed(iter/s)": 1.235963 | |
| }, | |
| { | |
| "epoch": 0.28790786948176583, | |
| "grad_norm": 7.187378559413358, | |
| "learning_rate": 1.795392116049028e-05, | |
| "loss": 0.4845115661621094, | |
| "memory(GiB)": 14.21, | |
| "step": 150, | |
| "token_acc": 0.8166666666666667, | |
| "train_speed(iter/s)": 1.244382 | |
| }, | |
| { | |
| "epoch": 0.29750479846449135, | |
| "grad_norm": 7.907765460694232, | |
| "learning_rate": 1.7746049618276545e-05, | |
| "loss": 0.46329803466796876, | |
| "memory(GiB)": 14.21, | |
| "step": 155, | |
| "token_acc": 0.8666666666666667, | |
| "train_speed(iter/s)": 1.252842 | |
| }, | |
| { | |
| "epoch": 0.30710172744721687, | |
| "grad_norm": 7.811457813376289, | |
| "learning_rate": 1.7529452631749743e-05, | |
| "loss": 0.44974498748779296, | |
| "memory(GiB)": 14.21, | |
| "step": 160, | |
| "token_acc": 0.9491525423728814, | |
| "train_speed(iter/s)": 1.261036 | |
| }, | |
| { | |
| "epoch": 0.31669865642994244, | |
| "grad_norm": 7.0025117295075185, | |
| "learning_rate": 1.7304374183977032e-05, | |
| "loss": 0.44871058464050295, | |
| "memory(GiB)": 14.21, | |
| "step": 165, | |
| "token_acc": 0.8333333333333334, | |
| "train_speed(iter/s)": 1.269117 | |
| }, | |
| { | |
| "epoch": 0.32629558541266795, | |
| "grad_norm": 8.097741811820914, | |
| "learning_rate": 1.7071067811865477e-05, | |
| "loss": 0.41682958602905273, | |
| "memory(GiB)": 14.21, | |
| "step": 170, | |
| "token_acc": 0.8333333333333334, | |
| "train_speed(iter/s)": 1.276776 | |
| }, | |
| { | |
| "epoch": 0.33589251439539347, | |
| "grad_norm": 6.662982343123765, | |
| "learning_rate": 1.6829796320568416e-05, | |
| "loss": 0.4470475196838379, | |
| "memory(GiB)": 14.21, | |
| "step": 175, | |
| "token_acc": 0.7627118644067796, | |
| "train_speed(iter/s)": 1.284139 | |
| }, | |
| { | |
| "epoch": 0.345489443378119, | |
| "grad_norm": 5.604840370084793, | |
| "learning_rate": 1.6580831487451788e-05, | |
| "loss": 0.4485145568847656, | |
| "memory(GiB)": 14.21, | |
| "step": 180, | |
| "token_acc": 0.8, | |
| "train_speed(iter/s)": 1.291145 | |
| }, | |
| { | |
| "epoch": 0.3550863723608445, | |
| "grad_norm": 7.227414415218275, | |
| "learning_rate": 1.6324453755953772e-05, | |
| "loss": 0.4181208610534668, | |
| "memory(GiB)": 14.21, | |
| "step": 185, | |
| "token_acc": 0.85, | |
| "train_speed(iter/s)": 1.297846 | |
| }, | |
| { | |
| "epoch": 0.3646833013435701, | |
| "grad_norm": 6.576718679657512, | |
| "learning_rate": 1.6060951919682665e-05, | |
| "loss": 0.4237715721130371, | |
| "memory(GiB)": 14.21, | |
| "step": 190, | |
| "token_acc": 0.8333333333333334, | |
| "train_speed(iter/s)": 1.304264 | |
| }, | |
| { | |
| "epoch": 0.3742802303262956, | |
| "grad_norm": 5.849761166650766, | |
| "learning_rate": 1.579062279710879e-05, | |
| "loss": 0.4109220504760742, | |
| "memory(GiB)": 14.21, | |
| "step": 195, | |
| "token_acc": 0.8, | |
| "train_speed(iter/s)": 1.310403 | |
| }, | |
| { | |
| "epoch": 0.3838771593090211, | |
| "grad_norm": 5.198284842400891, | |
| "learning_rate": 1.551377089721692e-05, | |
| "loss": 0.43619818687438966, | |
| "memory(GiB)": 14.21, | |
| "step": 200, | |
| "token_acc": 0.8333333333333334, | |
| "train_speed(iter/s)": 1.316343 | |
| }, | |
| { | |
| "epoch": 0.3838771593090211, | |
| "eval_loss": 2.959151029586792, | |
| "eval_runtime": 5.2772, | |
| "eval_samples_per_second": 194.044, | |
| "eval_steps_per_second": 4.169, | |
| "eval_token_acc": 0.7764063477617883, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.3934740882917466, | |
| "grad_norm": 5.869635779885074, | |
| "learning_rate": 1.5230708076495777e-05, | |
| "loss": 0.3859025716781616, | |
| "memory(GiB)": 14.37, | |
| "step": 205, | |
| "token_acc": 0.7406716417910447, | |
| "train_speed(iter/s)": 1.273743 | |
| }, | |
| { | |
| "epoch": 0.40307101727447214, | |
| "grad_norm": 7.585471915910738, | |
| "learning_rate": 1.494175318765107e-05, | |
| "loss": 0.3978905439376831, | |
| "memory(GiB)": 14.37, | |
| "step": 210, | |
| "token_acc": 0.85, | |
| "train_speed(iter/s)": 1.279651 | |
| }, | |
| { | |
| "epoch": 0.4126679462571977, | |
| "grad_norm": 5.581100758479657, | |
| "learning_rate": 1.4647231720437687e-05, | |
| "loss": 0.38069303035736085, | |
| "memory(GiB)": 14.37, | |
| "step": 215, | |
| "token_acc": 0.9333333333333333, | |
| "train_speed(iter/s)": 1.283121 | |
| }, | |
| { | |
| "epoch": 0.42226487523992323, | |
| "grad_norm": 4.527918410267229, | |
| "learning_rate": 1.4347475435015686e-05, | |
| "loss": 0.37502858638763426, | |
| "memory(GiB)": 14.37, | |
| "step": 220, | |
| "token_acc": 0.8166666666666667, | |
| "train_speed(iter/s)": 1.288521 | |
| }, | |
| { | |
| "epoch": 0.43186180422264875, | |
| "grad_norm": 7.683117652206841, | |
| "learning_rate": 1.404282198824305e-05, | |
| "loss": 0.376017165184021, | |
| "memory(GiB)": 14.37, | |
| "step": 225, | |
| "token_acc": 0.8983050847457628, | |
| "train_speed(iter/s)": 1.243016 | |
| }, | |
| { | |
| "epoch": 0.44145873320537427, | |
| "grad_norm": 6.479535475022426, | |
| "learning_rate": 1.3733614553326211e-05, | |
| "loss": 0.38472347259521483, | |
| "memory(GiB)": 14.37, | |
| "step": 230, | |
| "token_acc": 0.8, | |
| "train_speed(iter/s)": 1.249779 | |
| }, | |
| { | |
| "epoch": 0.4510556621880998, | |
| "grad_norm": 4.918112500502897, | |
| "learning_rate": 1.342020143325669e-05, | |
| "loss": 0.36906707286834717, | |
| "memory(GiB)": 14.37, | |
| "step": 235, | |
| "token_acc": 0.8813559322033898, | |
| "train_speed(iter/s)": 1.256024 | |
| }, | |
| { | |
| "epoch": 0.46065259117082535, | |
| "grad_norm": 7.155983951765217, | |
| "learning_rate": 1.3102935668469403e-05, | |
| "loss": 0.3751584768295288, | |
| "memory(GiB)": 14.37, | |
| "step": 240, | |
| "token_acc": 0.9, | |
| "train_speed(iter/s)": 1.262154 | |
| }, | |
| { | |
| "epoch": 0.47024952015355087, | |
| "grad_norm": 6.331174285818235, | |
| "learning_rate": 1.2782174639164528e-05, | |
| "loss": 0.33184859752655027, | |
| "memory(GiB)": 14.37, | |
| "step": 245, | |
| "token_acc": 0.8666666666666667, | |
| "train_speed(iter/s)": 1.268047 | |
| }, | |
| { | |
| "epoch": 0.4798464491362764, | |
| "grad_norm": 6.272564933268104, | |
| "learning_rate": 1.2458279662740853e-05, | |
| "loss": 0.3417738676071167, | |
| "memory(GiB)": 14.37, | |
| "step": 250, | |
| "token_acc": 0.9322033898305084, | |
| "train_speed(iter/s)": 1.27312 | |
| }, | |
| { | |
| "epoch": 0.4894433781190019, | |
| "grad_norm": 5.135899432601579, | |
| "learning_rate": 1.2131615586794162e-05, | |
| "loss": 0.3375750303268433, | |
| "memory(GiB)": 14.37, | |
| "step": 255, | |
| "token_acc": 0.8666666666666667, | |
| "train_speed(iter/s)": 1.27874 | |
| }, | |
| { | |
| "epoch": 0.4990403071017274, | |
| "grad_norm": 4.932904357562163, | |
| "learning_rate": 1.180255037813906e-05, | |
| "loss": 0.33879258632659914, | |
| "memory(GiB)": 14.37, | |
| "step": 260, | |
| "token_acc": 0.85, | |
| "train_speed(iter/s)": 1.284461 | |
| }, | |
| { | |
| "epoch": 0.508637236084453, | |
| "grad_norm": 6.284886551479371, | |
| "learning_rate": 1.1471454708317163e-05, | |
| "loss": 0.3437638759613037, | |
| "memory(GiB)": 14.37, | |
| "step": 265, | |
| "token_acc": 0.8166666666666667, | |
| "train_speed(iter/s)": 1.289794 | |
| }, | |
| { | |
| "epoch": 0.5182341650671785, | |
| "grad_norm": 5.507510219698484, | |
| "learning_rate": 1.1138701536058624e-05, | |
| "loss": 0.3238619327545166, | |
| "memory(GiB)": 14.37, | |
| "step": 270, | |
| "token_acc": 0.9666666666666667, | |
| "train_speed(iter/s)": 1.294939 | |
| }, | |
| { | |
| "epoch": 0.527831094049904, | |
| "grad_norm": 5.817219043282623, | |
| "learning_rate": 1.0804665687167262e-05, | |
| "loss": 0.34387760162353515, | |
| "memory(GiB)": 14.37, | |
| "step": 275, | |
| "token_acc": 0.9152542372881356, | |
| "train_speed(iter/s)": 1.299594 | |
| }, | |
| { | |
| "epoch": 0.5374280230326296, | |
| "grad_norm": 6.342634523270455, | |
| "learning_rate": 1.0469723432302528e-05, | |
| "loss": 0.32937374114990237, | |
| "memory(GiB)": 14.37, | |
| "step": 280, | |
| "token_acc": 0.9833333333333333, | |
| "train_speed(iter/s)": 1.304474 | |
| }, | |
| { | |
| "epoch": 0.5470249520153551, | |
| "grad_norm": 5.687502098500072, | |
| "learning_rate": 1.0134252063133976e-05, | |
| "loss": 0.31735076904296877, | |
| "memory(GiB)": 14.37, | |
| "step": 285, | |
| "token_acc": 0.9322033898305084, | |
| "train_speed(iter/s)": 1.309045 | |
| }, | |
| { | |
| "epoch": 0.5566218809980806, | |
| "grad_norm": 7.638698661734066, | |
| "learning_rate": 9.7986294673456e-06, | |
| "loss": 0.32815046310424806, | |
| "memory(GiB)": 14.37, | |
| "step": 290, | |
| "token_acc": 0.9333333333333333, | |
| "train_speed(iter/s)": 1.312928 | |
| }, | |
| { | |
| "epoch": 0.5662188099808061, | |
| "grad_norm": 5.185796628229698, | |
| "learning_rate": 9.463233702968784e-06, | |
| "loss": 0.28549041748046877, | |
| "memory(GiB)": 14.37, | |
| "step": 295, | |
| "token_acc": 0.8833333333333333, | |
| "train_speed(iter/s)": 1.316799 | |
| }, | |
| { | |
| "epoch": 0.5758157389635317, | |
| "grad_norm": 6.568762262669577, | |
| "learning_rate": 9.128442572523418e-06, | |
| "loss": 0.30935580730438234, | |
| "memory(GiB)": 14.37, | |
| "step": 300, | |
| "token_acc": 0.9166666666666666, | |
| "train_speed(iter/s)": 1.320695 | |
| }, | |
| { | |
| "epoch": 0.5758157389635317, | |
| "eval_loss": 3.3033859729766846, | |
| "eval_runtime": 5.4365, | |
| "eval_samples_per_second": 188.356, | |
| "eval_steps_per_second": 4.047, | |
| "eval_token_acc": 0.7782833740970366, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.5854126679462572, | |
| "grad_norm": 4.913910457218756, | |
| "learning_rate": 8.79463319744677e-06, | |
| "loss": 0.29166135787963865, | |
| "memory(GiB)": 14.64, | |
| "step": 305, | |
| "token_acc": 0.746268656716418, | |
| "train_speed(iter/s)": 1.290093 | |
| }, | |
| { | |
| "epoch": 0.5950095969289827, | |
| "grad_norm": 5.315946085188087, | |
| "learning_rate": 8.462181593289596e-06, | |
| "loss": 0.2921705961227417, | |
| "memory(GiB)": 14.64, | |
| "step": 310, | |
| "token_acc": 0.9, | |
| "train_speed(iter/s)": 1.293804 | |
| }, | |
| { | |
| "epoch": 0.6046065259117083, | |
| "grad_norm": 6.632601739948281, | |
| "learning_rate": 8.131462246157953e-06, | |
| "loss": 0.27473609447479247, | |
| "memory(GiB)": 14.64, | |
| "step": 315, | |
| "token_acc": 0.8666666666666667, | |
| "train_speed(iter/s)": 1.297418 | |
| }, | |
| { | |
| "epoch": 0.6142034548944337, | |
| "grad_norm": 4.731443243675736, | |
| "learning_rate": 7.802847690877832e-06, | |
| "loss": 0.285859489440918, | |
| "memory(GiB)": 14.64, | |
| "step": 320, | |
| "token_acc": 0.9, | |
| "train_speed(iter/s)": 1.30087 | |
| }, | |
| { | |
| "epoch": 0.6238003838771593, | |
| "grad_norm": 4.810635538915133, | |
| "learning_rate": 7.476708091357783e-06, | |
| "loss": 0.28462786674499513, | |
| "memory(GiB)": 14.64, | |
| "step": 325, | |
| "token_acc": 0.9, | |
| "train_speed(iter/s)": 1.30423 | |
| }, | |
| { | |
| "epoch": 0.6333973128598849, | |
| "grad_norm": 5.436375373696315, | |
| "learning_rate": 7.153410823622253e-06, | |
| "loss": 0.27665414810180666, | |
| "memory(GiB)": 14.64, | |
| "step": 330, | |
| "token_acc": 0.85, | |
| "train_speed(iter/s)": 1.307088 | |
| }, | |
| { | |
| "epoch": 0.6429942418426103, | |
| "grad_norm": 5.703330211485846, | |
| "learning_rate": 6.833320061985278e-06, | |
| "loss": 0.26464405059814455, | |
| "memory(GiB)": 14.64, | |
| "step": 335, | |
| "token_acc": 0.8833333333333333, | |
| "train_speed(iter/s)": 1.309735 | |
| }, | |
| { | |
| "epoch": 0.6525911708253359, | |
| "grad_norm": 5.558113116379478, | |
| "learning_rate": 6.5167963688306975e-06, | |
| "loss": 0.26451945304870605, | |
| "memory(GiB)": 14.64, | |
| "step": 340, | |
| "token_acc": 0.95, | |
| "train_speed(iter/s)": 1.31332 | |
| }, | |
| { | |
| "epoch": 0.6621880998080614, | |
| "grad_norm": 6.105803216866346, | |
| "learning_rate": 6.204196288461037e-06, | |
| "loss": 0.258668041229248, | |
| "memory(GiB)": 14.64, | |
| "step": 345, | |
| "token_acc": 0.9166666666666666, | |
| "train_speed(iter/s)": 1.316835 | |
| }, | |
| { | |
| "epoch": 0.6717850287907869, | |
| "grad_norm": 5.492680478610087, | |
| "learning_rate": 5.895871945472434e-06, | |
| "loss": 0.2474271535873413, | |
| "memory(GiB)": 14.64, | |
| "step": 350, | |
| "token_acc": 0.95, | |
| "train_speed(iter/s)": 1.320051 | |
| }, | |
| { | |
| "epoch": 0.6813819577735125, | |
| "grad_norm": 5.535798325855053, | |
| "learning_rate": 5.5921706481081405e-06, | |
| "loss": 0.2485145092010498, | |
| "memory(GiB)": 14.64, | |
| "step": 355, | |
| "token_acc": 0.9, | |
| "train_speed(iter/s)": 1.323103 | |
| }, | |
| { | |
| "epoch": 0.690978886756238, | |
| "grad_norm": 5.2845851547926905, | |
| "learning_rate": 5.293434497037355e-06, | |
| "loss": 0.22786808013916016, | |
| "memory(GiB)": 14.64, | |
| "step": 360, | |
| "token_acc": 0.9661016949152542, | |
| "train_speed(iter/s)": 1.324048 | |
| }, | |
| { | |
| "epoch": 0.7005758157389635, | |
| "grad_norm": 5.954809588087527, | |
| "learning_rate": 5.000000000000003e-06, | |
| "loss": 0.22964661121368407, | |
| "memory(GiB)": 14.64, | |
| "step": 365, | |
| "token_acc": 0.9333333333333333, | |
| "train_speed(iter/s)": 1.327039 | |
| }, | |
| { | |
| "epoch": 0.710172744721689, | |
| "grad_norm": 4.659945233935531, | |
| "learning_rate": 4.712197692751673e-06, | |
| "loss": 0.23819453716278077, | |
| "memory(GiB)": 14.64, | |
| "step": 370, | |
| "token_acc": 0.9333333333333333, | |
| "train_speed(iter/s)": 1.329992 | |
| }, | |
| { | |
| "epoch": 0.7197696737044146, | |
| "grad_norm": 4.743878549379203, | |
| "learning_rate": 4.430351766735609e-06, | |
| "loss": 0.22235493659973143, | |
| "memory(GiB)": 14.64, | |
| "step": 375, | |
| "token_acc": 0.9491525423728814, | |
| "train_speed(iter/s)": 1.332906 | |
| }, | |
| { | |
| "epoch": 0.7293666026871402, | |
| "grad_norm": 4.893844108568125, | |
| "learning_rate": 4.154779703901114e-06, | |
| "loss": 0.19722604751586914, | |
| "memory(GiB)": 14.64, | |
| "step": 380, | |
| "token_acc": 0.95, | |
| "train_speed(iter/s)": 1.335828 | |
| }, | |
| { | |
| "epoch": 0.7389635316698656, | |
| "grad_norm": 5.145169064081025, | |
| "learning_rate": 3.885791919079878e-06, | |
| "loss": 0.21423752307891847, | |
| "memory(GiB)": 14.64, | |
| "step": 385, | |
| "token_acc": 0.95, | |
| "train_speed(iter/s)": 1.338733 | |
| }, | |
| { | |
| "epoch": 0.7485604606525912, | |
| "grad_norm": 5.169497287730162, | |
| "learning_rate": 3.6236914103229183e-06, | |
| "loss": 0.20372426509857178, | |
| "memory(GiB)": 14.64, | |
| "step": 390, | |
| "token_acc": 0.9666666666666667, | |
| "train_speed(iter/s)": 1.341587 | |
| }, | |
| { | |
| "epoch": 0.7581573896353166, | |
| "grad_norm": 5.7298685597313455, | |
| "learning_rate": 3.3687734175920505e-06, | |
| "loss": 0.20854339599609376, | |
| "memory(GiB)": 14.64, | |
| "step": 395, | |
| "token_acc": 0.9833333333333333, | |
| "train_speed(iter/s)": 1.344245 | |
| }, | |
| { | |
| "epoch": 0.7677543186180422, | |
| "grad_norm": 4.441317830039946, | |
| "learning_rate": 3.1213250901904124e-06, | |
| "loss": 0.19554933309555053, | |
| "memory(GiB)": 14.64, | |
| "step": 400, | |
| "token_acc": 0.9333333333333333, | |
| "train_speed(iter/s)": 1.346942 | |
| }, | |
| { | |
| "epoch": 0.7677543186180422, | |
| "eval_loss": 4.180116653442383, | |
| "eval_runtime": 5.8876, | |
| "eval_samples_per_second": 173.924, | |
| "eval_steps_per_second": 3.737, | |
| "eval_token_acc": 0.7771457823787042, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.7773512476007678, | |
| "grad_norm": 4.806155222132795, | |
| "learning_rate": 2.8816251633065963e-06, | |
| "loss": 0.1874774694442749, | |
| "memory(GiB)": 14.86, | |
| "step": 405, | |
| "token_acc": 0.7518656716417911, | |
| "train_speed(iter/s)": 1.319682 | |
| }, | |
| { | |
| "epoch": 0.7869481765834933, | |
| "grad_norm": 4.794191844356368, | |
| "learning_rate": 2.6499436440367165e-06, | |
| "loss": 0.19104986190795897, | |
| "memory(GiB)": 14.86, | |
| "step": 410, | |
| "token_acc": 0.9333333333333333, | |
| "train_speed(iter/s)": 1.322467 | |
| }, | |
| { | |
| "epoch": 0.7965451055662188, | |
| "grad_norm": 4.265720575711308, | |
| "learning_rate": 2.4265415072382016e-06, | |
| "loss": 0.16730222702026368, | |
| "memory(GiB)": 14.86, | |
| "step": 415, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 1.325142 | |
| }, | |
| { | |
| "epoch": 0.8061420345489443, | |
| "grad_norm": 5.305106824299531, | |
| "learning_rate": 2.211670401557804e-06, | |
| "loss": 0.18073848485946656, | |
| "memory(GiB)": 14.86, | |
| "step": 420, | |
| "token_acc": 0.9166666666666666, | |
| "train_speed(iter/s)": 1.327768 | |
| }, | |
| { | |
| "epoch": 0.8157389635316699, | |
| "grad_norm": 4.664634897091474, | |
| "learning_rate": 2.0055723659649907e-06, | |
| "loss": 0.1752355933189392, | |
| "memory(GiB)": 14.86, | |
| "step": 425, | |
| "token_acc": 0.9166666666666666, | |
| "train_speed(iter/s)": 1.329971 | |
| }, | |
| { | |
| "epoch": 0.8253358925143954, | |
| "grad_norm": 5.32565722929115, | |
| "learning_rate": 1.808479557110081e-06, | |
| "loss": 0.1725071668624878, | |
| "memory(GiB)": 14.86, | |
| "step": 430, | |
| "token_acc": 0.8983050847457628, | |
| "train_speed(iter/s)": 1.332507 | |
| }, | |
| { | |
| "epoch": 0.8349328214971209, | |
| "grad_norm": 5.169994958030524, | |
| "learning_rate": 1.620613987814189e-06, | |
| "loss": 0.17645432949066162, | |
| "memory(GiB)": 14.86, | |
| "step": 435, | |
| "token_acc": 0.95, | |
| "train_speed(iter/s)": 1.334554 | |
| }, | |
| { | |
| "epoch": 0.8445297504798465, | |
| "grad_norm": 4.776749046302731, | |
| "learning_rate": 1.4421872769855262e-06, | |
| "loss": 0.15960463285446166, | |
| "memory(GiB)": 14.86, | |
| "step": 440, | |
| "token_acc": 0.9166666666666666, | |
| "train_speed(iter/s)": 1.337081 | |
| }, | |
| { | |
| "epoch": 0.8541266794625719, | |
| "grad_norm": 4.847177703927449, | |
| "learning_rate": 1.273400411243857e-06, | |
| "loss": 0.17060750722885132, | |
| "memory(GiB)": 14.86, | |
| "step": 445, | |
| "token_acc": 0.95, | |
| "train_speed(iter/s)": 1.312228 | |
| }, | |
| { | |
| "epoch": 0.8637236084452975, | |
| "grad_norm": 3.6268219099777474, | |
| "learning_rate": 1.1144435185215374e-06, | |
| "loss": 0.14804649353027344, | |
| "memory(GiB)": 14.86, | |
| "step": 450, | |
| "token_acc": 0.95, | |
| "train_speed(iter/s)": 1.314874 | |
| }, | |
| { | |
| "epoch": 0.8733205374280231, | |
| "grad_norm": 4.281399638282854, | |
| "learning_rate": 9.65495653896179e-07, | |
| "loss": 0.15085405111312866, | |
| "memory(GiB)": 14.86, | |
| "step": 455, | |
| "token_acc": 0.9152542372881356, | |
| "train_speed(iter/s)": 1.317356 | |
| }, | |
| { | |
| "epoch": 0.8829174664107485, | |
| "grad_norm": 4.504296572931946, | |
| "learning_rate": 8.267245978962291e-07, | |
| "loss": 0.1522615671157837, | |
| "memory(GiB)": 14.86, | |
| "step": 460, | |
| "token_acc": 0.9833333333333333, | |
| "train_speed(iter/s)": 1.319858 | |
| }, | |
| { | |
| "epoch": 0.8925143953934741, | |
| "grad_norm": 5.018140341828036, | |
| "learning_rate": 6.98286667506618e-07, | |
| "loss": 0.15103095769882202, | |
| "memory(GiB)": 14.86, | |
| "step": 465, | |
| "token_acc": 0.9333333333333333, | |
| "train_speed(iter/s)": 1.32235 | |
| }, | |
| { | |
| "epoch": 0.9021113243761996, | |
| "grad_norm": 4.2640799586261515, | |
| "learning_rate": 5.803265400873514e-07, | |
| "loss": 0.15276429653167725, | |
| "memory(GiB)": 14.86, | |
| "step": 470, | |
| "token_acc": 0.9666666666666667, | |
| "train_speed(iter/s)": 1.324775 | |
| }, | |
| { | |
| "epoch": 0.9117082533589251, | |
| "grad_norm": 4.815044039867973, | |
| "learning_rate": 4.7297709040346474e-07, | |
| "loss": 0.15325998067855834, | |
| "memory(GiB)": 14.86, | |
| "step": 475, | |
| "token_acc": 0.9166666666666666, | |
| "train_speed(iter/s)": 1.327096 | |
| }, | |
| { | |
| "epoch": 0.9213051823416507, | |
| "grad_norm": 4.2732206457444315, | |
| "learning_rate": 3.76359240949834e-07, | |
| "loss": 0.13113073110580445, | |
| "memory(GiB)": 14.86, | |
| "step": 480, | |
| "token_acc": 0.95, | |
| "train_speed(iter/s)": 1.329432 | |
| }, | |
| { | |
| "epoch": 0.9309021113243762, | |
| "grad_norm": 3.725412000540453, | |
| "learning_rate": 2.905818257394799e-07, | |
| "loss": 0.128694486618042, | |
| "memory(GiB)": 14.86, | |
| "step": 485, | |
| "token_acc": 1.0, | |
| "train_speed(iter/s)": 1.331675 | |
| }, | |
| { | |
| "epoch": 0.9404990403071017, | |
| "grad_norm": 4.1471254070434425, | |
| "learning_rate": 2.15741467708821e-07, | |
| "loss": 0.1417480230331421, | |
| "memory(GiB)": 14.86, | |
| "step": 490, | |
| "token_acc": 0.9333333333333333, | |
| "train_speed(iter/s)": 1.333922 | |
| }, | |
| { | |
| "epoch": 0.9500959692898272, | |
| "grad_norm": 4.78807945642192, | |
| "learning_rate": 1.519224698779198e-07, | |
| "loss": 0.12884358167648316, | |
| "memory(GiB)": 14.86, | |
| "step": 495, | |
| "token_acc": 0.95, | |
| "train_speed(iter/s)": 1.336097 | |
| }, | |
| { | |
| "epoch": 0.9596928982725528, | |
| "grad_norm": 4.0494737594621135, | |
| "learning_rate": 9.919672038835926e-08, | |
| "loss": 0.13761271238327027, | |
| "memory(GiB)": 14.86, | |
| "step": 500, | |
| "token_acc": 0.9166666666666666, | |
| "train_speed(iter/s)": 1.338335 | |
| }, | |
| { | |
| "epoch": 0.9596928982725528, | |
| "eval_loss": 4.626655578613281, | |
| "eval_runtime": 6.1181, | |
| "eval_samples_per_second": 167.371, | |
| "eval_steps_per_second": 3.596, | |
| "eval_token_acc": 0.7764063477617883, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.9692898272552783, | |
| "grad_norm": 3.855247554495634, | |
| "learning_rate": 5.7623611525721155e-08, | |
| "loss": 0.1335052251815796, | |
| "memory(GiB)": 14.86, | |
| "step": 505, | |
| "token_acc": 0.753731343283582, | |
| "train_speed(iter/s)": 1.315161 | |
| }, | |
| { | |
| "epoch": 0.9788867562380038, | |
| "grad_norm": 5.015095661280352, | |
| "learning_rate": 2.7249972817849913e-08, | |
| "loss": 0.14048197269439697, | |
| "memory(GiB)": 14.86, | |
| "step": 510, | |
| "token_acc": 0.9333333333333333, | |
| "train_speed(iter/s)": 1.317064 | |
| }, | |
| { | |
| "epoch": 0.9884836852207294, | |
| "grad_norm": 3.5753180986867563, | |
| "learning_rate": 8.110018284304132e-09, | |
| "loss": 0.1381472110748291, | |
| "memory(GiB)": 14.86, | |
| "step": 515, | |
| "token_acc": 0.95, | |
| "train_speed(iter/s)": 1.319109 | |
| }, | |
| { | |
| "epoch": 0.9980806142034548, | |
| "grad_norm": 3.833364172860736, | |
| "learning_rate": 2.2530789637986716e-10, | |
| "loss": 0.14658401012420655, | |
| "memory(GiB)": 14.86, | |
| "step": 520, | |
| "token_acc": 0.9166666666666666, | |
| "train_speed(iter/s)": 1.320552 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 4.6301727294921875, | |
| "eval_runtime": 5.7214, | |
| "eval_samples_per_second": 178.977, | |
| "eval_steps_per_second": 3.845, | |
| "eval_token_acc": 0.7765769865195381, | |
| "step": 521 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 521, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 220, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.493328572372746e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |