{ "best_global_step": 100, "best_metric": 2.52767062, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 521, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0019193857965451055, "grad_norm": 50.499073662885124, "learning_rate": 3.773584905660378e-07, "loss": 1.6497690677642822, "memory(GiB)": 5.28, "step": 1, "token_acc": 0.5833333333333334, "train_speed(iter/s)": 0.09523 }, { "epoch": 0.009596928982725527, "grad_norm": 62.15822717412118, "learning_rate": 1.8867924528301889e-06, "loss": 1.7728731632232666, "memory(GiB)": 8.51, "step": 5, "token_acc": 0.4583333333333333, "train_speed(iter/s)": 0.298833 }, { "epoch": 0.019193857965451054, "grad_norm": 50.6949450155208, "learning_rate": 3.7735849056603777e-06, "loss": 1.5503947257995605, "memory(GiB)": 8.51, "step": 10, "token_acc": 0.5833333333333334, "train_speed(iter/s)": 0.486352 }, { "epoch": 0.028790786948176585, "grad_norm": 14.839276462104923, "learning_rate": 5.660377358490566e-06, "loss": 1.230274486541748, "memory(GiB)": 8.51, "step": 15, "token_acc": 0.6666666666666666, "train_speed(iter/s)": 0.629079 }, { "epoch": 0.03838771593090211, "grad_norm": 13.392088231370584, "learning_rate": 7.5471698113207555e-06, "loss": 1.1847952842712401, "memory(GiB)": 8.51, "step": 20, "token_acc": 0.6166666666666667, "train_speed(iter/s)": 0.726565 }, { "epoch": 0.04798464491362764, "grad_norm": 10.717873377537414, "learning_rate": 9.433962264150944e-06, "loss": 1.0945304870605468, "memory(GiB)": 8.51, "step": 25, "token_acc": 0.6833333333333333, "train_speed(iter/s)": 0.810848 }, { "epoch": 0.05758157389635317, "grad_norm": 8.889002212294471, "learning_rate": 1.1320754716981132e-05, "loss": 0.9990409851074219, "memory(GiB)": 8.51, "step": 30, "token_acc": 0.6833333333333333, "train_speed(iter/s)": 0.869855 }, { "epoch": 0.0671785028790787, "grad_norm": 8.012070786994876, "learning_rate": 1.320754716981132e-05, "loss": 0.9144926071166992, "memory(GiB)": 8.51, "step": 35, "token_acc": 0.7333333333333333, "train_speed(iter/s)": 0.928161 }, { "epoch": 0.07677543186180422, "grad_norm": 7.202447694086144, "learning_rate": 1.5094339622641511e-05, "loss": 0.9043998718261719, "memory(GiB)": 8.51, "step": 40, "token_acc": 0.7, "train_speed(iter/s)": 0.97586 }, { "epoch": 0.08637236084452975, "grad_norm": 8.57492132801927, "learning_rate": 1.69811320754717e-05, "loss": 0.8435896873474121, "memory(GiB)": 8.51, "step": 45, "token_acc": 0.75, "train_speed(iter/s)": 1.018993 }, { "epoch": 0.09596928982725528, "grad_norm": 8.638236742778556, "learning_rate": 1.8867924528301888e-05, "loss": 0.8246500015258789, "memory(GiB)": 8.51, "step": 50, "token_acc": 0.7833333333333333, "train_speed(iter/s)": 1.055415 }, { "epoch": 0.10556621880998081, "grad_norm": 13.004082007332865, "learning_rate": 1.999909877856721e-05, "loss": 0.7747729301452637, "memory(GiB)": 8.51, "step": 55, "token_acc": 0.7833333333333333, "train_speed(iter/s)": 1.085874 }, { "epoch": 0.11516314779270634, "grad_norm": 6.247101033734483, "learning_rate": 1.9988961902877895e-05, "loss": 0.7058096885681152, "memory(GiB)": 8.51, "step": 60, "token_acc": 0.8333333333333334, "train_speed(iter/s)": 1.11476 }, { "epoch": 0.12476007677543186, "grad_norm": 9.747466310041144, "learning_rate": 1.9967573081342103e-05, "loss": 0.7115418434143066, "memory(GiB)": 8.51, "step": 65, "token_acc": 0.8833333333333333, "train_speed(iter/s)": 1.139879 }, { "epoch": 0.1343570057581574, "grad_norm": 7.98629623252791, "learning_rate": 1.9934956407140285e-05, "loss": 0.6644338130950928, "memory(GiB)": 8.51, "step": 70, "token_acc": 0.85, "train_speed(iter/s)": 1.16312 }, { "epoch": 0.14395393474088292, "grad_norm": 8.205621057385066, "learning_rate": 1.989114862093232e-05, "loss": 0.6379920959472656, "memory(GiB)": 8.51, "step": 75, "token_acc": 0.8833333333333333, "train_speed(iter/s)": 1.184238 }, { "epoch": 0.15355086372360843, "grad_norm": 6.0726795344748306, "learning_rate": 1.983619906947144e-05, "loss": 0.5962705135345459, "memory(GiB)": 8.51, "step": 80, "token_acc": 0.85, "train_speed(iter/s)": 1.203586 }, { "epoch": 0.16314779270633398, "grad_norm": 7.461834141040474, "learning_rate": 1.977016965001817e-05, "loss": 0.6223911285400391, "memory(GiB)": 8.51, "step": 85, "token_acc": 0.8, "train_speed(iter/s)": 1.220982 }, { "epoch": 0.1727447216890595, "grad_norm": 8.225152822596419, "learning_rate": 1.9693134740616924e-05, "loss": 0.6050288677215576, "memory(GiB)": 8.51, "step": 90, "token_acc": 0.8166666666666667, "train_speed(iter/s)": 1.234884 }, { "epoch": 0.18234165067178504, "grad_norm": 8.58498910714931, "learning_rate": 1.9605181116313725e-05, "loss": 0.6168498516082763, "memory(GiB)": 8.51, "step": 95, "token_acc": 0.7666666666666667, "train_speed(iter/s)": 1.249099 }, { "epoch": 0.19193857965451055, "grad_norm": 5.4177042583550525, "learning_rate": 1.950640785140951e-05, "loss": 0.5574191093444825, "memory(GiB)": 8.51, "step": 100, "token_acc": 0.8166666666666667, "train_speed(iter/s)": 1.262519 }, { "epoch": 0.19193857965451055, "eval_loss": 2.5276706218719482, "eval_runtime": 8.6199, "eval_samples_per_second": 118.796, "eval_steps_per_second": 2.552, "eval_token_acc": 0.7803310391900347, "step": 100 }, { "epoch": 0.20153550863723607, "grad_norm": 6.179146882493892, "learning_rate": 1.9396926207859085e-05, "loss": 0.5534902572631836, "memory(GiB)": 14.21, "step": 105, "token_acc": 0.7369402985074627, "train_speed(iter/s)": 1.143897 }, { "epoch": 0.21113243761996162, "grad_norm": 5.641170797612031, "learning_rate": 1.927685950994143e-05, "loss": 0.5470232963562012, "memory(GiB)": 14.21, "step": 110, "token_acc": 0.75, "train_speed(iter/s)": 1.158894 }, { "epoch": 0.22072936660268713, "grad_norm": 7.710301646225508, "learning_rate": 1.9146343005342546e-05, "loss": 0.5334534645080566, "memory(GiB)": 14.21, "step": 115, "token_acc": 0.8135593220338984, "train_speed(iter/s)": 1.172 }, { "epoch": 0.23032629558541268, "grad_norm": 6.26207731899911, "learning_rate": 1.9005523712807335e-05, "loss": 0.5158659934997558, "memory(GiB)": 14.21, "step": 120, "token_acc": 0.8333333333333334, "train_speed(iter/s)": 1.184167 }, { "epoch": 0.2399232245681382, "grad_norm": 6.172362578282768, "learning_rate": 1.8854560256532098e-05, "loss": 0.5012799263000488, "memory(GiB)": 14.21, "step": 125, "token_acc": 0.8666666666666667, "train_speed(iter/s)": 1.19569 }, { "epoch": 0.2495201535508637, "grad_norm": 6.712616277483349, "learning_rate": 1.869362268748423e-05, "loss": 0.5169489860534668, "memory(GiB)": 14.21, "step": 130, "token_acc": 0.8333333333333334, "train_speed(iter/s)": 1.207346 }, { "epoch": 0.2591170825335892, "grad_norm": 7.148207284239342, "learning_rate": 1.8522892291850335e-05, "loss": 0.4680202007293701, "memory(GiB)": 14.21, "step": 135, "token_acc": 0.8666666666666667, "train_speed(iter/s)": 1.218262 }, { "epoch": 0.2687140115163148, "grad_norm": 5.828450923300192, "learning_rate": 1.8342561386828613e-05, "loss": 0.4936178207397461, "memory(GiB)": 14.21, "step": 140, "token_acc": 0.8333333333333334, "train_speed(iter/s)": 1.226985 }, { "epoch": 0.2783109404990403, "grad_norm": 6.652489935810153, "learning_rate": 1.8152833103995443e-05, "loss": 0.4828921318054199, "memory(GiB)": 14.21, "step": 145, "token_acc": 0.8166666666666667, "train_speed(iter/s)": 1.235963 }, { "epoch": 0.28790786948176583, "grad_norm": 7.187378559413358, "learning_rate": 1.795392116049028e-05, "loss": 0.4845115661621094, "memory(GiB)": 14.21, "step": 150, "token_acc": 0.8166666666666667, "train_speed(iter/s)": 1.244382 }, { "epoch": 0.29750479846449135, "grad_norm": 7.907765460694232, "learning_rate": 1.7746049618276545e-05, "loss": 0.46329803466796876, "memory(GiB)": 14.21, "step": 155, "token_acc": 0.8666666666666667, "train_speed(iter/s)": 1.252842 }, { "epoch": 0.30710172744721687, "grad_norm": 7.811457813376289, "learning_rate": 1.7529452631749743e-05, "loss": 0.44974498748779296, "memory(GiB)": 14.21, "step": 160, "token_acc": 0.9491525423728814, "train_speed(iter/s)": 1.261036 }, { "epoch": 0.31669865642994244, "grad_norm": 7.0025117295075185, "learning_rate": 1.7304374183977032e-05, "loss": 0.44871058464050295, "memory(GiB)": 14.21, "step": 165, "token_acc": 0.8333333333333334, "train_speed(iter/s)": 1.269117 }, { "epoch": 0.32629558541266795, "grad_norm": 8.097741811820914, "learning_rate": 1.7071067811865477e-05, "loss": 0.41682958602905273, "memory(GiB)": 14.21, "step": 170, "token_acc": 0.8333333333333334, "train_speed(iter/s)": 1.276776 }, { "epoch": 0.33589251439539347, "grad_norm": 6.662982343123765, "learning_rate": 1.6829796320568416e-05, "loss": 0.4470475196838379, "memory(GiB)": 14.21, "step": 175, "token_acc": 0.7627118644067796, "train_speed(iter/s)": 1.284139 }, { "epoch": 0.345489443378119, "grad_norm": 5.604840370084793, "learning_rate": 1.6580831487451788e-05, "loss": 0.4485145568847656, "memory(GiB)": 14.21, "step": 180, "token_acc": 0.8, "train_speed(iter/s)": 1.291145 }, { "epoch": 0.3550863723608445, "grad_norm": 7.227414415218275, "learning_rate": 1.6324453755953772e-05, "loss": 0.4181208610534668, "memory(GiB)": 14.21, "step": 185, "token_acc": 0.85, "train_speed(iter/s)": 1.297846 }, { "epoch": 0.3646833013435701, "grad_norm": 6.576718679657512, "learning_rate": 1.6060951919682665e-05, "loss": 0.4237715721130371, "memory(GiB)": 14.21, "step": 190, "token_acc": 0.8333333333333334, "train_speed(iter/s)": 1.304264 }, { "epoch": 0.3742802303262956, "grad_norm": 5.849761166650766, "learning_rate": 1.579062279710879e-05, "loss": 0.4109220504760742, "memory(GiB)": 14.21, "step": 195, "token_acc": 0.8, "train_speed(iter/s)": 1.310403 }, { "epoch": 0.3838771593090211, "grad_norm": 5.198284842400891, "learning_rate": 1.551377089721692e-05, "loss": 0.43619818687438966, "memory(GiB)": 14.21, "step": 200, "token_acc": 0.8333333333333334, "train_speed(iter/s)": 1.316343 }, { "epoch": 0.3838771593090211, "eval_loss": 2.959151029586792, "eval_runtime": 5.2772, "eval_samples_per_second": 194.044, "eval_steps_per_second": 4.169, "eval_token_acc": 0.7764063477617883, "step": 200 }, { "epoch": 0.3934740882917466, "grad_norm": 5.869635779885074, "learning_rate": 1.5230708076495777e-05, "loss": 0.3859025716781616, "memory(GiB)": 14.37, "step": 205, "token_acc": 0.7406716417910447, "train_speed(iter/s)": 1.273743 }, { "epoch": 0.40307101727447214, "grad_norm": 7.585471915910738, "learning_rate": 1.494175318765107e-05, "loss": 0.3978905439376831, "memory(GiB)": 14.37, "step": 210, "token_acc": 0.85, "train_speed(iter/s)": 1.279651 }, { "epoch": 0.4126679462571977, "grad_norm": 5.581100758479657, "learning_rate": 1.4647231720437687e-05, "loss": 0.38069303035736085, "memory(GiB)": 14.37, "step": 215, "token_acc": 0.9333333333333333, "train_speed(iter/s)": 1.283121 }, { "epoch": 0.42226487523992323, "grad_norm": 4.527918410267229, "learning_rate": 1.4347475435015686e-05, "loss": 0.37502858638763426, "memory(GiB)": 14.37, "step": 220, "token_acc": 0.8166666666666667, "train_speed(iter/s)": 1.288521 }, { "epoch": 0.43186180422264875, "grad_norm": 7.683117652206841, "learning_rate": 1.404282198824305e-05, "loss": 0.376017165184021, "memory(GiB)": 14.37, "step": 225, "token_acc": 0.8983050847457628, "train_speed(iter/s)": 1.243016 }, { "epoch": 0.44145873320537427, "grad_norm": 6.479535475022426, "learning_rate": 1.3733614553326211e-05, "loss": 0.38472347259521483, "memory(GiB)": 14.37, "step": 230, "token_acc": 0.8, "train_speed(iter/s)": 1.249779 }, { "epoch": 0.4510556621880998, "grad_norm": 4.918112500502897, "learning_rate": 1.342020143325669e-05, "loss": 0.36906707286834717, "memory(GiB)": 14.37, "step": 235, "token_acc": 0.8813559322033898, "train_speed(iter/s)": 1.256024 }, { "epoch": 0.46065259117082535, "grad_norm": 7.155983951765217, "learning_rate": 1.3102935668469403e-05, "loss": 0.3751584768295288, "memory(GiB)": 14.37, "step": 240, "token_acc": 0.9, "train_speed(iter/s)": 1.262154 }, { "epoch": 0.47024952015355087, "grad_norm": 6.331174285818235, "learning_rate": 1.2782174639164528e-05, "loss": 0.33184859752655027, "memory(GiB)": 14.37, "step": 245, "token_acc": 0.8666666666666667, "train_speed(iter/s)": 1.268047 }, { "epoch": 0.4798464491362764, "grad_norm": 6.272564933268104, "learning_rate": 1.2458279662740853e-05, "loss": 0.3417738676071167, "memory(GiB)": 14.37, "step": 250, "token_acc": 0.9322033898305084, "train_speed(iter/s)": 1.27312 }, { "epoch": 0.4894433781190019, "grad_norm": 5.135899432601579, "learning_rate": 1.2131615586794162e-05, "loss": 0.3375750303268433, "memory(GiB)": 14.37, "step": 255, "token_acc": 0.8666666666666667, "train_speed(iter/s)": 1.27874 }, { "epoch": 0.4990403071017274, "grad_norm": 4.932904357562163, "learning_rate": 1.180255037813906e-05, "loss": 0.33879258632659914, "memory(GiB)": 14.37, "step": 260, "token_acc": 0.85, "train_speed(iter/s)": 1.284461 }, { "epoch": 0.508637236084453, "grad_norm": 6.284886551479371, "learning_rate": 1.1471454708317163e-05, "loss": 0.3437638759613037, "memory(GiB)": 14.37, "step": 265, "token_acc": 0.8166666666666667, "train_speed(iter/s)": 1.289794 }, { "epoch": 0.5182341650671785, "grad_norm": 5.507510219698484, "learning_rate": 1.1138701536058624e-05, "loss": 0.3238619327545166, "memory(GiB)": 14.37, "step": 270, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 1.294939 }, { "epoch": 0.527831094049904, "grad_norm": 5.817219043282623, "learning_rate": 1.0804665687167262e-05, "loss": 0.34387760162353515, "memory(GiB)": 14.37, "step": 275, "token_acc": 0.9152542372881356, "train_speed(iter/s)": 1.299594 }, { "epoch": 0.5374280230326296, "grad_norm": 6.342634523270455, "learning_rate": 1.0469723432302528e-05, "loss": 0.32937374114990237, "memory(GiB)": 14.37, "step": 280, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 1.304474 }, { "epoch": 0.5470249520153551, "grad_norm": 5.687502098500072, "learning_rate": 1.0134252063133976e-05, "loss": 0.31735076904296877, "memory(GiB)": 14.37, "step": 285, "token_acc": 0.9322033898305084, "train_speed(iter/s)": 1.309045 }, { "epoch": 0.5566218809980806, "grad_norm": 7.638698661734066, "learning_rate": 9.7986294673456e-06, "loss": 0.32815046310424806, "memory(GiB)": 14.37, "step": 290, "token_acc": 0.9333333333333333, "train_speed(iter/s)": 1.312928 }, { "epoch": 0.5662188099808061, "grad_norm": 5.185796628229698, "learning_rate": 9.463233702968784e-06, "loss": 0.28549041748046877, "memory(GiB)": 14.37, "step": 295, "token_acc": 0.8833333333333333, "train_speed(iter/s)": 1.316799 }, { "epoch": 0.5758157389635317, "grad_norm": 6.568762262669577, "learning_rate": 9.128442572523418e-06, "loss": 0.30935580730438234, "memory(GiB)": 14.37, "step": 300, "token_acc": 0.9166666666666666, "train_speed(iter/s)": 1.320695 }, { "epoch": 0.5758157389635317, "eval_loss": 3.3033859729766846, "eval_runtime": 5.4365, "eval_samples_per_second": 188.356, "eval_steps_per_second": 4.047, "eval_token_acc": 0.7782833740970366, "step": 300 }, { "epoch": 0.5854126679462572, "grad_norm": 4.913910457218756, "learning_rate": 8.79463319744677e-06, "loss": 0.29166135787963865, "memory(GiB)": 14.64, "step": 305, "token_acc": 0.746268656716418, "train_speed(iter/s)": 1.290093 }, { "epoch": 0.5950095969289827, "grad_norm": 5.315946085188087, "learning_rate": 8.462181593289596e-06, "loss": 0.2921705961227417, "memory(GiB)": 14.64, "step": 310, "token_acc": 0.9, "train_speed(iter/s)": 1.293804 }, { "epoch": 0.6046065259117083, "grad_norm": 6.632601739948281, "learning_rate": 8.131462246157953e-06, "loss": 0.27473609447479247, "memory(GiB)": 14.64, "step": 315, "token_acc": 0.8666666666666667, "train_speed(iter/s)": 1.297418 }, { "epoch": 0.6142034548944337, "grad_norm": 4.731443243675736, "learning_rate": 7.802847690877832e-06, "loss": 0.285859489440918, "memory(GiB)": 14.64, "step": 320, "token_acc": 0.9, "train_speed(iter/s)": 1.30087 }, { "epoch": 0.6238003838771593, "grad_norm": 4.810635538915133, "learning_rate": 7.476708091357783e-06, "loss": 0.28462786674499513, "memory(GiB)": 14.64, "step": 325, "token_acc": 0.9, "train_speed(iter/s)": 1.30423 }, { "epoch": 0.6333973128598849, "grad_norm": 5.436375373696315, "learning_rate": 7.153410823622253e-06, "loss": 0.27665414810180666, "memory(GiB)": 14.64, "step": 330, "token_acc": 0.85, "train_speed(iter/s)": 1.307088 }, { "epoch": 0.6429942418426103, "grad_norm": 5.703330211485846, "learning_rate": 6.833320061985278e-06, "loss": 0.26464405059814455, "memory(GiB)": 14.64, "step": 335, "token_acc": 0.8833333333333333, "train_speed(iter/s)": 1.309735 }, { "epoch": 0.6525911708253359, "grad_norm": 5.558113116379478, "learning_rate": 6.5167963688306975e-06, "loss": 0.26451945304870605, "memory(GiB)": 14.64, "step": 340, "token_acc": 0.95, "train_speed(iter/s)": 1.31332 }, { "epoch": 0.6621880998080614, "grad_norm": 6.105803216866346, "learning_rate": 6.204196288461037e-06, "loss": 0.258668041229248, "memory(GiB)": 14.64, "step": 345, "token_acc": 0.9166666666666666, "train_speed(iter/s)": 1.316835 }, { "epoch": 0.6717850287907869, "grad_norm": 5.492680478610087, "learning_rate": 5.895871945472434e-06, "loss": 0.2474271535873413, "memory(GiB)": 14.64, "step": 350, "token_acc": 0.95, "train_speed(iter/s)": 1.320051 }, { "epoch": 0.6813819577735125, "grad_norm": 5.535798325855053, "learning_rate": 5.5921706481081405e-06, "loss": 0.2485145092010498, "memory(GiB)": 14.64, "step": 355, "token_acc": 0.9, "train_speed(iter/s)": 1.323103 }, { "epoch": 0.690978886756238, "grad_norm": 5.2845851547926905, "learning_rate": 5.293434497037355e-06, "loss": 0.22786808013916016, "memory(GiB)": 14.64, "step": 360, "token_acc": 0.9661016949152542, "train_speed(iter/s)": 1.324048 }, { "epoch": 0.7005758157389635, "grad_norm": 5.954809588087527, "learning_rate": 5.000000000000003e-06, "loss": 0.22964661121368407, "memory(GiB)": 14.64, "step": 365, "token_acc": 0.9333333333333333, "train_speed(iter/s)": 1.327039 }, { "epoch": 0.710172744721689, "grad_norm": 4.659945233935531, "learning_rate": 4.712197692751673e-06, "loss": 0.23819453716278077, "memory(GiB)": 14.64, "step": 370, "token_acc": 0.9333333333333333, "train_speed(iter/s)": 1.329992 }, { "epoch": 0.7197696737044146, "grad_norm": 4.743878549379203, "learning_rate": 4.430351766735609e-06, "loss": 0.22235493659973143, "memory(GiB)": 14.64, "step": 375, "token_acc": 0.9491525423728814, "train_speed(iter/s)": 1.332906 }, { "epoch": 0.7293666026871402, "grad_norm": 4.893844108568125, "learning_rate": 4.154779703901114e-06, "loss": 0.19722604751586914, "memory(GiB)": 14.64, "step": 380, "token_acc": 0.95, "train_speed(iter/s)": 1.335828 }, { "epoch": 0.7389635316698656, "grad_norm": 5.145169064081025, "learning_rate": 3.885791919079878e-06, "loss": 0.21423752307891847, "memory(GiB)": 14.64, "step": 385, "token_acc": 0.95, "train_speed(iter/s)": 1.338733 }, { "epoch": 0.7485604606525912, "grad_norm": 5.169497287730162, "learning_rate": 3.6236914103229183e-06, "loss": 0.20372426509857178, "memory(GiB)": 14.64, "step": 390, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 1.341587 }, { "epoch": 0.7581573896353166, "grad_norm": 5.7298685597313455, "learning_rate": 3.3687734175920505e-06, "loss": 0.20854339599609376, "memory(GiB)": 14.64, "step": 395, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 1.344245 }, { "epoch": 0.7677543186180422, "grad_norm": 4.441317830039946, "learning_rate": 3.1213250901904124e-06, "loss": 0.19554933309555053, "memory(GiB)": 14.64, "step": 400, "token_acc": 0.9333333333333333, "train_speed(iter/s)": 1.346942 }, { "epoch": 0.7677543186180422, "eval_loss": 4.180116653442383, "eval_runtime": 5.8876, "eval_samples_per_second": 173.924, "eval_steps_per_second": 3.737, "eval_token_acc": 0.7771457823787042, "step": 400 }, { "epoch": 0.7773512476007678, "grad_norm": 4.806155222132795, "learning_rate": 2.8816251633065963e-06, "loss": 0.1874774694442749, "memory(GiB)": 14.86, "step": 405, "token_acc": 0.7518656716417911, "train_speed(iter/s)": 1.319682 }, { "epoch": 0.7869481765834933, "grad_norm": 4.794191844356368, "learning_rate": 2.6499436440367165e-06, "loss": 0.19104986190795897, "memory(GiB)": 14.86, "step": 410, "token_acc": 0.9333333333333333, "train_speed(iter/s)": 1.322467 }, { "epoch": 0.7965451055662188, "grad_norm": 4.265720575711308, "learning_rate": 2.4265415072382016e-06, "loss": 0.16730222702026368, "memory(GiB)": 14.86, "step": 415, "token_acc": 1.0, "train_speed(iter/s)": 1.325142 }, { "epoch": 0.8061420345489443, "grad_norm": 5.305106824299531, "learning_rate": 2.211670401557804e-06, "loss": 0.18073848485946656, "memory(GiB)": 14.86, "step": 420, "token_acc": 0.9166666666666666, "train_speed(iter/s)": 1.327768 }, { "epoch": 0.8157389635316699, "grad_norm": 4.664634897091474, "learning_rate": 2.0055723659649907e-06, "loss": 0.1752355933189392, "memory(GiB)": 14.86, "step": 425, "token_acc": 0.9166666666666666, "train_speed(iter/s)": 1.329971 }, { "epoch": 0.8253358925143954, "grad_norm": 5.32565722929115, "learning_rate": 1.808479557110081e-06, "loss": 0.1725071668624878, "memory(GiB)": 14.86, "step": 430, "token_acc": 0.8983050847457628, "train_speed(iter/s)": 1.332507 }, { "epoch": 0.8349328214971209, "grad_norm": 5.169994958030524, "learning_rate": 1.620613987814189e-06, "loss": 0.17645432949066162, "memory(GiB)": 14.86, "step": 435, "token_acc": 0.95, "train_speed(iter/s)": 1.334554 }, { "epoch": 0.8445297504798465, "grad_norm": 4.776749046302731, "learning_rate": 1.4421872769855262e-06, "loss": 0.15960463285446166, "memory(GiB)": 14.86, "step": 440, "token_acc": 0.9166666666666666, "train_speed(iter/s)": 1.337081 }, { "epoch": 0.8541266794625719, "grad_norm": 4.847177703927449, "learning_rate": 1.273400411243857e-06, "loss": 0.17060750722885132, "memory(GiB)": 14.86, "step": 445, "token_acc": 0.95, "train_speed(iter/s)": 1.312228 }, { "epoch": 0.8637236084452975, "grad_norm": 3.6268219099777474, "learning_rate": 1.1144435185215374e-06, "loss": 0.14804649353027344, "memory(GiB)": 14.86, "step": 450, "token_acc": 0.95, "train_speed(iter/s)": 1.314874 }, { "epoch": 0.8733205374280231, "grad_norm": 4.281399638282854, "learning_rate": 9.65495653896179e-07, "loss": 0.15085405111312866, "memory(GiB)": 14.86, "step": 455, "token_acc": 0.9152542372881356, "train_speed(iter/s)": 1.317356 }, { "epoch": 0.8829174664107485, "grad_norm": 4.504296572931946, "learning_rate": 8.267245978962291e-07, "loss": 0.1522615671157837, "memory(GiB)": 14.86, "step": 460, "token_acc": 0.9833333333333333, "train_speed(iter/s)": 1.319858 }, { "epoch": 0.8925143953934741, "grad_norm": 5.018140341828036, "learning_rate": 6.98286667506618e-07, "loss": 0.15103095769882202, "memory(GiB)": 14.86, "step": 465, "token_acc": 0.9333333333333333, "train_speed(iter/s)": 1.32235 }, { "epoch": 0.9021113243761996, "grad_norm": 4.2640799586261515, "learning_rate": 5.803265400873514e-07, "loss": 0.15276429653167725, "memory(GiB)": 14.86, "step": 470, "token_acc": 0.9666666666666667, "train_speed(iter/s)": 1.324775 }, { "epoch": 0.9117082533589251, "grad_norm": 4.815044039867973, "learning_rate": 4.7297709040346474e-07, "loss": 0.15325998067855834, "memory(GiB)": 14.86, "step": 475, "token_acc": 0.9166666666666666, "train_speed(iter/s)": 1.327096 }, { "epoch": 0.9213051823416507, "grad_norm": 4.2732206457444315, "learning_rate": 3.76359240949834e-07, "loss": 0.13113073110580445, "memory(GiB)": 14.86, "step": 480, "token_acc": 0.95, "train_speed(iter/s)": 1.329432 }, { "epoch": 0.9309021113243762, "grad_norm": 3.725412000540453, "learning_rate": 2.905818257394799e-07, "loss": 0.128694486618042, "memory(GiB)": 14.86, "step": 485, "token_acc": 1.0, "train_speed(iter/s)": 1.331675 }, { "epoch": 0.9404990403071017, "grad_norm": 4.1471254070434425, "learning_rate": 2.15741467708821e-07, "loss": 0.1417480230331421, "memory(GiB)": 14.86, "step": 490, "token_acc": 0.9333333333333333, "train_speed(iter/s)": 1.333922 }, { "epoch": 0.9500959692898272, "grad_norm": 4.78807945642192, "learning_rate": 1.519224698779198e-07, "loss": 0.12884358167648316, "memory(GiB)": 14.86, "step": 495, "token_acc": 0.95, "train_speed(iter/s)": 1.336097 }, { "epoch": 0.9596928982725528, "grad_norm": 4.0494737594621135, "learning_rate": 9.919672038835926e-08, "loss": 0.13761271238327027, "memory(GiB)": 14.86, "step": 500, "token_acc": 0.9166666666666666, "train_speed(iter/s)": 1.338335 }, { "epoch": 0.9596928982725528, "eval_loss": 4.626655578613281, "eval_runtime": 6.1181, "eval_samples_per_second": 167.371, "eval_steps_per_second": 3.596, "eval_token_acc": 0.7764063477617883, "step": 500 }, { "epoch": 0.9692898272552783, "grad_norm": 3.855247554495634, "learning_rate": 5.7623611525721155e-08, "loss": 0.1335052251815796, "memory(GiB)": 14.86, "step": 505, "token_acc": 0.753731343283582, "train_speed(iter/s)": 1.315161 }, { "epoch": 0.9788867562380038, "grad_norm": 5.015095661280352, "learning_rate": 2.7249972817849913e-08, "loss": 0.14048197269439697, "memory(GiB)": 14.86, "step": 510, "token_acc": 0.9333333333333333, "train_speed(iter/s)": 1.317064 }, { "epoch": 0.9884836852207294, "grad_norm": 3.5753180986867563, "learning_rate": 8.110018284304132e-09, "loss": 0.1381472110748291, "memory(GiB)": 14.86, "step": 515, "token_acc": 0.95, "train_speed(iter/s)": 1.319109 }, { "epoch": 0.9980806142034548, "grad_norm": 3.833364172860736, "learning_rate": 2.2530789637986716e-10, "loss": 0.14658401012420655, "memory(GiB)": 14.86, "step": 520, "token_acc": 0.9166666666666666, "train_speed(iter/s)": 1.320552 }, { "epoch": 1.0, "eval_loss": 4.6301727294921875, "eval_runtime": 5.7214, "eval_samples_per_second": 178.977, "eval_steps_per_second": 3.845, "eval_token_acc": 0.7765769865195381, "step": 521 } ], "logging_steps": 5, "max_steps": 521, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 220, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.493328572372746e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }