{
"best_global_step": 100,
"best_metric": 2.52767062,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 521,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0019193857965451055,
"grad_norm": 50.499073662885124,
"learning_rate": 3.773584905660378e-07,
"loss": 1.6497690677642822,
"memory(GiB)": 5.28,
"step": 1,
"token_acc": 0.5833333333333334,
"train_speed(iter/s)": 0.09523
},
{
"epoch": 0.009596928982725527,
"grad_norm": 62.15822717412118,
"learning_rate": 1.8867924528301889e-06,
"loss": 1.7728731632232666,
"memory(GiB)": 8.51,
"step": 5,
"token_acc": 0.4583333333333333,
"train_speed(iter/s)": 0.298833
},
{
"epoch": 0.019193857965451054,
"grad_norm": 50.6949450155208,
"learning_rate": 3.7735849056603777e-06,
"loss": 1.5503947257995605,
"memory(GiB)": 8.51,
"step": 10,
"token_acc": 0.5833333333333334,
"train_speed(iter/s)": 0.486352
},
{
"epoch": 0.028790786948176585,
"grad_norm": 14.839276462104923,
"learning_rate": 5.660377358490566e-06,
"loss": 1.230274486541748,
"memory(GiB)": 8.51,
"step": 15,
"token_acc": 0.6666666666666666,
"train_speed(iter/s)": 0.629079
},
{
"epoch": 0.03838771593090211,
"grad_norm": 13.392088231370584,
"learning_rate": 7.5471698113207555e-06,
"loss": 1.1847952842712401,
"memory(GiB)": 8.51,
"step": 20,
"token_acc": 0.6166666666666667,
"train_speed(iter/s)": 0.726565
},
{
"epoch": 0.04798464491362764,
"grad_norm": 10.717873377537414,
"learning_rate": 9.433962264150944e-06,
"loss": 1.0945304870605468,
"memory(GiB)": 8.51,
"step": 25,
"token_acc": 0.6833333333333333,
"train_speed(iter/s)": 0.810848
},
{
"epoch": 0.05758157389635317,
"grad_norm": 8.889002212294471,
"learning_rate": 1.1320754716981132e-05,
"loss": 0.9990409851074219,
"memory(GiB)": 8.51,
"step": 30,
"token_acc": 0.6833333333333333,
"train_speed(iter/s)": 0.869855
},
{
"epoch": 0.0671785028790787,
"grad_norm": 8.012070786994876,
"learning_rate": 1.320754716981132e-05,
"loss": 0.9144926071166992,
"memory(GiB)": 8.51,
"step": 35,
"token_acc": 0.7333333333333333,
"train_speed(iter/s)": 0.928161
},
{
"epoch": 0.07677543186180422,
"grad_norm": 7.202447694086144,
"learning_rate": 1.5094339622641511e-05,
"loss": 0.9043998718261719,
"memory(GiB)": 8.51,
"step": 40,
"token_acc": 0.7,
"train_speed(iter/s)": 0.97586
},
{
"epoch": 0.08637236084452975,
"grad_norm": 8.57492132801927,
"learning_rate": 1.69811320754717e-05,
"loss": 0.8435896873474121,
"memory(GiB)": 8.51,
"step": 45,
"token_acc": 0.75,
"train_speed(iter/s)": 1.018993
},
{
"epoch": 0.09596928982725528,
"grad_norm": 8.638236742778556,
"learning_rate": 1.8867924528301888e-05,
"loss": 0.8246500015258789,
"memory(GiB)": 8.51,
"step": 50,
"token_acc": 0.7833333333333333,
"train_speed(iter/s)": 1.055415
},
{
"epoch": 0.10556621880998081,
"grad_norm": 13.004082007332865,
"learning_rate": 1.999909877856721e-05,
"loss": 0.7747729301452637,
"memory(GiB)": 8.51,
"step": 55,
"token_acc": 0.7833333333333333,
"train_speed(iter/s)": 1.085874
},
{
"epoch": 0.11516314779270634,
"grad_norm": 6.247101033734483,
"learning_rate": 1.9988961902877895e-05,
"loss": 0.7058096885681152,
"memory(GiB)": 8.51,
"step": 60,
"token_acc": 0.8333333333333334,
"train_speed(iter/s)": 1.11476
},
{
"epoch": 0.12476007677543186,
"grad_norm": 9.747466310041144,
"learning_rate": 1.9967573081342103e-05,
"loss": 0.7115418434143066,
"memory(GiB)": 8.51,
"step": 65,
"token_acc": 0.8833333333333333,
"train_speed(iter/s)": 1.139879
},
{
"epoch": 0.1343570057581574,
"grad_norm": 7.98629623252791,
"learning_rate": 1.9934956407140285e-05,
"loss": 0.6644338130950928,
"memory(GiB)": 8.51,
"step": 70,
"token_acc": 0.85,
"train_speed(iter/s)": 1.16312
},
{
"epoch": 0.14395393474088292,
"grad_norm": 8.205621057385066,
"learning_rate": 1.989114862093232e-05,
"loss": 0.6379920959472656,
"memory(GiB)": 8.51,
"step": 75,
"token_acc": 0.8833333333333333,
"train_speed(iter/s)": 1.184238
},
{
"epoch": 0.15355086372360843,
"grad_norm": 6.0726795344748306,
"learning_rate": 1.983619906947144e-05,
"loss": 0.5962705135345459,
"memory(GiB)": 8.51,
"step": 80,
"token_acc": 0.85,
"train_speed(iter/s)": 1.203586
},
{
"epoch": 0.16314779270633398,
"grad_norm": 7.461834141040474,
"learning_rate": 1.977016965001817e-05,
"loss": 0.6223911285400391,
"memory(GiB)": 8.51,
"step": 85,
"token_acc": 0.8,
"train_speed(iter/s)": 1.220982
},
{
"epoch": 0.1727447216890595,
"grad_norm": 8.225152822596419,
"learning_rate": 1.9693134740616924e-05,
"loss": 0.6050288677215576,
"memory(GiB)": 8.51,
"step": 90,
"token_acc": 0.8166666666666667,
"train_speed(iter/s)": 1.234884
},
{
"epoch": 0.18234165067178504,
"grad_norm": 8.58498910714931,
"learning_rate": 1.9605181116313725e-05,
"loss": 0.6168498516082763,
"memory(GiB)": 8.51,
"step": 95,
"token_acc": 0.7666666666666667,
"train_speed(iter/s)": 1.249099
},
{
"epoch": 0.19193857965451055,
"grad_norm": 5.4177042583550525,
"learning_rate": 1.950640785140951e-05,
"loss": 0.5574191093444825,
"memory(GiB)": 8.51,
"step": 100,
"token_acc": 0.8166666666666667,
"train_speed(iter/s)": 1.262519
},
{
"epoch": 0.19193857965451055,
"eval_loss": 2.5276706218719482,
"eval_runtime": 8.6199,
"eval_samples_per_second": 118.796,
"eval_steps_per_second": 2.552,
"eval_token_acc": 0.7803310391900347,
"step": 100
},
{
"epoch": 0.20153550863723607,
"grad_norm": 6.179146882493892,
"learning_rate": 1.9396926207859085e-05,
"loss": 0.5534902572631836,
"memory(GiB)": 14.21,
"step": 105,
"token_acc": 0.7369402985074627,
"train_speed(iter/s)": 1.143897
},
{
"epoch": 0.21113243761996162,
"grad_norm": 5.641170797612031,
"learning_rate": 1.927685950994143e-05,
"loss": 0.5470232963562012,
"memory(GiB)": 14.21,
"step": 110,
"token_acc": 0.75,
"train_speed(iter/s)": 1.158894
},
{
"epoch": 0.22072936660268713,
"grad_norm": 7.710301646225508,
"learning_rate": 1.9146343005342546e-05,
"loss": 0.5334534645080566,
"memory(GiB)": 14.21,
"step": 115,
"token_acc": 0.8135593220338984,
"train_speed(iter/s)": 1.172
},
{
"epoch": 0.23032629558541268,
"grad_norm": 6.26207731899911,
"learning_rate": 1.9005523712807335e-05,
"loss": 0.5158659934997558,
"memory(GiB)": 14.21,
"step": 120,
"token_acc": 0.8333333333333334,
"train_speed(iter/s)": 1.184167
},
{
"epoch": 0.2399232245681382,
"grad_norm": 6.172362578282768,
"learning_rate": 1.8854560256532098e-05,
"loss": 0.5012799263000488,
"memory(GiB)": 14.21,
"step": 125,
"token_acc": 0.8666666666666667,
"train_speed(iter/s)": 1.19569
},
{
"epoch": 0.2495201535508637,
"grad_norm": 6.712616277483349,
"learning_rate": 1.869362268748423e-05,
"loss": 0.5169489860534668,
"memory(GiB)": 14.21,
"step": 130,
"token_acc": 0.8333333333333334,
"train_speed(iter/s)": 1.207346
},
{
"epoch": 0.2591170825335892,
"grad_norm": 7.148207284239342,
"learning_rate": 1.8522892291850335e-05,
"loss": 0.4680202007293701,
"memory(GiB)": 14.21,
"step": 135,
"token_acc": 0.8666666666666667,
"train_speed(iter/s)": 1.218262
},
{
"epoch": 0.2687140115163148,
"grad_norm": 5.828450923300192,
"learning_rate": 1.8342561386828613e-05,
"loss": 0.4936178207397461,
"memory(GiB)": 14.21,
"step": 140,
"token_acc": 0.8333333333333334,
"train_speed(iter/s)": 1.226985
},
{
"epoch": 0.2783109404990403,
"grad_norm": 6.652489935810153,
"learning_rate": 1.8152833103995443e-05,
"loss": 0.4828921318054199,
"memory(GiB)": 14.21,
"step": 145,
"token_acc": 0.8166666666666667,
"train_speed(iter/s)": 1.235963
},
{
"epoch": 0.28790786948176583,
"grad_norm": 7.187378559413358,
"learning_rate": 1.795392116049028e-05,
"loss": 0.4845115661621094,
"memory(GiB)": 14.21,
"step": 150,
"token_acc": 0.8166666666666667,
"train_speed(iter/s)": 1.244382
},
{
"epoch": 0.29750479846449135,
"grad_norm": 7.907765460694232,
"learning_rate": 1.7746049618276545e-05,
"loss": 0.46329803466796876,
"memory(GiB)": 14.21,
"step": 155,
"token_acc": 0.8666666666666667,
"train_speed(iter/s)": 1.252842
},
{
"epoch": 0.30710172744721687,
"grad_norm": 7.811457813376289,
"learning_rate": 1.7529452631749743e-05,
"loss": 0.44974498748779296,
"memory(GiB)": 14.21,
"step": 160,
"token_acc": 0.9491525423728814,
"train_speed(iter/s)": 1.261036
},
{
"epoch": 0.31669865642994244,
"grad_norm": 7.0025117295075185,
"learning_rate": 1.7304374183977032e-05,
"loss": 0.44871058464050295,
"memory(GiB)": 14.21,
"step": 165,
"token_acc": 0.8333333333333334,
"train_speed(iter/s)": 1.269117
},
{
"epoch": 0.32629558541266795,
"grad_norm": 8.097741811820914,
"learning_rate": 1.7071067811865477e-05,
"loss": 0.41682958602905273,
"memory(GiB)": 14.21,
"step": 170,
"token_acc": 0.8333333333333334,
"train_speed(iter/s)": 1.276776
},
{
"epoch": 0.33589251439539347,
"grad_norm": 6.662982343123765,
"learning_rate": 1.6829796320568416e-05,
"loss": 0.4470475196838379,
"memory(GiB)": 14.21,
"step": 175,
"token_acc": 0.7627118644067796,
"train_speed(iter/s)": 1.284139
},
{
"epoch": 0.345489443378119,
"grad_norm": 5.604840370084793,
"learning_rate": 1.6580831487451788e-05,
"loss": 0.4485145568847656,
"memory(GiB)": 14.21,
"step": 180,
"token_acc": 0.8,
"train_speed(iter/s)": 1.291145
},
{
"epoch": 0.3550863723608445,
"grad_norm": 7.227414415218275,
"learning_rate": 1.6324453755953772e-05,
"loss": 0.4181208610534668,
"memory(GiB)": 14.21,
"step": 185,
"token_acc": 0.85,
"train_speed(iter/s)": 1.297846
},
{
"epoch": 0.3646833013435701,
"grad_norm": 6.576718679657512,
"learning_rate": 1.6060951919682665e-05,
"loss": 0.4237715721130371,
"memory(GiB)": 14.21,
"step": 190,
"token_acc": 0.8333333333333334,
"train_speed(iter/s)": 1.304264
},
{
"epoch": 0.3742802303262956,
"grad_norm": 5.849761166650766,
"learning_rate": 1.579062279710879e-05,
"loss": 0.4109220504760742,
"memory(GiB)": 14.21,
"step": 195,
"token_acc": 0.8,
"train_speed(iter/s)": 1.310403
},
{
"epoch": 0.3838771593090211,
"grad_norm": 5.198284842400891,
"learning_rate": 1.551377089721692e-05,
"loss": 0.43619818687438966,
"memory(GiB)": 14.21,
"step": 200,
"token_acc": 0.8333333333333334,
"train_speed(iter/s)": 1.316343
},
{
"epoch": 0.3838771593090211,
"eval_loss": 2.959151029586792,
"eval_runtime": 5.2772,
"eval_samples_per_second": 194.044,
"eval_steps_per_second": 4.169,
"eval_token_acc": 0.7764063477617883,
"step": 200
},
{
"epoch": 0.3934740882917466,
"grad_norm": 5.869635779885074,
"learning_rate": 1.5230708076495777e-05,
"loss": 0.3859025716781616,
"memory(GiB)": 14.37,
"step": 205,
"token_acc": 0.7406716417910447,
"train_speed(iter/s)": 1.273743
},
{
"epoch": 0.40307101727447214,
"grad_norm": 7.585471915910738,
"learning_rate": 1.494175318765107e-05,
"loss": 0.3978905439376831,
"memory(GiB)": 14.37,
"step": 210,
"token_acc": 0.85,
"train_speed(iter/s)": 1.279651
},
{
"epoch": 0.4126679462571977,
"grad_norm": 5.581100758479657,
"learning_rate": 1.4647231720437687e-05,
"loss": 0.38069303035736085,
"memory(GiB)": 14.37,
"step": 215,
"token_acc": 0.9333333333333333,
"train_speed(iter/s)": 1.283121
},
{
"epoch": 0.42226487523992323,
"grad_norm": 4.527918410267229,
"learning_rate": 1.4347475435015686e-05,
"loss": 0.37502858638763426,
"memory(GiB)": 14.37,
"step": 220,
"token_acc": 0.8166666666666667,
"train_speed(iter/s)": 1.288521
},
{
"epoch": 0.43186180422264875,
"grad_norm": 7.683117652206841,
"learning_rate": 1.404282198824305e-05,
"loss": 0.376017165184021,
"memory(GiB)": 14.37,
"step": 225,
"token_acc": 0.8983050847457628,
"train_speed(iter/s)": 1.243016
},
{
"epoch": 0.44145873320537427,
"grad_norm": 6.479535475022426,
"learning_rate": 1.3733614553326211e-05,
"loss": 0.38472347259521483,
"memory(GiB)": 14.37,
"step": 230,
"token_acc": 0.8,
"train_speed(iter/s)": 1.249779
},
{
"epoch": 0.4510556621880998,
"grad_norm": 4.918112500502897,
"learning_rate": 1.342020143325669e-05,
"loss": 0.36906707286834717,
"memory(GiB)": 14.37,
"step": 235,
"token_acc": 0.8813559322033898,
"train_speed(iter/s)": 1.256024
},
{
"epoch": 0.46065259117082535,
"grad_norm": 7.155983951765217,
"learning_rate": 1.3102935668469403e-05,
"loss": 0.3751584768295288,
"memory(GiB)": 14.37,
"step": 240,
"token_acc": 0.9,
"train_speed(iter/s)": 1.262154
},
{
"epoch": 0.47024952015355087,
"grad_norm": 6.331174285818235,
"learning_rate": 1.2782174639164528e-05,
"loss": 0.33184859752655027,
"memory(GiB)": 14.37,
"step": 245,
"token_acc": 0.8666666666666667,
"train_speed(iter/s)": 1.268047
},
{
"epoch": 0.4798464491362764,
"grad_norm": 6.272564933268104,
"learning_rate": 1.2458279662740853e-05,
"loss": 0.3417738676071167,
"memory(GiB)": 14.37,
"step": 250,
"token_acc": 0.9322033898305084,
"train_speed(iter/s)": 1.27312
},
{
"epoch": 0.4894433781190019,
"grad_norm": 5.135899432601579,
"learning_rate": 1.2131615586794162e-05,
"loss": 0.3375750303268433,
"memory(GiB)": 14.37,
"step": 255,
"token_acc": 0.8666666666666667,
"train_speed(iter/s)": 1.27874
},
{
"epoch": 0.4990403071017274,
"grad_norm": 4.932904357562163,
"learning_rate": 1.180255037813906e-05,
"loss": 0.33879258632659914,
"memory(GiB)": 14.37,
"step": 260,
"token_acc": 0.85,
"train_speed(iter/s)": 1.284461
},
{
"epoch": 0.508637236084453,
"grad_norm": 6.284886551479371,
"learning_rate": 1.1471454708317163e-05,
"loss": 0.3437638759613037,
"memory(GiB)": 14.37,
"step": 265,
"token_acc": 0.8166666666666667,
"train_speed(iter/s)": 1.289794
},
{
"epoch": 0.5182341650671785,
"grad_norm": 5.507510219698484,
"learning_rate": 1.1138701536058624e-05,
"loss": 0.3238619327545166,
"memory(GiB)": 14.37,
"step": 270,
"token_acc": 0.9666666666666667,
"train_speed(iter/s)": 1.294939
},
{
"epoch": 0.527831094049904,
"grad_norm": 5.817219043282623,
"learning_rate": 1.0804665687167262e-05,
"loss": 0.34387760162353515,
"memory(GiB)": 14.37,
"step": 275,
"token_acc": 0.9152542372881356,
"train_speed(iter/s)": 1.299594
},
{
"epoch": 0.5374280230326296,
"grad_norm": 6.342634523270455,
"learning_rate": 1.0469723432302528e-05,
"loss": 0.32937374114990237,
"memory(GiB)": 14.37,
"step": 280,
"token_acc": 0.9833333333333333,
"train_speed(iter/s)": 1.304474
},
{
"epoch": 0.5470249520153551,
"grad_norm": 5.687502098500072,
"learning_rate": 1.0134252063133976e-05,
"loss": 0.31735076904296877,
"memory(GiB)": 14.37,
"step": 285,
"token_acc": 0.9322033898305084,
"train_speed(iter/s)": 1.309045
},
{
"epoch": 0.5566218809980806,
"grad_norm": 7.638698661734066,
"learning_rate": 9.7986294673456e-06,
"loss": 0.32815046310424806,
"memory(GiB)": 14.37,
"step": 290,
"token_acc": 0.9333333333333333,
"train_speed(iter/s)": 1.312928
},
{
"epoch": 0.5662188099808061,
"grad_norm": 5.185796628229698,
"learning_rate": 9.463233702968784e-06,
"loss": 0.28549041748046877,
"memory(GiB)": 14.37,
"step": 295,
"token_acc": 0.8833333333333333,
"train_speed(iter/s)": 1.316799
},
{
"epoch": 0.5758157389635317,
"grad_norm": 6.568762262669577,
"learning_rate": 9.128442572523418e-06,
"loss": 0.30935580730438234,
"memory(GiB)": 14.37,
"step": 300,
"token_acc": 0.9166666666666666,
"train_speed(iter/s)": 1.320695
},
{
"epoch": 0.5758157389635317,
"eval_loss": 3.3033859729766846,
"eval_runtime": 5.4365,
"eval_samples_per_second": 188.356,
"eval_steps_per_second": 4.047,
"eval_token_acc": 0.7782833740970366,
"step": 300
},
{
"epoch": 0.5854126679462572,
"grad_norm": 4.913910457218756,
"learning_rate": 8.79463319744677e-06,
"loss": 0.29166135787963865,
"memory(GiB)": 14.64,
"step": 305,
"token_acc": 0.746268656716418,
"train_speed(iter/s)": 1.290093
},
{
"epoch": 0.5950095969289827,
"grad_norm": 5.315946085188087,
"learning_rate": 8.462181593289596e-06,
"loss": 0.2921705961227417,
"memory(GiB)": 14.64,
"step": 310,
"token_acc": 0.9,
"train_speed(iter/s)": 1.293804
},
{
"epoch": 0.6046065259117083,
"grad_norm": 6.632601739948281,
"learning_rate": 8.131462246157953e-06,
"loss": 0.27473609447479247,
"memory(GiB)": 14.64,
"step": 315,
"token_acc": 0.8666666666666667,
"train_speed(iter/s)": 1.297418
},
{
"epoch": 0.6142034548944337,
"grad_norm": 4.731443243675736,
"learning_rate": 7.802847690877832e-06,
"loss": 0.285859489440918,
"memory(GiB)": 14.64,
"step": 320,
"token_acc": 0.9,
"train_speed(iter/s)": 1.30087
},
{
"epoch": 0.6238003838771593,
"grad_norm": 4.810635538915133,
"learning_rate": 7.476708091357783e-06,
"loss": 0.28462786674499513,
"memory(GiB)": 14.64,
"step": 325,
"token_acc": 0.9,
"train_speed(iter/s)": 1.30423
},
{
"epoch": 0.6333973128598849,
"grad_norm": 5.436375373696315,
"learning_rate": 7.153410823622253e-06,
"loss": 0.27665414810180666,
"memory(GiB)": 14.64,
"step": 330,
"token_acc": 0.85,
"train_speed(iter/s)": 1.307088
},
{
"epoch": 0.6429942418426103,
"grad_norm": 5.703330211485846,
"learning_rate": 6.833320061985278e-06,
"loss": 0.26464405059814455,
"memory(GiB)": 14.64,
"step": 335,
"token_acc": 0.8833333333333333,
"train_speed(iter/s)": 1.309735
},
{
"epoch": 0.6525911708253359,
"grad_norm": 5.558113116379478,
"learning_rate": 6.5167963688306975e-06,
"loss": 0.26451945304870605,
"memory(GiB)": 14.64,
"step": 340,
"token_acc": 0.95,
"train_speed(iter/s)": 1.31332
},
{
"epoch": 0.6621880998080614,
"grad_norm": 6.105803216866346,
"learning_rate": 6.204196288461037e-06,
"loss": 0.258668041229248,
"memory(GiB)": 14.64,
"step": 345,
"token_acc": 0.9166666666666666,
"train_speed(iter/s)": 1.316835
},
{
"epoch": 0.6717850287907869,
"grad_norm": 5.492680478610087,
"learning_rate": 5.895871945472434e-06,
"loss": 0.2474271535873413,
"memory(GiB)": 14.64,
"step": 350,
"token_acc": 0.95,
"train_speed(iter/s)": 1.320051
},
{
"epoch": 0.6813819577735125,
"grad_norm": 5.535798325855053,
"learning_rate": 5.5921706481081405e-06,
"loss": 0.2485145092010498,
"memory(GiB)": 14.64,
"step": 355,
"token_acc": 0.9,
"train_speed(iter/s)": 1.323103
},
{
"epoch": 0.690978886756238,
"grad_norm": 5.2845851547926905,
"learning_rate": 5.293434497037355e-06,
"loss": 0.22786808013916016,
"memory(GiB)": 14.64,
"step": 360,
"token_acc": 0.9661016949152542,
"train_speed(iter/s)": 1.324048
},
{
"epoch": 0.7005758157389635,
"grad_norm": 5.954809588087527,
"learning_rate": 5.000000000000003e-06,
"loss": 0.22964661121368407,
"memory(GiB)": 14.64,
"step": 365,
"token_acc": 0.9333333333333333,
"train_speed(iter/s)": 1.327039
},
{
"epoch": 0.710172744721689,
"grad_norm": 4.659945233935531,
"learning_rate": 4.712197692751673e-06,
"loss": 0.23819453716278077,
"memory(GiB)": 14.64,
"step": 370,
"token_acc": 0.9333333333333333,
"train_speed(iter/s)": 1.329992
},
{
"epoch": 0.7197696737044146,
"grad_norm": 4.743878549379203,
"learning_rate": 4.430351766735609e-06,
"loss": 0.22235493659973143,
"memory(GiB)": 14.64,
"step": 375,
"token_acc": 0.9491525423728814,
"train_speed(iter/s)": 1.332906
},
{
"epoch": 0.7293666026871402,
"grad_norm": 4.893844108568125,
"learning_rate": 4.154779703901114e-06,
"loss": 0.19722604751586914,
"memory(GiB)": 14.64,
"step": 380,
"token_acc": 0.95,
"train_speed(iter/s)": 1.335828
},
{
"epoch": 0.7389635316698656,
"grad_norm": 5.145169064081025,
"learning_rate": 3.885791919079878e-06,
"loss": 0.21423752307891847,
"memory(GiB)": 14.64,
"step": 385,
"token_acc": 0.95,
"train_speed(iter/s)": 1.338733
},
{
"epoch": 0.7485604606525912,
"grad_norm": 5.169497287730162,
"learning_rate": 3.6236914103229183e-06,
"loss": 0.20372426509857178,
"memory(GiB)": 14.64,
"step": 390,
"token_acc": 0.9666666666666667,
"train_speed(iter/s)": 1.341587
},
{
"epoch": 0.7581573896353166,
"grad_norm": 5.7298685597313455,
"learning_rate": 3.3687734175920505e-06,
"loss": 0.20854339599609376,
"memory(GiB)": 14.64,
"step": 395,
"token_acc": 0.9833333333333333,
"train_speed(iter/s)": 1.344245
},
{
"epoch": 0.7677543186180422,
"grad_norm": 4.441317830039946,
"learning_rate": 3.1213250901904124e-06,
"loss": 0.19554933309555053,
"memory(GiB)": 14.64,
"step": 400,
"token_acc": 0.9333333333333333,
"train_speed(iter/s)": 1.346942
},
{
"epoch": 0.7677543186180422,
"eval_loss": 4.180116653442383,
"eval_runtime": 5.8876,
"eval_samples_per_second": 173.924,
"eval_steps_per_second": 3.737,
"eval_token_acc": 0.7771457823787042,
"step": 400
},
{
"epoch": 0.7773512476007678,
"grad_norm": 4.806155222132795,
"learning_rate": 2.8816251633065963e-06,
"loss": 0.1874774694442749,
"memory(GiB)": 14.86,
"step": 405,
"token_acc": 0.7518656716417911,
"train_speed(iter/s)": 1.319682
},
{
"epoch": 0.7869481765834933,
"grad_norm": 4.794191844356368,
"learning_rate": 2.6499436440367165e-06,
"loss": 0.19104986190795897,
"memory(GiB)": 14.86,
"step": 410,
"token_acc": 0.9333333333333333,
"train_speed(iter/s)": 1.322467
},
{
"epoch": 0.7965451055662188,
"grad_norm": 4.265720575711308,
"learning_rate": 2.4265415072382016e-06,
"loss": 0.16730222702026368,
"memory(GiB)": 14.86,
"step": 415,
"token_acc": 1.0,
"train_speed(iter/s)": 1.325142
},
{
"epoch": 0.8061420345489443,
"grad_norm": 5.305106824299531,
"learning_rate": 2.211670401557804e-06,
"loss": 0.18073848485946656,
"memory(GiB)": 14.86,
"step": 420,
"token_acc": 0.9166666666666666,
"train_speed(iter/s)": 1.327768
},
{
"epoch": 0.8157389635316699,
"grad_norm": 4.664634897091474,
"learning_rate": 2.0055723659649907e-06,
"loss": 0.1752355933189392,
"memory(GiB)": 14.86,
"step": 425,
"token_acc": 0.9166666666666666,
"train_speed(iter/s)": 1.329971
},
{
"epoch": 0.8253358925143954,
"grad_norm": 5.32565722929115,
"learning_rate": 1.808479557110081e-06,
"loss": 0.1725071668624878,
"memory(GiB)": 14.86,
"step": 430,
"token_acc": 0.8983050847457628,
"train_speed(iter/s)": 1.332507
},
{
"epoch": 0.8349328214971209,
"grad_norm": 5.169994958030524,
"learning_rate": 1.620613987814189e-06,
"loss": 0.17645432949066162,
"memory(GiB)": 14.86,
"step": 435,
"token_acc": 0.95,
"train_speed(iter/s)": 1.334554
},
{
"epoch": 0.8445297504798465,
"grad_norm": 4.776749046302731,
"learning_rate": 1.4421872769855262e-06,
"loss": 0.15960463285446166,
"memory(GiB)": 14.86,
"step": 440,
"token_acc": 0.9166666666666666,
"train_speed(iter/s)": 1.337081
},
{
"epoch": 0.8541266794625719,
"grad_norm": 4.847177703927449,
"learning_rate": 1.273400411243857e-06,
"loss": 0.17060750722885132,
"memory(GiB)": 14.86,
"step": 445,
"token_acc": 0.95,
"train_speed(iter/s)": 1.312228
},
{
"epoch": 0.8637236084452975,
"grad_norm": 3.6268219099777474,
"learning_rate": 1.1144435185215374e-06,
"loss": 0.14804649353027344,
"memory(GiB)": 14.86,
"step": 450,
"token_acc": 0.95,
"train_speed(iter/s)": 1.314874
},
{
"epoch": 0.8733205374280231,
"grad_norm": 4.281399638282854,
"learning_rate": 9.65495653896179e-07,
"loss": 0.15085405111312866,
"memory(GiB)": 14.86,
"step": 455,
"token_acc": 0.9152542372881356,
"train_speed(iter/s)": 1.317356
},
{
"epoch": 0.8829174664107485,
"grad_norm": 4.504296572931946,
"learning_rate": 8.267245978962291e-07,
"loss": 0.1522615671157837,
"memory(GiB)": 14.86,
"step": 460,
"token_acc": 0.9833333333333333,
"train_speed(iter/s)": 1.319858
},
{
"epoch": 0.8925143953934741,
"grad_norm": 5.018140341828036,
"learning_rate": 6.98286667506618e-07,
"loss": 0.15103095769882202,
"memory(GiB)": 14.86,
"step": 465,
"token_acc": 0.9333333333333333,
"train_speed(iter/s)": 1.32235
},
{
"epoch": 0.9021113243761996,
"grad_norm": 4.2640799586261515,
"learning_rate": 5.803265400873514e-07,
"loss": 0.15276429653167725,
"memory(GiB)": 14.86,
"step": 470,
"token_acc": 0.9666666666666667,
"train_speed(iter/s)": 1.324775
},
{
"epoch": 0.9117082533589251,
"grad_norm": 4.815044039867973,
"learning_rate": 4.7297709040346474e-07,
"loss": 0.15325998067855834,
"memory(GiB)": 14.86,
"step": 475,
"token_acc": 0.9166666666666666,
"train_speed(iter/s)": 1.327096
},
{
"epoch": 0.9213051823416507,
"grad_norm": 4.2732206457444315,
"learning_rate": 3.76359240949834e-07,
"loss": 0.13113073110580445,
"memory(GiB)": 14.86,
"step": 480,
"token_acc": 0.95,
"train_speed(iter/s)": 1.329432
},
{
"epoch": 0.9309021113243762,
"grad_norm": 3.725412000540453,
"learning_rate": 2.905818257394799e-07,
"loss": 0.128694486618042,
"memory(GiB)": 14.86,
"step": 485,
"token_acc": 1.0,
"train_speed(iter/s)": 1.331675
},
{
"epoch": 0.9404990403071017,
"grad_norm": 4.1471254070434425,
"learning_rate": 2.15741467708821e-07,
"loss": 0.1417480230331421,
"memory(GiB)": 14.86,
"step": 490,
"token_acc": 0.9333333333333333,
"train_speed(iter/s)": 1.333922
},
{
"epoch": 0.9500959692898272,
"grad_norm": 4.78807945642192,
"learning_rate": 1.519224698779198e-07,
"loss": 0.12884358167648316,
"memory(GiB)": 14.86,
"step": 495,
"token_acc": 0.95,
"train_speed(iter/s)": 1.336097
},
{
"epoch": 0.9596928982725528,
"grad_norm": 4.0494737594621135,
"learning_rate": 9.919672038835926e-08,
"loss": 0.13761271238327027,
"memory(GiB)": 14.86,
"step": 500,
"token_acc": 0.9166666666666666,
"train_speed(iter/s)": 1.338335
},
{
"epoch": 0.9596928982725528,
"eval_loss": 4.626655578613281,
"eval_runtime": 6.1181,
"eval_samples_per_second": 167.371,
"eval_steps_per_second": 3.596,
"eval_token_acc": 0.7764063477617883,
"step": 500
},
{
"epoch": 0.9692898272552783,
"grad_norm": 3.855247554495634,
"learning_rate": 5.7623611525721155e-08,
"loss": 0.1335052251815796,
"memory(GiB)": 14.86,
"step": 505,
"token_acc": 0.753731343283582,
"train_speed(iter/s)": 1.315161
},
{
"epoch": 0.9788867562380038,
"grad_norm": 5.015095661280352,
"learning_rate": 2.7249972817849913e-08,
"loss": 0.14048197269439697,
"memory(GiB)": 14.86,
"step": 510,
"token_acc": 0.9333333333333333,
"train_speed(iter/s)": 1.317064
},
{
"epoch": 0.9884836852207294,
"grad_norm": 3.5753180986867563,
"learning_rate": 8.110018284304132e-09,
"loss": 0.1381472110748291,
"memory(GiB)": 14.86,
"step": 515,
"token_acc": 0.95,
"train_speed(iter/s)": 1.319109
},
{
"epoch": 0.9980806142034548,
"grad_norm": 3.833364172860736,
"learning_rate": 2.2530789637986716e-10,
"loss": 0.14658401012420655,
"memory(GiB)": 14.86,
"step": 520,
"token_acc": 0.9166666666666666,
"train_speed(iter/s)": 1.320552
},
{
"epoch": 1.0,
"eval_loss": 4.6301727294921875,
"eval_runtime": 5.7214,
"eval_samples_per_second": 178.977,
"eval_steps_per_second": 3.845,
"eval_token_acc": 0.7765769865195381,
"step": 521
}
],
"logging_steps": 5,
"max_steps": 521,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 220,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.493328572372746e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}