DeRooseBERTa / trainer_state.json
ddore14's picture
Upload 6 files
8434575 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 75.1879934828926,
"eval_steps": 2000,
"global_step": 150000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0,
"eval_accuracy": 1.3660035857594126e-07,
"eval_loss": 132.875,
"eval_runtime": 254.898,
"eval_samples_per_second": 6486.396,
"eval_steps_per_second": 12.672,
"step": 0
},
{
"epoch": 0.13762730525736305,
"grad_norm": 49.58098602294922,
"learning_rate": 2.967e-05,
"loss": 178.872046875,
"step": 1000
},
{
"epoch": 0.2752546105147261,
"grad_norm": 40.552101135253906,
"learning_rate": 5.966999999999999e-05,
"loss": 82.2545625,
"step": 2000
},
{
"epoch": 0.2752546105147261,
"eval_accuracy": 0.6086885594122716,
"eval_loss": 16.890625,
"eval_runtime": 245.8339,
"eval_samples_per_second": 6725.555,
"eval_steps_per_second": 13.139,
"step": 2000
},
{
"epoch": 0.41288191577208916,
"grad_norm": 37.14718246459961,
"learning_rate": 8.966999999999999e-05,
"loss": 66.408390625,
"step": 3000
},
{
"epoch": 0.5505092210294522,
"grad_norm": 32.7165641784668,
"learning_rate": 0.00011960999999999999,
"loss": 59.63419921875,
"step": 4000
},
{
"epoch": 0.5505092210294522,
"eval_accuracy": 0.6626575571303744,
"eval_loss": 13.6171875,
"eval_runtime": 239.1422,
"eval_samples_per_second": 6913.748,
"eval_steps_per_second": 13.507,
"step": 4000
},
{
"epoch": 0.6881365262868153,
"grad_norm": 26.651721954345703,
"learning_rate": 0.00014960999999999997,
"loss": 55.59596484375,
"step": 5000
},
{
"epoch": 0.8257638315441783,
"grad_norm": 25.13609504699707,
"learning_rate": 0.00017961,
"loss": 52.9440390625,
"step": 6000
},
{
"epoch": 0.8257638315441783,
"eval_accuracy": 0.6820435002667244,
"eval_loss": 12.4453125,
"eval_runtime": 241.0405,
"eval_samples_per_second": 6859.298,
"eval_steps_per_second": 13.4,
"step": 6000
},
{
"epoch": 0.9633911368015414,
"grad_norm": 25.540454864501953,
"learning_rate": 0.00020960999999999997,
"loss": 51.0653359375,
"step": 7000
},
{
"epoch": 1.1010184420589044,
"grad_norm": 22.598819732666016,
"learning_rate": 0.00023960999999999996,
"loss": 49.669171875,
"step": 8000
},
{
"epoch": 1.1010184420589044,
"eval_accuracy": 0.6911625811516694,
"eval_loss": 11.875,
"eval_runtime": 238.8748,
"eval_samples_per_second": 6921.488,
"eval_steps_per_second": 13.522,
"step": 8000
},
{
"epoch": 1.2386457473162675,
"grad_norm": 21.2167911529541,
"learning_rate": 0.00026957999999999995,
"loss": 48.713546875,
"step": 9000
},
{
"epoch": 1.3762730525736306,
"grad_norm": 20.751371383666992,
"learning_rate": 0.00029955,
"loss": 47.99215625,
"step": 10000
},
{
"epoch": 1.3762730525736306,
"eval_accuracy": 0.6964798957051618,
"eval_loss": 11.5546875,
"eval_runtime": 240.5455,
"eval_samples_per_second": 6873.416,
"eval_steps_per_second": 13.428,
"step": 10000
},
{
"epoch": 1.5139003578309937,
"grad_norm": 21.686861038208008,
"learning_rate": 0.0003,
"loss": 47.29948046875,
"step": 11000
},
{
"epoch": 1.6515276630883569,
"grad_norm": 18.800752639770508,
"learning_rate": 0.0003,
"loss": 46.53960546875,
"step": 12000
},
{
"epoch": 1.6515276630883569,
"eval_accuracy": 0.701906575930677,
"eval_loss": 11.2265625,
"eval_runtime": 239.4873,
"eval_samples_per_second": 6903.785,
"eval_steps_per_second": 13.487,
"step": 12000
},
{
"epoch": 1.7891549683457197,
"grad_norm": 19.42099952697754,
"learning_rate": 0.0003,
"loss": 45.9191953125,
"step": 13000
},
{
"epoch": 1.9267822736030829,
"grad_norm": 19.15869140625,
"learning_rate": 0.0003,
"loss": 45.381796875,
"step": 14000
},
{
"epoch": 1.9267822736030829,
"eval_accuracy": 0.7061886218301343,
"eval_loss": 10.9765625,
"eval_runtime": 239.2568,
"eval_samples_per_second": 6910.438,
"eval_steps_per_second": 13.5,
"step": 14000
},
{
"epoch": 2.0644095788604457,
"grad_norm": 16.94078826904297,
"learning_rate": 0.0003,
"loss": 44.90976953125,
"step": 15000
},
{
"epoch": 2.202036884117809,
"grad_norm": 17.655250549316406,
"learning_rate": 0.0003,
"loss": 44.45808203125,
"step": 16000
},
{
"epoch": 2.202036884117809,
"eval_accuracy": 0.7093567074531988,
"eval_loss": 10.8203125,
"eval_runtime": 241.1911,
"eval_samples_per_second": 6855.016,
"eval_steps_per_second": 13.392,
"step": 16000
},
{
"epoch": 2.339664189375172,
"grad_norm": 17.694721221923828,
"learning_rate": 0.0003,
"loss": 44.16196484375,
"step": 17000
},
{
"epoch": 2.477291494632535,
"grad_norm": 17.49053955078125,
"learning_rate": 0.0003,
"loss": 43.84825,
"step": 18000
},
{
"epoch": 2.477291494632535,
"eval_accuracy": 0.7115680703757034,
"eval_loss": 10.6640625,
"eval_runtime": 239.4688,
"eval_samples_per_second": 6904.32,
"eval_steps_per_second": 13.488,
"step": 18000
},
{
"epoch": 2.614918799889898,
"grad_norm": 19.09914207458496,
"learning_rate": 0.0003,
"loss": 43.59271875,
"step": 19000
},
{
"epoch": 2.7525461051472613,
"grad_norm": 16.3907527923584,
"learning_rate": 0.0003,
"loss": 43.352640625,
"step": 20000
},
{
"epoch": 2.7525461051472613,
"eval_accuracy": 0.7139675040013439,
"eval_loss": 10.5546875,
"eval_runtime": 238.1647,
"eval_samples_per_second": 6942.126,
"eval_steps_per_second": 13.562,
"step": 20000
},
{
"epoch": 2.8901734104046244,
"grad_norm": 15.896549224853516,
"learning_rate": 0.0003,
"loss": 43.17196875,
"step": 21000
},
{
"epoch": 3.0278007156619875,
"grad_norm": 29.67310905456543,
"learning_rate": 0.0003,
"loss": 42.92155859375,
"step": 22000
},
{
"epoch": 3.0278007156619875,
"eval_accuracy": 0.7146398068421496,
"eval_loss": 10.484375,
"eval_runtime": 238.4701,
"eval_samples_per_second": 6933.233,
"eval_steps_per_second": 13.545,
"step": 22000
},
{
"epoch": 3.1654280209193506,
"grad_norm": 16.424579620361328,
"learning_rate": 0.0003,
"loss": 42.65390625,
"step": 23000
},
{
"epoch": 3.3030553261767133,
"grad_norm": 16.19496726989746,
"learning_rate": 0.0003,
"loss": 42.48802734375,
"step": 24000
},
{
"epoch": 3.3030553261767133,
"eval_accuracy": 0.7170818189501579,
"eval_loss": 10.3671875,
"eval_runtime": 240.7935,
"eval_samples_per_second": 6866.337,
"eval_steps_per_second": 13.414,
"step": 24000
},
{
"epoch": 3.4406826314340764,
"grad_norm": 15.53753662109375,
"learning_rate": 0.0003,
"loss": 42.357984375,
"step": 25000
},
{
"epoch": 3.5783099366914395,
"grad_norm": 16.701377868652344,
"learning_rate": 0.0003,
"loss": 42.1965703125,
"step": 26000
},
{
"epoch": 3.5783099366914395,
"eval_accuracy": 0.7182790131411184,
"eval_loss": 10.2890625,
"eval_runtime": 241.2004,
"eval_samples_per_second": 6854.753,
"eval_steps_per_second": 13.391,
"step": 26000
},
{
"epoch": 3.7159372419488026,
"grad_norm": 15.334391593933105,
"learning_rate": 0.0003,
"loss": 42.05885546875,
"step": 27000
},
{
"epoch": 3.8535645472061657,
"grad_norm": 15.341226577758789,
"learning_rate": 0.0003,
"loss": 41.9392421875,
"step": 28000
},
{
"epoch": 3.8535645472061657,
"eval_accuracy": 0.719322972712139,
"eval_loss": 10.2421875,
"eval_runtime": 241.186,
"eval_samples_per_second": 6855.162,
"eval_steps_per_second": 13.392,
"step": 28000
},
{
"epoch": 3.991191852463529,
"grad_norm": 16.253334045410156,
"learning_rate": 0.0003,
"loss": 41.81163671875,
"step": 29000
},
{
"epoch": 4.1288191577208915,
"grad_norm": 15.035149574279785,
"learning_rate": 0.0003,
"loss": 41.617953125,
"step": 30000
},
{
"epoch": 4.1288191577208915,
"eval_accuracy": 0.720451396648655,
"eval_loss": 10.171875,
"eval_runtime": 240.2855,
"eval_samples_per_second": 6880.853,
"eval_steps_per_second": 13.442,
"step": 30000
},
{
"epoch": 4.266446462978255,
"grad_norm": 14.762296676635742,
"learning_rate": 0.0003,
"loss": 41.5138203125,
"step": 31000
},
{
"epoch": 4.404073768235618,
"grad_norm": 14.627701759338379,
"learning_rate": 0.0003,
"loss": 41.4306015625,
"step": 32000
},
{
"epoch": 4.404073768235618,
"eval_accuracy": 0.7213651673804347,
"eval_loss": 10.1328125,
"eval_runtime": 242.2962,
"eval_samples_per_second": 6823.752,
"eval_steps_per_second": 13.331,
"step": 32000
},
{
"epoch": 4.541701073492981,
"grad_norm": 14.57941722869873,
"learning_rate": 0.0003,
"loss": 41.3221171875,
"step": 33000
},
{
"epoch": 4.679328378750344,
"grad_norm": 15.291731834411621,
"learning_rate": 0.0003,
"loss": 41.276203125,
"step": 34000
},
{
"epoch": 4.679328378750344,
"eval_accuracy": 0.7223401458779132,
"eval_loss": 10.0703125,
"eval_runtime": 239.5218,
"eval_samples_per_second": 6902.793,
"eval_steps_per_second": 13.485,
"step": 34000
},
{
"epoch": 4.8169556840077075,
"grad_norm": 15.057552337646484,
"learning_rate": 0.0003,
"loss": 41.19701171875,
"step": 35000
},
{
"epoch": 4.95458298926507,
"grad_norm": 15.457907676696777,
"learning_rate": 0.0003,
"loss": 41.10438671875,
"step": 36000
},
{
"epoch": 4.95458298926507,
"eval_accuracy": 0.7230995451445803,
"eval_loss": 10.0546875,
"eval_runtime": 240.0931,
"eval_samples_per_second": 6886.367,
"eval_steps_per_second": 13.453,
"step": 36000
},
{
"epoch": 5.092210294522434,
"grad_norm": 15.539594650268555,
"learning_rate": 0.0003,
"loss": 40.93646875,
"step": 37000
},
{
"epoch": 5.229837599779796,
"grad_norm": 14.915628433227539,
"learning_rate": 0.0003,
"loss": 40.8286875,
"step": 38000
},
{
"epoch": 5.229837599779796,
"eval_accuracy": 0.7238966529952525,
"eval_loss": 10.0,
"eval_runtime": 239.9098,
"eval_samples_per_second": 6891.629,
"eval_steps_per_second": 13.463,
"step": 38000
},
{
"epoch": 5.367464905037159,
"grad_norm": 14.271048545837402,
"learning_rate": 0.0003,
"loss": 40.80625,
"step": 39000
},
{
"epoch": 5.505092210294523,
"grad_norm": 14.605119705200195,
"learning_rate": 0.0003,
"loss": 40.713796875,
"step": 40000
},
{
"epoch": 5.505092210294523,
"eval_accuracy": 0.7245612679427045,
"eval_loss": 9.9609375,
"eval_runtime": 240.0103,
"eval_samples_per_second": 6888.743,
"eval_steps_per_second": 13.458,
"step": 40000
},
{
"epoch": 5.642719515551885,
"grad_norm": 14.748287200927734,
"learning_rate": 0.0003,
"loss": 40.62338671875,
"step": 41000
},
{
"epoch": 5.780346820809249,
"grad_norm": 15.422652244567871,
"learning_rate": 0.0003,
"loss": 40.56144140625,
"step": 42000
},
{
"epoch": 5.780346820809249,
"eval_accuracy": 0.7251576961266964,
"eval_loss": 9.9375,
"eval_runtime": 240.638,
"eval_samples_per_second": 6870.772,
"eval_steps_per_second": 13.423,
"step": 42000
},
{
"epoch": 5.917974126066611,
"grad_norm": 15.326558113098145,
"learning_rate": 0.0003,
"loss": 40.5059375,
"step": 43000
},
{
"epoch": 6.055601431323975,
"grad_norm": 15.331598281860352,
"learning_rate": 0.0003,
"loss": 40.40818359375,
"step": 44000
},
{
"epoch": 6.055601431323975,
"eval_accuracy": 0.7254487554600376,
"eval_loss": 9.8984375,
"eval_runtime": 240.1599,
"eval_samples_per_second": 6884.449,
"eval_steps_per_second": 13.449,
"step": 44000
},
{
"epoch": 6.193228736581338,
"grad_norm": 14.527973175048828,
"learning_rate": 0.0003,
"loss": 40.3428828125,
"step": 45000
},
{
"epoch": 6.330856041838701,
"grad_norm": 15.686996459960938,
"learning_rate": 0.0003,
"loss": 40.3244765625,
"step": 46000
},
{
"epoch": 6.330856041838701,
"eval_accuracy": 0.7256079479674087,
"eval_loss": 9.8984375,
"eval_runtime": 239.085,
"eval_samples_per_second": 6915.403,
"eval_steps_per_second": 13.51,
"step": 46000
},
{
"epoch": 6.468483347096064,
"grad_norm": 14.848986625671387,
"learning_rate": 0.0003,
"loss": 40.312796875,
"step": 47000
},
{
"epoch": 6.6061106523534265,
"grad_norm": 14.275111198425293,
"learning_rate": 0.0003,
"loss": 40.28499609375,
"step": 48000
},
{
"epoch": 6.6061106523534265,
"eval_accuracy": 0.7262142861047188,
"eval_loss": 9.875,
"eval_runtime": 240.5807,
"eval_samples_per_second": 6872.409,
"eval_steps_per_second": 13.426,
"step": 48000
},
{
"epoch": 6.74373795761079,
"grad_norm": 14.665587425231934,
"learning_rate": 0.0003,
"loss": 40.18369921875,
"step": 49000
},
{
"epoch": 6.881365262868153,
"grad_norm": 14.547246932983398,
"learning_rate": 0.0003,
"loss": 40.1498828125,
"step": 50000
},
{
"epoch": 6.881365262868153,
"eval_accuracy": 0.7269716959581425,
"eval_loss": 9.8515625,
"eval_runtime": 241.4549,
"eval_samples_per_second": 6847.527,
"eval_steps_per_second": 13.377,
"step": 50000
},
{
"epoch": 7.018992568125516,
"grad_norm": 14.525768280029297,
"learning_rate": 0.0003,
"loss": 40.1036328125,
"step": 51000
},
{
"epoch": 7.156619873382879,
"grad_norm": 14.632113456726074,
"learning_rate": 0.0003,
"loss": 39.9834296875,
"step": 52000
},
{
"epoch": 7.156619873382879,
"eval_accuracy": 0.7272316426626143,
"eval_loss": 9.828125,
"eval_runtime": 239.3181,
"eval_samples_per_second": 6908.667,
"eval_steps_per_second": 13.497,
"step": 52000
},
{
"epoch": 7.2942471786402425,
"grad_norm": 14.982499122619629,
"learning_rate": 0.0003,
"loss": 39.9509375,
"step": 53000
},
{
"epoch": 7.431874483897605,
"grad_norm": 16.801025390625,
"learning_rate": 0.0003,
"loss": 39.891859375,
"step": 54000
},
{
"epoch": 7.431874483897605,
"eval_accuracy": 0.7271305788939304,
"eval_loss": 9.828125,
"eval_runtime": 240.159,
"eval_samples_per_second": 6884.477,
"eval_steps_per_second": 13.449,
"step": 54000
},
{
"epoch": 7.569501789154968,
"grad_norm": 14.868009567260742,
"learning_rate": 0.0003,
"loss": 39.88668359375,
"step": 55000
},
{
"epoch": 7.707129094412331,
"grad_norm": 14.595479011535645,
"learning_rate": 0.0003,
"loss": 39.821890625,
"step": 56000
},
{
"epoch": 7.707129094412331,
"eval_accuracy": 0.7280901536840519,
"eval_loss": 9.7734375,
"eval_runtime": 238.9096,
"eval_samples_per_second": 6920.478,
"eval_steps_per_second": 13.52,
"step": 56000
},
{
"epoch": 7.844756399669695,
"grad_norm": 13.92586612701416,
"learning_rate": 0.0003,
"loss": 39.78269921875,
"step": 57000
},
{
"epoch": 7.982383704927058,
"grad_norm": 15.85058307647705,
"learning_rate": 0.0003,
"loss": 39.72277734375,
"step": 58000
},
{
"epoch": 7.982383704927058,
"eval_accuracy": 0.7287356832938983,
"eval_loss": 9.7578125,
"eval_runtime": 239.7822,
"eval_samples_per_second": 6895.295,
"eval_steps_per_second": 13.471,
"step": 58000
},
{
"epoch": 8.12001101018442,
"grad_norm": 15.202603340148926,
"learning_rate": 0.0003,
"loss": 39.6687421875,
"step": 59000
},
{
"epoch": 8.257638315441783,
"grad_norm": 14.994338989257812,
"learning_rate": 0.0003,
"loss": 39.60739453125,
"step": 60000
},
{
"epoch": 8.257638315441783,
"eval_accuracy": 0.7289930926403759,
"eval_loss": 9.7265625,
"eval_runtime": 241.1318,
"eval_samples_per_second": 6856.702,
"eval_steps_per_second": 13.395,
"step": 60000
},
{
"epoch": 8.395265620699147,
"grad_norm": 15.15245532989502,
"learning_rate": 0.0003,
"loss": 39.57180859375,
"step": 61000
},
{
"epoch": 8.53289292595651,
"grad_norm": 15.941924095153809,
"learning_rate": 0.0003,
"loss": 39.5704296875,
"step": 62000
},
{
"epoch": 8.53289292595651,
"eval_accuracy": 0.7289831970926051,
"eval_loss": 9.734375,
"eval_runtime": 241.0009,
"eval_samples_per_second": 6860.426,
"eval_steps_per_second": 13.402,
"step": 62000
},
{
"epoch": 8.670520231213873,
"grad_norm": 14.842296600341797,
"learning_rate": 0.0003,
"loss": 39.53778125,
"step": 63000
},
{
"epoch": 8.808147536471235,
"grad_norm": 17.454763412475586,
"learning_rate": 0.0003,
"loss": 39.540921875,
"step": 64000
},
{
"epoch": 8.808147536471235,
"eval_accuracy": 0.7290886771189041,
"eval_loss": 9.7109375,
"eval_runtime": 240.0306,
"eval_samples_per_second": 6888.158,
"eval_steps_per_second": 13.457,
"step": 64000
},
{
"epoch": 8.9457748417286,
"grad_norm": 13.98570442199707,
"learning_rate": 0.0003,
"loss": 39.512796875,
"step": 65000
},
{
"epoch": 9.083402146985962,
"grad_norm": 18.010318756103516,
"learning_rate": 0.0003,
"loss": 39.4786171875,
"step": 66000
},
{
"epoch": 9.083402146985962,
"eval_accuracy": 0.7294749784251455,
"eval_loss": 9.7109375,
"eval_runtime": 240.1115,
"eval_samples_per_second": 6885.838,
"eval_steps_per_second": 13.452,
"step": 66000
},
{
"epoch": 9.221029452243325,
"grad_norm": 15.137900352478027,
"learning_rate": 0.0003,
"loss": 39.4073359375,
"step": 67000
},
{
"epoch": 9.358656757500688,
"grad_norm": 18.228130340576172,
"learning_rate": 0.0003,
"loss": 39.3549765625,
"step": 68000
},
{
"epoch": 9.358656757500688,
"eval_accuracy": 0.7301181665976199,
"eval_loss": 9.671875,
"eval_runtime": 239.7862,
"eval_samples_per_second": 6895.179,
"eval_steps_per_second": 13.47,
"step": 68000
},
{
"epoch": 9.49628406275805,
"grad_norm": 16.575559616088867,
"learning_rate": 0.0003,
"loss": 39.3098828125,
"step": 69000
},
{
"epoch": 9.633911368015415,
"grad_norm": 14.635740280151367,
"learning_rate": 0.0003,
"loss": 39.35287890625,
"step": 70000
},
{
"epoch": 9.633911368015415,
"eval_accuracy": 0.7295560826467988,
"eval_loss": 9.6875,
"eval_runtime": 241.3494,
"eval_samples_per_second": 6850.521,
"eval_steps_per_second": 13.383,
"step": 70000
},
{
"epoch": 9.771538673272778,
"grad_norm": 14.436244010925293,
"learning_rate": 0.0003,
"loss": 39.29956640625,
"step": 71000
},
{
"epoch": 9.90916597853014,
"grad_norm": 14.493698120117188,
"learning_rate": 0.0003,
"loss": 39.31009375,
"step": 72000
},
{
"epoch": 9.90916597853014,
"eval_accuracy": 0.7304579509385183,
"eval_loss": 9.6484375,
"eval_runtime": 245.6684,
"eval_samples_per_second": 6730.085,
"eval_steps_per_second": 13.148,
"step": 72000
},
{
"epoch": 10.046793283787503,
"grad_norm": 15.077356338500977,
"learning_rate": 0.0003,
"loss": 39.2335546875,
"step": 73000
},
{
"epoch": 10.184420589044867,
"grad_norm": 13.661473274230957,
"learning_rate": 0.0003,
"loss": 39.09965625,
"step": 74000
},
{
"epoch": 10.184420589044867,
"eval_accuracy": 0.7312455280822778,
"eval_loss": 9.625,
"eval_runtime": 239.6349,
"eval_samples_per_second": 6899.534,
"eval_steps_per_second": 13.479,
"step": 74000
},
{
"epoch": 10.32204789430223,
"grad_norm": 15.429136276245117,
"learning_rate": 0.0003,
"loss": 39.147140625,
"step": 75000
},
{
"epoch": 10.459675199559593,
"grad_norm": 15.229757308959961,
"learning_rate": 0.0003,
"loss": 39.1339453125,
"step": 76000
},
{
"epoch": 10.459675199559593,
"eval_accuracy": 0.731277762807936,
"eval_loss": 9.609375,
"eval_runtime": 238.7439,
"eval_samples_per_second": 6925.282,
"eval_steps_per_second": 13.529,
"step": 76000
},
{
"epoch": 10.597302504816955,
"grad_norm": 14.771382331848145,
"learning_rate": 0.0003,
"loss": 39.1441796875,
"step": 77000
},
{
"epoch": 10.734929810074318,
"grad_norm": 13.703607559204102,
"learning_rate": 0.0003,
"loss": 39.141265625,
"step": 78000
},
{
"epoch": 10.734929810074318,
"eval_accuracy": 0.7310708531463442,
"eval_loss": 9.609375,
"eval_runtime": 239.4162,
"eval_samples_per_second": 6905.836,
"eval_steps_per_second": 13.491,
"step": 78000
},
{
"epoch": 10.872557115331682,
"grad_norm": 19.041141510009766,
"learning_rate": 0.0003,
"loss": 39.0934140625,
"step": 79000
},
{
"epoch": 11.010184420589045,
"grad_norm": 17.401290893554688,
"learning_rate": 0.0003,
"loss": 39.113875,
"step": 80000
},
{
"epoch": 11.010184420589045,
"eval_accuracy": 0.7312406631974454,
"eval_loss": 9.6015625,
"eval_runtime": 238.9522,
"eval_samples_per_second": 6919.246,
"eval_steps_per_second": 13.517,
"step": 80000
},
{
"epoch": 11.147811725846408,
"grad_norm": 14.292427062988281,
"learning_rate": 0.0003,
"loss": 39.012484375,
"step": 81000
},
{
"epoch": 11.28543903110377,
"grad_norm": 15.462931632995605,
"learning_rate": 0.0003,
"loss": 39.04391796875,
"step": 82000
},
{
"epoch": 11.28543903110377,
"eval_accuracy": 0.7316027472794033,
"eval_loss": 9.6015625,
"eval_runtime": 240.4477,
"eval_samples_per_second": 6876.21,
"eval_steps_per_second": 13.433,
"step": 82000
},
{
"epoch": 11.423066336361135,
"grad_norm": 17.796772003173828,
"learning_rate": 0.0003,
"loss": 38.957421875,
"step": 83000
},
{
"epoch": 11.560693641618498,
"grad_norm": 17.314067840576172,
"learning_rate": 0.0003,
"loss": 38.9495234375,
"step": 84000
},
{
"epoch": 11.560693641618498,
"eval_accuracy": 0.7321146855990825,
"eval_loss": 9.578125,
"eval_runtime": 239.2384,
"eval_samples_per_second": 6910.967,
"eval_steps_per_second": 13.501,
"step": 84000
},
{
"epoch": 11.69832094687586,
"grad_norm": 16.145645141601562,
"learning_rate": 0.0003,
"loss": 38.91906640625,
"step": 85000
},
{
"epoch": 11.835948252133223,
"grad_norm": 13.51314640045166,
"learning_rate": 0.0003,
"loss": 38.91014453125,
"step": 86000
},
{
"epoch": 11.835948252133223,
"eval_accuracy": 0.732051599943418,
"eval_loss": 9.5546875,
"eval_runtime": 240.3949,
"eval_samples_per_second": 6877.722,
"eval_steps_per_second": 13.436,
"step": 86000
},
{
"epoch": 11.973575557390586,
"grad_norm": 15.877927780151367,
"learning_rate": 0.0003,
"loss": 38.933609375,
"step": 87000
},
{
"epoch": 12.11120286264795,
"grad_norm": 15.215489387512207,
"learning_rate": 0.0003,
"loss": 38.8452265625,
"step": 88000
},
{
"epoch": 12.11120286264795,
"eval_accuracy": 0.7323534973774022,
"eval_loss": 9.546875,
"eval_runtime": 240.1188,
"eval_samples_per_second": 6885.629,
"eval_steps_per_second": 13.452,
"step": 88000
},
{
"epoch": 12.248830167905313,
"grad_norm": 15.539190292358398,
"learning_rate": 0.0003,
"loss": 38.8104296875,
"step": 89000
},
{
"epoch": 12.386457473162675,
"grad_norm": 15.577831268310547,
"learning_rate": 0.0003,
"loss": 38.80796875,
"step": 90000
},
{
"epoch": 12.386457473162675,
"eval_accuracy": 0.7324531616408847,
"eval_loss": 9.546875,
"eval_runtime": 240.2465,
"eval_samples_per_second": 6881.969,
"eval_steps_per_second": 13.445,
"step": 90000
},
{
"epoch": 12.524084778420038,
"grad_norm": 14.47063159942627,
"learning_rate": 0.0003,
"loss": 38.865859375,
"step": 91000
},
{
"epoch": 12.661712083677402,
"grad_norm": 13.968493461608887,
"learning_rate": 0.0003,
"loss": 38.81719921875,
"step": 92000
},
{
"epoch": 12.661712083677402,
"eval_accuracy": 0.7321305936040334,
"eval_loss": 9.546875,
"eval_runtime": 239.1976,
"eval_samples_per_second": 6912.148,
"eval_steps_per_second": 13.503,
"step": 92000
},
{
"epoch": 12.799339388934765,
"grad_norm": 28.390636444091797,
"learning_rate": 0.0003,
"loss": 38.815578125,
"step": 93000
},
{
"epoch": 12.936966694192128,
"grad_norm": 27.102386474609375,
"learning_rate": 0.0003,
"loss": 38.82604296875,
"step": 94000
},
{
"epoch": 12.936966694192128,
"eval_accuracy": 0.732027704335829,
"eval_loss": 9.546875,
"eval_runtime": 240.3497,
"eval_samples_per_second": 6879.014,
"eval_steps_per_second": 13.439,
"step": 94000
},
{
"epoch": 13.07459399944949,
"grad_norm": 14.193507194519043,
"learning_rate": 0.0003,
"loss": 38.72788671875,
"step": 95000
},
{
"epoch": 13.212221304706853,
"grad_norm": 18.604595184326172,
"learning_rate": 0.0003,
"loss": 38.6876171875,
"step": 96000
},
{
"epoch": 13.212221304706853,
"eval_accuracy": 0.7321750300843878,
"eval_loss": 9.546875,
"eval_runtime": 240.1101,
"eval_samples_per_second": 6885.879,
"eval_steps_per_second": 13.452,
"step": 96000
},
{
"epoch": 13.349848609964218,
"grad_norm": 16.717756271362305,
"learning_rate": 0.0003,
"loss": 38.7415390625,
"step": 97000
},
{
"epoch": 13.48747591522158,
"grad_norm": 13.74322509765625,
"learning_rate": 0.0003,
"loss": 38.704234375,
"step": 98000
},
{
"epoch": 13.48747591522158,
"eval_accuracy": 0.7335116918906991,
"eval_loss": 9.4921875,
"eval_runtime": 240.4214,
"eval_samples_per_second": 6876.962,
"eval_steps_per_second": 13.435,
"step": 98000
},
{
"epoch": 13.625103220478943,
"grad_norm": 17.836227416992188,
"learning_rate": 0.0003,
"loss": 38.6647890625,
"step": 99000
},
{
"epoch": 13.762730525736306,
"grad_norm": 20.256298065185547,
"learning_rate": 0.0003,
"loss": 38.654390625,
"step": 100000
},
{
"epoch": 13.762730525736306,
"eval_accuracy": 0.7328628073699861,
"eval_loss": 9.5078125,
"eval_runtime": 240.901,
"eval_samples_per_second": 6863.272,
"eval_steps_per_second": 13.408,
"step": 100000
},
{
"epoch": 13.762730525736306,
"eval_accuracy": 0.7328718517886109,
"eval_loss": 9.5078125,
"eval_runtime": 257.3667,
"eval_samples_per_second": 6424.176,
"eval_steps_per_second": 12.55,
"step": 100000
},
{
"epoch": 13.90035783099367,
"grad_norm": 17.175275802612305,
"learning_rate": 0.0003,
"loss": 38.7094453125,
"step": 101000
},
{
"epoch": 14.037985136251033,
"grad_norm": 30.791107177734375,
"learning_rate": 0.0003,
"loss": 38.7431796875,
"step": 102000
},
{
"epoch": 14.037985136251033,
"eval_accuracy": 0.732853775052298,
"eval_loss": 9.53125,
"eval_runtime": 244.7843,
"eval_samples_per_second": 6754.392,
"eval_steps_per_second": 13.195,
"step": 102000
},
{
"epoch": 14.175612441508395,
"grad_norm": 15.07434368133545,
"learning_rate": 0.0003,
"loss": 38.5621875,
"step": 103000
},
{
"epoch": 14.313239746765758,
"grad_norm": 16.333436965942383,
"learning_rate": 0.0003,
"loss": 38.6172734375,
"step": 104000
},
{
"epoch": 14.313239746765758,
"eval_accuracy": 0.7328456338360237,
"eval_loss": 9.515625,
"eval_runtime": 243.7048,
"eval_samples_per_second": 6784.311,
"eval_steps_per_second": 13.254,
"step": 104000
},
{
"epoch": 14.45086705202312,
"grad_norm": 14.872163772583008,
"learning_rate": 0.0003,
"loss": 38.61624609375,
"step": 105000
},
{
"epoch": 14.588494357280485,
"grad_norm": 15.491616249084473,
"learning_rate": 0.0003,
"loss": 38.5978203125,
"step": 106000
},
{
"epoch": 14.588494357280485,
"eval_accuracy": 0.7325266385860558,
"eval_loss": 9.53125,
"eval_runtime": 241.371,
"eval_samples_per_second": 6849.906,
"eval_steps_per_second": 13.382,
"step": 106000
},
{
"epoch": 14.726121662537848,
"grad_norm": 14.945006370544434,
"learning_rate": 0.0003,
"loss": 38.621796875,
"step": 107000
},
{
"epoch": 14.86374896779521,
"grad_norm": 14.714298248291016,
"learning_rate": 0.0003,
"loss": 38.5805546875,
"step": 108000
},
{
"epoch": 14.86374896779521,
"eval_accuracy": 0.7336863429887471,
"eval_loss": 9.484375,
"eval_runtime": 243.3178,
"eval_samples_per_second": 6795.1,
"eval_steps_per_second": 13.275,
"step": 108000
},
{
"epoch": 15.001376273052573,
"grad_norm": 17.513687133789062,
"learning_rate": 0.0003,
"loss": 38.5988359375,
"step": 109000
},
{
"epoch": 15.139003578309937,
"grad_norm": 14.208888053894043,
"learning_rate": 0.0003,
"loss": 38.5494453125,
"step": 110000
},
{
"epoch": 15.139003578309937,
"eval_accuracy": 0.7334265583450897,
"eval_loss": 9.4921875,
"eval_runtime": 245.3975,
"eval_samples_per_second": 6737.512,
"eval_steps_per_second": 13.162,
"step": 110000
},
{
"epoch": 15.2766308835673,
"grad_norm": 20.13620376586914,
"learning_rate": 0.0003,
"loss": 38.51769140625,
"step": 111000
},
{
"epoch": 15.414258188824663,
"grad_norm": 14.885974884033203,
"learning_rate": 0.0003,
"loss": 38.52906640625,
"step": 112000
},
{
"epoch": 15.414258188824663,
"eval_accuracy": 0.7332003955432331,
"eval_loss": 9.4921875,
"eval_runtime": 246.8519,
"eval_samples_per_second": 6697.818,
"eval_steps_per_second": 13.085,
"step": 112000
},
{
"epoch": 15.551885494082025,
"grad_norm": 14.931363105773926,
"learning_rate": 0.0003,
"loss": 38.534203125,
"step": 113000
},
{
"epoch": 15.689512799339388,
"grad_norm": 15.144700050354004,
"learning_rate": 0.0003,
"loss": 38.5433125,
"step": 114000
},
{
"epoch": 15.689512799339388,
"eval_accuracy": 0.7337025970829132,
"eval_loss": 9.4765625,
"eval_runtime": 244.85,
"eval_samples_per_second": 6752.58,
"eval_steps_per_second": 13.192,
"step": 114000
},
{
"epoch": 15.827140104596753,
"grad_norm": 17.183073043823242,
"learning_rate": 0.0003,
"loss": 38.4901015625,
"step": 115000
},
{
"epoch": 15.964767409854115,
"grad_norm": 14.985239028930664,
"learning_rate": 0.0003,
"loss": 38.51575390625,
"step": 116000
},
{
"epoch": 15.964767409854115,
"eval_accuracy": 0.7338189696183159,
"eval_loss": 9.484375,
"eval_runtime": 245.1155,
"eval_samples_per_second": 6745.266,
"eval_steps_per_second": 13.177,
"step": 116000
},
{
"epoch": 16.10239471511148,
"grad_norm": 19.971887588500977,
"learning_rate": 0.0003,
"loss": 38.4035234375,
"step": 117000
},
{
"epoch": 16.24002202036884,
"grad_norm": 17.1956844329834,
"learning_rate": 0.0003,
"loss": 38.42918359375,
"step": 118000
},
{
"epoch": 16.24002202036884,
"eval_accuracy": 0.733730730614503,
"eval_loss": 9.46875,
"eval_runtime": 243.3012,
"eval_samples_per_second": 6795.566,
"eval_steps_per_second": 13.276,
"step": 118000
},
{
"epoch": 16.377649325626205,
"grad_norm": 15.118714332580566,
"learning_rate": 0.0003,
"loss": 38.507515625,
"step": 119000
},
{
"epoch": 16.515276630883566,
"grad_norm": 14.03774642944336,
"learning_rate": 0.0003,
"loss": 38.526671875,
"step": 120000
},
{
"epoch": 16.515276630883566,
"eval_accuracy": 0.733831136300071,
"eval_loss": 9.484375,
"eval_runtime": 241.9438,
"eval_samples_per_second": 6833.691,
"eval_steps_per_second": 13.35,
"step": 120000
},
{
"epoch": 16.515276630883566,
"eval_accuracy": 0.7365382984533457,
"eval_loss": 9.328125,
"eval_runtime": 320.3822,
"eval_samples_per_second": 1416.711,
"eval_steps_per_second": 2.769,
"step": 120000
},
{
"epoch": 60.65171074069432,
"grad_norm": 8.715871810913086,
"learning_rate": 0.0003,
"loss": 34.4526328125,
"step": 121000
},
{
"epoch": 61.15290136608598,
"grad_norm": 16.51197052001953,
"learning_rate": 0.0003,
"loss": 33.80009375,
"step": 122000
},
{
"epoch": 61.15290136608598,
"eval_accuracy": 0.7592972259433672,
"eval_loss": 8.328125,
"eval_runtime": 322.1474,
"eval_samples_per_second": 1408.948,
"eval_steps_per_second": 2.753,
"step": 122000
},
{
"epoch": 61.65421732046622,
"grad_norm": 8.568217277526855,
"learning_rate": 0.0003,
"loss": 33.51157421875,
"step": 123000
},
{
"epoch": 62.155407945857874,
"grad_norm": 13.904038429260254,
"learning_rate": 0.0003,
"loss": 33.3759140625,
"step": 124000
},
{
"epoch": 62.155407945857874,
"eval_accuracy": 0.7606993464209245,
"eval_loss": 8.2578125,
"eval_runtime": 310.5126,
"eval_samples_per_second": 1461.741,
"eval_steps_per_second": 2.857,
"step": 124000
},
{
"epoch": 62.65672390023813,
"grad_norm": 9.302454948425293,
"learning_rate": 0.0003,
"loss": 33.2303125,
"step": 125000
},
{
"epoch": 63.15791452562978,
"grad_norm": 10.245097160339355,
"learning_rate": 0.0003,
"loss": 33.114984375,
"step": 126000
},
{
"epoch": 63.15791452562978,
"eval_accuracy": 0.7620252803249203,
"eval_loss": 8.1953125,
"eval_runtime": 311.281,
"eval_samples_per_second": 1458.133,
"eval_steps_per_second": 2.85,
"step": 126000
},
{
"epoch": 63.659230480010024,
"grad_norm": 9.459521293640137,
"learning_rate": 0.0003,
"loss": 33.0674765625,
"step": 127000
},
{
"epoch": 64.16042110540168,
"grad_norm": 12.050172805786133,
"learning_rate": 0.0003,
"loss": 33.0123046875,
"step": 128000
},
{
"epoch": 64.16042110540168,
"eval_accuracy": 0.7628614283635131,
"eval_loss": 8.15625,
"eval_runtime": 309.2479,
"eval_samples_per_second": 1467.719,
"eval_steps_per_second": 2.868,
"step": 128000
},
{
"epoch": 64.66173705978193,
"grad_norm": 8.326544761657715,
"learning_rate": 0.0003,
"loss": 32.89726171875,
"step": 129000
},
{
"epoch": 65.16292768517359,
"grad_norm": 9.267374038696289,
"learning_rate": 0.0003,
"loss": 32.78715625,
"step": 130000
},
{
"epoch": 65.16292768517359,
"eval_accuracy": 0.7632605632607093,
"eval_loss": 8.1484375,
"eval_runtime": 313.0209,
"eval_samples_per_second": 1450.028,
"eval_steps_per_second": 2.834,
"step": 130000
},
{
"epoch": 65.66424363955383,
"grad_norm": 9.583052635192871,
"learning_rate": 0.0003,
"loss": 32.747501953125,
"step": 131000
},
{
"epoch": 66.16543426494549,
"grad_norm": 8.761311531066895,
"learning_rate": 0.0003,
"loss": 32.67369140625,
"step": 132000
},
{
"epoch": 66.16543426494549,
"eval_accuracy": 0.7639422135833412,
"eval_loss": 8.1015625,
"eval_runtime": 311.6656,
"eval_samples_per_second": 1456.333,
"eval_steps_per_second": 2.846,
"step": 132000
},
{
"epoch": 66.66675021932573,
"grad_norm": 8.83479118347168,
"learning_rate": 0.0003,
"loss": 32.617767578125,
"step": 133000
},
{
"epoch": 67.16794084471738,
"grad_norm": 8.598926544189453,
"learning_rate": 0.0003,
"loss": 32.5695625,
"step": 134000
},
{
"epoch": 67.16794084471738,
"eval_accuracy": 0.7644549296283725,
"eval_loss": 8.078125,
"eval_runtime": 308.4248,
"eval_samples_per_second": 1471.636,
"eval_steps_per_second": 2.876,
"step": 134000
},
{
"epoch": 67.66925679909762,
"grad_norm": 10.846793174743652,
"learning_rate": 0.0003,
"loss": 32.53196484375,
"step": 135000
},
{
"epoch": 68.17044742448928,
"grad_norm": 23.080833435058594,
"learning_rate": 0.0003,
"loss": 32.47344140625,
"step": 136000
},
{
"epoch": 68.17044742448928,
"eval_accuracy": 0.7638109525627774,
"eval_loss": 8.109375,
"eval_runtime": 312.4841,
"eval_samples_per_second": 1452.519,
"eval_steps_per_second": 2.839,
"step": 136000
},
{
"epoch": 68.67176337886953,
"grad_norm": 11.440296173095703,
"learning_rate": 0.0003,
"loss": 32.4546796875,
"step": 137000
},
{
"epoch": 69.17295400426119,
"grad_norm": 9.561952590942383,
"learning_rate": 0.0003,
"loss": 32.3915703125,
"step": 138000
},
{
"epoch": 69.17295400426119,
"eval_accuracy": 0.7654183586207545,
"eval_loss": 8.0234375,
"eval_runtime": 311.376,
"eval_samples_per_second": 1457.688,
"eval_steps_per_second": 2.849,
"step": 138000
},
{
"epoch": 69.67426995864143,
"grad_norm": 10.652801513671875,
"learning_rate": 0.0003,
"loss": 32.3813203125,
"step": 139000
},
{
"epoch": 70.17546058403309,
"grad_norm": 9.549755096435547,
"learning_rate": 0.0003,
"loss": 32.329857421875,
"step": 140000
},
{
"epoch": 70.17546058403309,
"eval_accuracy": 0.765731013146163,
"eval_loss": 8.015625,
"eval_runtime": 311.5237,
"eval_samples_per_second": 1456.997,
"eval_steps_per_second": 2.847,
"step": 140000
},
{
"epoch": 70.17546058403309,
"eval_accuracy": 0.7655576478890911,
"eval_loss": 8.03125,
"eval_runtime": 312.782,
"eval_samples_per_second": 1451.135,
"eval_steps_per_second": 2.836,
"step": 140000
},
{
"epoch": 70.67677653841334,
"grad_norm": 8.273364067077637,
"learning_rate": 0.0003,
"loss": 32.32880859375,
"step": 141000
},
{
"epoch": 71.177967163805,
"grad_norm": 11.310037612915039,
"learning_rate": 0.0003,
"loss": 32.2803671875,
"step": 142000
},
{
"epoch": 71.177967163805,
"eval_accuracy": 0.7654140428452689,
"eval_loss": 8.0234375,
"eval_runtime": 302.7715,
"eval_samples_per_second": 1499.114,
"eval_steps_per_second": 2.93,
"step": 142000
},
{
"epoch": 71.67928311818524,
"grad_norm": 9.46422004699707,
"learning_rate": 0.0003,
"loss": 32.241615234375,
"step": 143000
},
{
"epoch": 72.18047374357688,
"grad_norm": 9.287914276123047,
"learning_rate": 0.0003,
"loss": 32.22880078125,
"step": 144000
},
{
"epoch": 72.18047374357688,
"eval_accuracy": 0.7658155554395308,
"eval_loss": 8.015625,
"eval_runtime": 300.7976,
"eval_samples_per_second": 1508.951,
"eval_steps_per_second": 2.949,
"step": 144000
},
{
"epoch": 72.68178969795714,
"grad_norm": 9.183584213256836,
"learning_rate": 0.0003,
"loss": 32.233244140625,
"step": 145000
},
{
"epoch": 73.18298032334879,
"grad_norm": 9.008417129516602,
"learning_rate": 0.0003,
"loss": 32.181228515625,
"step": 146000
},
{
"epoch": 73.18298032334879,
"eval_accuracy": 0.76619202647217,
"eval_loss": 7.98828125,
"eval_runtime": 302.0251,
"eval_samples_per_second": 1502.819,
"eval_steps_per_second": 2.937,
"step": 146000
},
{
"epoch": 73.68429627772903,
"grad_norm": 8.19743537902832,
"learning_rate": 0.0003,
"loss": 32.162357421875,
"step": 147000
},
{
"epoch": 74.18548690312069,
"grad_norm": 8.455910682678223,
"learning_rate": 0.0003,
"loss": 32.091048828125,
"step": 148000
},
{
"epoch": 74.18548690312069,
"eval_accuracy": 0.7663843416476586,
"eval_loss": 7.97265625,
"eval_runtime": 301.7215,
"eval_samples_per_second": 1504.331,
"eval_steps_per_second": 2.94,
"step": 148000
},
{
"epoch": 74.68680285750094,
"grad_norm": 8.09157943725586,
"learning_rate": 0.0003,
"loss": 32.071322265625,
"step": 149000
},
{
"epoch": 75.1879934828926,
"grad_norm": 12.704072952270508,
"learning_rate": 0.0003,
"loss": 32.044611328125,
"step": 150000
},
{
"epoch": 75.1879934828926,
"eval_accuracy": 0.7670952482486783,
"eval_loss": 7.96484375,
"eval_runtime": 301.7456,
"eval_samples_per_second": 1504.211,
"eval_steps_per_second": 2.94,
"step": 150000
},
{
"epoch": 75.1879934828926,
"step": 150000,
"total_flos": 3.23779983669461e+19,
"train_loss": 2.1457560286458333,
"train_runtime": 27498.2172,
"train_samples_per_second": 11171.633,
"train_steps_per_second": 5.455
}
],
"logging_steps": 1000,
"max_steps": 150000,
"num_input_tokens_seen": 0,
"num_train_epochs": 76,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.23779983669461e+19,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}