{
"best_global_step": 1650,
"best_metric": 0.006614842917770147,
"best_model_checkpoint": "/content/NH-SQL-finetuned/checkpoint-1650",
"epoch": 50.0,
"eval_steps": 500,
"global_step": 1650,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.030303030303030304,
"grad_norm": 6.595283031463623,
"learning_rate": 0.0,
"loss": 1.9666,
"step": 1
},
{
"epoch": 0.6060606060606061,
"grad_norm": 6.3463263511657715,
"learning_rate": 5.757575757575758e-07,
"loss": 1.8746,
"step": 20
},
{
"epoch": 1.0,
"eval_loss": 1.7834347486495972,
"eval_runtime": 0.9389,
"eval_samples_per_second": 140.592,
"eval_steps_per_second": 18.107,
"step": 33
},
{
"epoch": 1.2121212121212122,
"grad_norm": 4.871561050415039,
"learning_rate": 1.181818181818182e-06,
"loss": 1.7412,
"step": 40
},
{
"epoch": 1.8181818181818183,
"grad_norm": 3.2557735443115234,
"learning_rate": 1.787878787878788e-06,
"loss": 1.5522,
"step": 60
},
{
"epoch": 2.0,
"eval_loss": 1.2882661819458008,
"eval_runtime": 0.9475,
"eval_samples_per_second": 139.315,
"eval_steps_per_second": 17.942,
"step": 66
},
{
"epoch": 2.4242424242424243,
"grad_norm": 1.9632468223571777,
"learning_rate": 2.393939393939394e-06,
"loss": 1.2669,
"step": 80
},
{
"epoch": 3.0,
"eval_loss": 0.8929286003112793,
"eval_runtime": 0.9528,
"eval_samples_per_second": 138.545,
"eval_steps_per_second": 17.843,
"step": 99
},
{
"epoch": 3.0303030303030303,
"grad_norm": 1.6738139390945435,
"learning_rate": 3e-06,
"loss": 1.0527,
"step": 100
},
{
"epoch": 3.6363636363636362,
"grad_norm": 1.4490736722946167,
"learning_rate": 3.606060606060606e-06,
"loss": 0.8491,
"step": 120
},
{
"epoch": 4.0,
"eval_loss": 0.5932092070579529,
"eval_runtime": 0.9509,
"eval_samples_per_second": 138.815,
"eval_steps_per_second": 17.878,
"step": 132
},
{
"epoch": 4.242424242424242,
"grad_norm": 1.0791282653808594,
"learning_rate": 4.212121212121212e-06,
"loss": 0.6586,
"step": 140
},
{
"epoch": 4.848484848484849,
"grad_norm": 0.8789141178131104,
"learning_rate": 4.818181818181819e-06,
"loss": 0.4801,
"step": 160
},
{
"epoch": 5.0,
"eval_loss": 0.42336800694465637,
"eval_runtime": 0.9459,
"eval_samples_per_second": 139.555,
"eval_steps_per_second": 17.973,
"step": 165
},
{
"epoch": 5.454545454545454,
"grad_norm": 0.8390600085258484,
"learning_rate": 4.9989035693310165e-06,
"loss": 0.4134,
"step": 180
},
{
"epoch": 6.0,
"eval_loss": 0.3166239559650421,
"eval_runtime": 0.9402,
"eval_samples_per_second": 140.394,
"eval_steps_per_second": 18.081,
"step": 198
},
{
"epoch": 6.0606060606060606,
"grad_norm": 0.908724308013916,
"learning_rate": 4.993535611735464e-06,
"loss": 0.33,
"step": 200
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.8689959645271301,
"learning_rate": 4.983704338371375e-06,
"loss": 0.2941,
"step": 220
},
{
"epoch": 7.0,
"eval_loss": 0.246794193983078,
"eval_runtime": 0.9539,
"eval_samples_per_second": 138.381,
"eval_steps_per_second": 17.822,
"step": 231
},
{
"epoch": 7.2727272727272725,
"grad_norm": 0.806349515914917,
"learning_rate": 4.969427346772643e-06,
"loss": 0.2513,
"step": 240
},
{
"epoch": 7.878787878787879,
"grad_norm": 0.8216381072998047,
"learning_rate": 4.950730192107368e-06,
"loss": 0.2244,
"step": 260
},
{
"epoch": 8.0,
"eval_loss": 0.19389495253562927,
"eval_runtime": 0.9398,
"eval_samples_per_second": 140.46,
"eval_steps_per_second": 18.09,
"step": 264
},
{
"epoch": 8.484848484848484,
"grad_norm": 0.8309475183486938,
"learning_rate": 4.927646341435276e-06,
"loss": 0.1756,
"step": 280
},
{
"epoch": 9.0,
"eval_loss": 0.1608426719903946,
"eval_runtime": 0.9567,
"eval_samples_per_second": 137.969,
"eval_steps_per_second": 17.769,
"step": 297
},
{
"epoch": 9.090909090909092,
"grad_norm": 0.807766854763031,
"learning_rate": 4.900217113803193e-06,
"loss": 0.1666,
"step": 300
},
{
"epoch": 9.696969696969697,
"grad_norm": 0.638713538646698,
"learning_rate": 4.868491606285823e-06,
"loss": 0.1756,
"step": 320
},
{
"epoch": 10.0,
"eval_loss": 0.13285745680332184,
"eval_runtime": 0.9431,
"eval_samples_per_second": 139.968,
"eval_steps_per_second": 18.026,
"step": 330
},
{
"epoch": 10.303030303030303,
"grad_norm": 0.6577040553092957,
"learning_rate": 4.832526606104213e-06,
"loss": 0.1407,
"step": 340
},
{
"epoch": 10.909090909090908,
"grad_norm": 0.7107315063476562,
"learning_rate": 4.792386488979193e-06,
"loss": 0.1218,
"step": 360
},
{
"epoch": 11.0,
"eval_loss": 0.1115950271487236,
"eval_runtime": 0.9468,
"eval_samples_per_second": 139.417,
"eval_steps_per_second": 17.955,
"step": 363
},
{
"epoch": 11.515151515151516,
"grad_norm": 0.7401019334793091,
"learning_rate": 4.74814310390176e-06,
"loss": 0.1362,
"step": 380
},
{
"epoch": 12.0,
"eval_loss": 0.09207186847925186,
"eval_runtime": 0.9332,
"eval_samples_per_second": 141.446,
"eval_steps_per_second": 18.216,
"step": 396
},
{
"epoch": 12.121212121212121,
"grad_norm": 0.68468177318573,
"learning_rate": 4.699875644526633e-06,
"loss": 0.0987,
"step": 400
},
{
"epoch": 12.727272727272727,
"grad_norm": 0.7722787857055664,
"learning_rate": 4.647670507419206e-06,
"loss": 0.0989,
"step": 420
},
{
"epoch": 13.0,
"eval_loss": 0.0792744979262352,
"eval_runtime": 0.935,
"eval_samples_per_second": 141.172,
"eval_steps_per_second": 18.181,
"step": 429
},
{
"epoch": 13.333333333333334,
"grad_norm": 0.48431074619293213,
"learning_rate": 4.591621137409602e-06,
"loss": 0.0936,
"step": 440
},
{
"epoch": 13.93939393939394,
"grad_norm": 0.555296778678894,
"learning_rate": 4.53182786033067e-06,
"loss": 0.0945,
"step": 460
},
{
"epoch": 14.0,
"eval_loss": 0.07043830305337906,
"eval_runtime": 0.9312,
"eval_samples_per_second": 141.753,
"eval_steps_per_second": 18.256,
"step": 462
},
{
"epoch": 14.545454545454545,
"grad_norm": 0.5280194878578186,
"learning_rate": 4.468397703439282e-06,
"loss": 0.0811,
"step": 480
},
{
"epoch": 15.0,
"eval_loss": 0.06017066538333893,
"eval_runtime": 0.9305,
"eval_samples_per_second": 141.866,
"eval_steps_per_second": 18.271,
"step": 495
},
{
"epoch": 15.151515151515152,
"grad_norm": 0.5912495851516724,
"learning_rate": 4.401444203842396e-06,
"loss": 0.0742,
"step": 500
},
{
"epoch": 15.757575757575758,
"grad_norm": 1.0043439865112305,
"learning_rate": 4.331087205270778e-06,
"loss": 0.0778,
"step": 520
},
{
"epoch": 16.0,
"eval_loss": 0.052856337279081345,
"eval_runtime": 0.9363,
"eval_samples_per_second": 140.988,
"eval_steps_per_second": 18.158,
"step": 528
},
{
"epoch": 16.363636363636363,
"grad_norm": 0.37315091490745544,
"learning_rate": 4.257452643564155e-06,
"loss": 0.0746,
"step": 540
},
{
"epoch": 16.96969696969697,
"grad_norm": 0.5725280046463013,
"learning_rate": 4.180672321251766e-06,
"loss": 0.0651,
"step": 560
},
{
"epoch": 17.0,
"eval_loss": 0.04653371125459671,
"eval_runtime": 0.9263,
"eval_samples_per_second": 142.504,
"eval_steps_per_second": 18.353,
"step": 561
},
{
"epoch": 17.575757575757574,
"grad_norm": 0.6575049161911011,
"learning_rate": 4.100883671631806e-06,
"loss": 0.0529,
"step": 580
},
{
"epoch": 18.0,
"eval_loss": 0.04210372641682625,
"eval_runtime": 0.9448,
"eval_samples_per_second": 139.719,
"eval_steps_per_second": 17.994,
"step": 594
},
{
"epoch": 18.181818181818183,
"grad_norm": 0.7622207999229431,
"learning_rate": 4.018229512772053e-06,
"loss": 0.0644,
"step": 600
},
{
"epoch": 18.78787878787879,
"grad_norm": 0.5367633700370789,
"learning_rate": 3.9328577918719916e-06,
"loss": 0.0551,
"step": 620
},
{
"epoch": 19.0,
"eval_loss": 0.03950377553701401,
"eval_runtime": 0.9419,
"eval_samples_per_second": 140.144,
"eval_steps_per_second": 18.049,
"step": 627
},
{
"epoch": 19.393939393939394,
"grad_norm": 0.581658661365509,
"learning_rate": 3.844921320444031e-06,
"loss": 0.0566,
"step": 640
},
{
"epoch": 20.0,
"grad_norm": 0.7774003148078918,
"learning_rate": 3.754577500787828e-06,
"loss": 0.0532,
"step": 660
},
{
"epoch": 20.0,
"eval_loss": 0.0353802889585495,
"eval_runtime": 0.9534,
"eval_samples_per_second": 138.453,
"eval_steps_per_second": 17.831,
"step": 660
},
{
"epoch": 20.606060606060606,
"grad_norm": 0.6069843769073486,
"learning_rate": 3.66198804424729e-06,
"loss": 0.0436,
"step": 680
},
{
"epoch": 21.0,
"eval_loss": 0.031004376709461212,
"eval_runtime": 0.9391,
"eval_samples_per_second": 140.556,
"eval_steps_per_second": 18.102,
"step": 693
},
{
"epoch": 21.21212121212121,
"grad_norm": 0.5869702696800232,
"learning_rate": 3.5673186817546047e-06,
"loss": 0.0487,
"step": 700
},
{
"epoch": 21.818181818181817,
"grad_norm": 0.39276406168937683,
"learning_rate": 3.4707388671793814e-06,
"loss": 0.0459,
"step": 720
},
{
"epoch": 22.0,
"eval_loss": 0.027377676218748093,
"eval_runtime": 0.9405,
"eval_samples_per_second": 140.352,
"eval_steps_per_second": 18.076,
"step": 726
},
{
"epoch": 22.424242424242426,
"grad_norm": 0.5247741937637329,
"learning_rate": 3.3724214740138933e-06,
"loss": 0.0461,
"step": 740
},
{
"epoch": 23.0,
"eval_loss": 0.02525358274579048,
"eval_runtime": 0.9593,
"eval_samples_per_second": 137.596,
"eval_steps_per_second": 17.721,
"step": 759
},
{
"epoch": 23.03030303030303,
"grad_norm": 0.4420148730278015,
"learning_rate": 3.272542485937369e-06,
"loss": 0.0424,
"step": 760
},
{
"epoch": 23.636363636363637,
"grad_norm": 0.4988000690937042,
"learning_rate": 3.171280681813174e-06,
"loss": 0.0443,
"step": 780
},
{
"epoch": 24.0,
"eval_loss": 0.02300359681248665,
"eval_runtime": 0.9498,
"eval_samples_per_second": 138.973,
"eval_steps_per_second": 17.898,
"step": 792
},
{
"epoch": 24.242424242424242,
"grad_norm": 0.6696539521217346,
"learning_rate": 3.0688173156827454e-06,
"loss": 0.0346,
"step": 800
},
{
"epoch": 24.848484848484848,
"grad_norm": 0.6659572720527649,
"learning_rate": 2.9653357923290753e-06,
"loss": 0.0394,
"step": 820
},
{
"epoch": 25.0,
"eval_loss": 0.02064535580575466,
"eval_runtime": 0.9419,
"eval_samples_per_second": 140.14,
"eval_steps_per_second": 18.048,
"step": 825
},
{
"epoch": 25.454545454545453,
"grad_norm": 0.477318674325943,
"learning_rate": 2.86102133899045e-06,
"loss": 0.0354,
"step": 840
},
{
"epoch": 26.0,
"eval_loss": 0.017997996881604195,
"eval_runtime": 0.9337,
"eval_samples_per_second": 141.375,
"eval_steps_per_second": 18.207,
"step": 858
},
{
"epoch": 26.060606060606062,
"grad_norm": 0.4188827574253082,
"learning_rate": 2.7560606738120947e-06,
"loss": 0.0379,
"step": 860
},
{
"epoch": 26.666666666666668,
"grad_norm": 0.37732234597206116,
"learning_rate": 2.6506416716291466e-06,
"loss": 0.0369,
"step": 880
},
{
"epoch": 27.0,
"eval_loss": 0.01666710339486599,
"eval_runtime": 0.9423,
"eval_samples_per_second": 140.084,
"eval_steps_per_second": 18.041,
"step": 891
},
{
"epoch": 27.272727272727273,
"grad_norm": 0.5058871507644653,
"learning_rate": 2.544953027679216e-06,
"loss": 0.0327,
"step": 900
},
{
"epoch": 27.87878787878788,
"grad_norm": 0.5595805644989014,
"learning_rate": 2.4391839198464613e-06,
"loss": 0.0338,
"step": 920
},
{
"epoch": 28.0,
"eval_loss": 0.015013493597507477,
"eval_runtime": 0.93,
"eval_samples_per_second": 141.934,
"eval_steps_per_second": 18.279,
"step": 924
},
{
"epoch": 28.484848484848484,
"grad_norm": 0.4609270393848419,
"learning_rate": 2.3335236700417404e-06,
"loss": 0.0306,
"step": 940
},
{
"epoch": 29.0,
"eval_loss": 0.014108900912106037,
"eval_runtime": 0.9248,
"eval_samples_per_second": 142.737,
"eval_steps_per_second": 18.383,
"step": 957
},
{
"epoch": 29.09090909090909,
"grad_norm": 0.3746008276939392,
"learning_rate": 2.2281614053249796e-06,
"loss": 0.0307,
"step": 960
},
{
"epoch": 29.696969696969695,
"grad_norm": 0.5330935716629028,
"learning_rate": 2.1232857193762923e-06,
"loss": 0.0298,
"step": 980
},
{
"epoch": 30.0,
"eval_loss": 0.013387720100581646,
"eval_runtime": 0.9587,
"eval_samples_per_second": 137.688,
"eval_steps_per_second": 17.733,
"step": 990
},
{
"epoch": 30.303030303030305,
"grad_norm": 0.31854015588760376,
"learning_rate": 2.019084334921849e-06,
"loss": 0.028,
"step": 1000
},
{
"epoch": 30.90909090909091,
"grad_norm": 0.38515424728393555,
"learning_rate": 1.9157437677186903e-06,
"loss": 0.031,
"step": 1020
},
{
"epoch": 31.0,
"eval_loss": 0.012279902584850788,
"eval_runtime": 0.9342,
"eval_samples_per_second": 141.305,
"eval_steps_per_second": 18.198,
"step": 1023
},
{
"epoch": 31.515151515151516,
"grad_norm": 0.45346567034721375,
"learning_rate": 1.8134489926999837e-06,
"loss": 0.033,
"step": 1040
},
{
"epoch": 32.0,
"eval_loss": 0.011261457577347755,
"eval_runtime": 0.9421,
"eval_samples_per_second": 140.119,
"eval_steps_per_second": 18.046,
"step": 1056
},
{
"epoch": 32.121212121212125,
"grad_norm": 0.44893690943717957,
"learning_rate": 1.7123831128782686e-06,
"loss": 0.0246,
"step": 1060
},
{
"epoch": 32.72727272727273,
"grad_norm": 0.4021283984184265,
"learning_rate": 1.612727031599356e-06,
"loss": 0.03,
"step": 1080
},
{
"epoch": 33.0,
"eval_loss": 0.010378457605838776,
"eval_runtime": 0.9543,
"eval_samples_per_second": 138.318,
"eval_steps_per_second": 17.814,
"step": 1089
},
{
"epoch": 33.333333333333336,
"grad_norm": 0.6586357951164246,
"learning_rate": 1.5146591287335452e-06,
"loss": 0.0266,
"step": 1100
},
{
"epoch": 33.93939393939394,
"grad_norm": 0.4133249521255493,
"learning_rate": 1.4183549413837288e-06,
"loss": 0.026,
"step": 1120
},
{
"epoch": 34.0,
"eval_loss": 0.009874224662780762,
"eval_runtime": 0.9426,
"eval_samples_per_second": 140.041,
"eval_steps_per_second": 18.036,
"step": 1122
},
{
"epoch": 34.54545454545455,
"grad_norm": 0.4618055522441864,
"learning_rate": 1.3239868496819407e-06,
"loss": 0.0278,
"step": 1140
},
{
"epoch": 35.0,
"eval_loss": 0.009275372140109539,
"eval_runtime": 0.9399,
"eval_samples_per_second": 140.441,
"eval_steps_per_second": 18.087,
"step": 1155
},
{
"epoch": 35.15151515151515,
"grad_norm": 0.3481239676475525,
"learning_rate": 1.2317237682367178e-06,
"loss": 0.0253,
"step": 1160
},
{
"epoch": 35.75757575757576,
"grad_norm": 0.42644399404525757,
"learning_rate": 1.1417308437836181e-06,
"loss": 0.0269,
"step": 1180
},
{
"epoch": 36.0,
"eval_loss": 0.008818729780614376,
"eval_runtime": 0.9417,
"eval_samples_per_second": 140.172,
"eval_steps_per_second": 18.052,
"step": 1188
},
{
"epoch": 36.36363636363637,
"grad_norm": 0.5186192393302917,
"learning_rate": 1.0541691595800338e-06,
"loss": 0.0231,
"step": 1200
},
{
"epoch": 36.96969696969697,
"grad_norm": 0.3892291784286499,
"learning_rate": 9.691954470734692e-07,
"loss": 0.0273,
"step": 1220
},
{
"epoch": 37.0,
"eval_loss": 0.008245617151260376,
"eval_runtime": 0.9541,
"eval_samples_per_second": 138.353,
"eval_steps_per_second": 17.818,
"step": 1221
},
{
"epoch": 37.57575757575758,
"grad_norm": 0.5318649411201477,
"learning_rate": 8.869618053593429e-07,
"loss": 0.0251,
"step": 1240
},
{
"epoch": 38.0,
"eval_loss": 0.007936290465295315,
"eval_runtime": 0.9281,
"eval_samples_per_second": 142.225,
"eval_steps_per_second": 18.317,
"step": 1254
},
{
"epoch": 38.18181818181818,
"grad_norm": 0.37824392318725586,
"learning_rate": 8.076154289305019e-07,
"loss": 0.0258,
"step": 1260
},
{
"epoch": 38.78787878787879,
"grad_norm": 0.402786523103714,
"learning_rate": 7.312983442057497e-07,
"loss": 0.0263,
"step": 1280
},
{
"epoch": 39.0,
"eval_loss": 0.007719958666712046,
"eval_runtime": 0.9423,
"eval_samples_per_second": 140.079,
"eval_steps_per_second": 18.041,
"step": 1287
},
{
"epoch": 39.39393939393939,
"grad_norm": 0.41106194257736206,
"learning_rate": 6.581471553089874e-07,
"loss": 0.0243,
"step": 1300
},
{
"epoch": 40.0,
"grad_norm": 0.5733346939086914,
"learning_rate": 5.882927995540266e-07,
"loss": 0.0247,
"step": 1320
},
{
"epoch": 40.0,
"eval_loss": 0.00738176517188549,
"eval_runtime": 0.9247,
"eval_samples_per_second": 142.756,
"eval_steps_per_second": 18.385,
"step": 1320
},
{
"epoch": 40.60606060606061,
"grad_norm": 0.3024619221687317,
"learning_rate": 5.218603130727243e-07,
"loss": 0.0243,
"step": 1340
},
{
"epoch": 41.0,
"eval_loss": 0.007284797262400389,
"eval_runtime": 0.9333,
"eval_samples_per_second": 141.426,
"eval_steps_per_second": 18.214,
"step": 1353
},
{
"epoch": 41.21212121212121,
"grad_norm": 0.48434221744537354,
"learning_rate": 4.589686070059762e-07,
"loss": 0.0245,
"step": 1360
},
{
"epoch": 41.81818181818182,
"grad_norm": 0.4191039204597473,
"learning_rate": 3.997302546581597e-07,
"loss": 0.0259,
"step": 1380
},
{
"epoch": 42.0,
"eval_loss": 0.007058488205075264,
"eval_runtime": 0.9549,
"eval_samples_per_second": 138.232,
"eval_steps_per_second": 17.803,
"step": 1386
},
{
"epoch": 42.42424242424242,
"grad_norm": 0.3011131286621094,
"learning_rate": 3.4425128999602265e-07,
"loss": 0.0234,
"step": 1400
},
{
"epoch": 43.0,
"eval_loss": 0.006919534411281347,
"eval_runtime": 0.9466,
"eval_samples_per_second": 139.447,
"eval_steps_per_second": 17.959,
"step": 1419
},
{
"epoch": 43.03030303030303,
"grad_norm": 0.49921727180480957,
"learning_rate": 2.9263101785268253e-07,
"loss": 0.0268,
"step": 1420
},
{
"epoch": 43.63636363636363,
"grad_norm": 0.41327381134033203,
"learning_rate": 2.449618361764788e-07,
"loss": 0.0232,
"step": 1440
},
{
"epoch": 44.0,
"eval_loss": 0.0067825643345713615,
"eval_runtime": 0.9457,
"eval_samples_per_second": 139.574,
"eval_steps_per_second": 17.975,
"step": 1452
},
{
"epoch": 44.24242424242424,
"grad_norm": 0.42085084319114685,
"learning_rate": 2.0132907064282837e-07,
"loss": 0.0222,
"step": 1460
},
{
"epoch": 44.84848484848485,
"grad_norm": 0.4812968075275421,
"learning_rate": 1.6181082192513352e-07,
"loss": 0.0245,
"step": 1480
},
{
"epoch": 45.0,
"eval_loss": 0.006708750035613775,
"eval_runtime": 0.944,
"eval_samples_per_second": 139.826,
"eval_steps_per_second": 18.008,
"step": 1485
},
{
"epoch": 45.45454545454545,
"grad_norm": 0.41410696506500244,
"learning_rate": 1.264778258981178e-07,
"loss": 0.0234,
"step": 1500
},
{
"epoch": 46.0,
"eval_loss": 0.006636774633079767,
"eval_runtime": 0.9246,
"eval_samples_per_second": 142.77,
"eval_steps_per_second": 18.387,
"step": 1518
},
{
"epoch": 46.06060606060606,
"grad_norm": 0.36860212683677673,
"learning_rate": 9.539332702381026e-08,
"loss": 0.0264,
"step": 1520
},
{
"epoch": 46.666666666666664,
"grad_norm": 0.3396029770374298,
"learning_rate": 6.86129651468273e-08,
"loss": 0.0229,
"step": 1540
},
{
"epoch": 47.0,
"eval_loss": 0.00661947438493371,
"eval_runtime": 0.9503,
"eval_samples_per_second": 138.905,
"eval_steps_per_second": 17.889,
"step": 1551
},
{
"epoch": 47.27272727272727,
"grad_norm": 0.4342035949230194,
"learning_rate": 4.618467590157133e-08,
"loss": 0.0233,
"step": 1560
},
{
"epoch": 47.878787878787875,
"grad_norm": 0.4722955822944641,
"learning_rate": 2.814860490961607e-08,
"loss": 0.0248,
"step": 1580
},
{
"epoch": 48.0,
"eval_loss": 0.006618270184844732,
"eval_runtime": 0.9496,
"eval_samples_per_second": 139.011,
"eval_steps_per_second": 17.903,
"step": 1584
},
{
"epoch": 48.484848484848484,
"grad_norm": 0.22162474691867828,
"learning_rate": 1.453703592086353e-08,
"loss": 0.0239,
"step": 1600
},
{
"epoch": 49.0,
"eval_loss": 0.006619932595640421,
"eval_runtime": 0.947,
"eval_samples_per_second": 139.389,
"eval_steps_per_second": 17.952,
"step": 1617
},
{
"epoch": 49.09090909090909,
"grad_norm": 0.4756720960140228,
"learning_rate": 5.374333027093892e-09,
"loss": 0.0236,
"step": 1620
},
{
"epoch": 49.696969696969695,
"grad_norm": 0.42004162073135376,
"learning_rate": 6.768970513457151e-10,
"loss": 0.0228,
"step": 1640
},
{
"epoch": 50.0,
"eval_loss": 0.006614842917770147,
"eval_runtime": 0.9549,
"eval_samples_per_second": 138.227,
"eval_steps_per_second": 17.802,
"step": 1650
}
],
"logging_steps": 20,
"max_steps": 1650,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.91340755288064e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}