{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 100,
"global_step": 1386,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.021645021645021644,
"grad_norm": 10.67443561553955,
"learning_rate": 4.5e-06,
"loss": 3.1621,
"step": 10
},
{
"epoch": 0.04329004329004329,
"grad_norm": 5.979658126831055,
"learning_rate": 9.5e-06,
"loss": 2.9832,
"step": 20
},
{
"epoch": 0.06493506493506493,
"grad_norm": 6.758372783660889,
"learning_rate": 1.45e-05,
"loss": 2.6111,
"step": 30
},
{
"epoch": 0.08658008658008658,
"grad_norm": 6.213019371032715,
"learning_rate": 1.9500000000000003e-05,
"loss": 2.0887,
"step": 40
},
{
"epoch": 0.10822510822510822,
"grad_norm": 4.623664855957031,
"learning_rate": 2.45e-05,
"loss": 1.6826,
"step": 50
},
{
"epoch": 0.12987012987012986,
"grad_norm": 3.8101072311401367,
"learning_rate": 2.95e-05,
"loss": 1.1627,
"step": 60
},
{
"epoch": 0.15151515151515152,
"grad_norm": 3.790235996246338,
"learning_rate": 3.45e-05,
"loss": 0.9186,
"step": 70
},
{
"epoch": 0.17316017316017315,
"grad_norm": 3.3711209297180176,
"learning_rate": 3.9500000000000005e-05,
"loss": 0.7483,
"step": 80
},
{
"epoch": 0.19480519480519481,
"grad_norm": 2.9479563236236572,
"learning_rate": 4.4500000000000004e-05,
"loss": 0.64,
"step": 90
},
{
"epoch": 0.21645021645021645,
"grad_norm": 2.083592414855957,
"learning_rate": 4.9500000000000004e-05,
"loss": 0.5307,
"step": 100
},
{
"epoch": 0.21645021645021645,
"eval_loss": 0.18818749487400055,
"eval_runtime": 26.3426,
"eval_samples_per_second": 18.981,
"eval_steps_per_second": 0.607,
"step": 100
},
{
"epoch": 0.23809523809523808,
"grad_norm": 2.0266427993774414,
"learning_rate": 4.9650077760497674e-05,
"loss": 0.4357,
"step": 110
},
{
"epoch": 0.2597402597402597,
"grad_norm": 2.4006917476654053,
"learning_rate": 4.926127527216174e-05,
"loss": 0.3998,
"step": 120
},
{
"epoch": 0.2813852813852814,
"grad_norm": 1.9328948259353638,
"learning_rate": 4.887247278382582e-05,
"loss": 0.3816,
"step": 130
},
{
"epoch": 0.30303030303030304,
"grad_norm": 2.675459146499634,
"learning_rate": 4.848367029548989e-05,
"loss": 0.3783,
"step": 140
},
{
"epoch": 0.3246753246753247,
"grad_norm": 1.5593161582946777,
"learning_rate": 4.809486780715397e-05,
"loss": 0.3344,
"step": 150
},
{
"epoch": 0.3463203463203463,
"grad_norm": 2.0941596031188965,
"learning_rate": 4.770606531881804e-05,
"loss": 0.3389,
"step": 160
},
{
"epoch": 0.36796536796536794,
"grad_norm": 1.4612257480621338,
"learning_rate": 4.731726283048212e-05,
"loss": 0.3662,
"step": 170
},
{
"epoch": 0.38961038961038963,
"grad_norm": 1.9398542642593384,
"learning_rate": 4.692846034214619e-05,
"loss": 0.3258,
"step": 180
},
{
"epoch": 0.41125541125541126,
"grad_norm": 1.4659396409988403,
"learning_rate": 4.653965785381027e-05,
"loss": 0.3255,
"step": 190
},
{
"epoch": 0.4329004329004329,
"grad_norm": 1.8461475372314453,
"learning_rate": 4.615085536547434e-05,
"loss": 0.3288,
"step": 200
},
{
"epoch": 0.4329004329004329,
"eval_loss": 0.13621099293231964,
"eval_runtime": 25.3133,
"eval_samples_per_second": 19.752,
"eval_steps_per_second": 0.632,
"step": 200
},
{
"epoch": 0.45454545454545453,
"grad_norm": 1.8327516317367554,
"learning_rate": 4.576205287713842e-05,
"loss": 0.3385,
"step": 210
},
{
"epoch": 0.47619047619047616,
"grad_norm": 1.3256858587265015,
"learning_rate": 4.537325038880249e-05,
"loss": 0.3087,
"step": 220
},
{
"epoch": 0.49783549783549785,
"grad_norm": 1.5844945907592773,
"learning_rate": 4.498444790046656e-05,
"loss": 0.3099,
"step": 230
},
{
"epoch": 0.5194805194805194,
"grad_norm": 2.342363119125366,
"learning_rate": 4.4595645412130637e-05,
"loss": 0.2831,
"step": 240
},
{
"epoch": 0.5411255411255411,
"grad_norm": 2.106011152267456,
"learning_rate": 4.420684292379472e-05,
"loss": 0.3231,
"step": 250
},
{
"epoch": 0.5627705627705628,
"grad_norm": 1.503414511680603,
"learning_rate": 4.3818040435458794e-05,
"loss": 0.2756,
"step": 260
},
{
"epoch": 0.5844155844155844,
"grad_norm": 1.6984680891036987,
"learning_rate": 4.342923794712286e-05,
"loss": 0.251,
"step": 270
},
{
"epoch": 0.6060606060606061,
"grad_norm": 1.6489607095718384,
"learning_rate": 4.304043545878694e-05,
"loss": 0.272,
"step": 280
},
{
"epoch": 0.6277056277056277,
"grad_norm": 1.6416125297546387,
"learning_rate": 4.265163297045101e-05,
"loss": 0.2917,
"step": 290
},
{
"epoch": 0.6493506493506493,
"grad_norm": 1.477407693862915,
"learning_rate": 4.226283048211509e-05,
"loss": 0.2849,
"step": 300
},
{
"epoch": 0.6493506493506493,
"eval_loss": 0.12747418880462646,
"eval_runtime": 23.694,
"eval_samples_per_second": 21.102,
"eval_steps_per_second": 0.675,
"step": 300
},
{
"epoch": 0.670995670995671,
"grad_norm": 1.2621872425079346,
"learning_rate": 4.187402799377916e-05,
"loss": 0.2728,
"step": 310
},
{
"epoch": 0.6926406926406926,
"grad_norm": 1.1352860927581787,
"learning_rate": 4.148522550544324e-05,
"loss": 0.2831,
"step": 320
},
{
"epoch": 0.7142857142857143,
"grad_norm": 3.7155425548553467,
"learning_rate": 4.109642301710731e-05,
"loss": 0.2691,
"step": 330
},
{
"epoch": 0.7359307359307359,
"grad_norm": 1.287180781364441,
"learning_rate": 4.070762052877139e-05,
"loss": 0.2711,
"step": 340
},
{
"epoch": 0.7575757575757576,
"grad_norm": 1.4427192211151123,
"learning_rate": 4.031881804043546e-05,
"loss": 0.2923,
"step": 350
},
{
"epoch": 0.7792207792207793,
"grad_norm": 1.3908069133758545,
"learning_rate": 3.993001555209954e-05,
"loss": 0.2636,
"step": 360
},
{
"epoch": 0.8008658008658008,
"grad_norm": 1.7817895412445068,
"learning_rate": 3.954121306376361e-05,
"loss": 0.2533,
"step": 370
},
{
"epoch": 0.8225108225108225,
"grad_norm": 1.4778376817703247,
"learning_rate": 3.915241057542768e-05,
"loss": 0.2829,
"step": 380
},
{
"epoch": 0.8441558441558441,
"grad_norm": 1.4331694841384888,
"learning_rate": 3.8763608087091756e-05,
"loss": 0.2788,
"step": 390
},
{
"epoch": 0.8658008658008658,
"grad_norm": 1.4717748165130615,
"learning_rate": 3.837480559875584e-05,
"loss": 0.318,
"step": 400
},
{
"epoch": 0.8658008658008658,
"eval_loss": 0.11318539083003998,
"eval_runtime": 24.6731,
"eval_samples_per_second": 20.265,
"eval_steps_per_second": 0.648,
"step": 400
},
{
"epoch": 0.8874458874458875,
"grad_norm": 1.3594976663589478,
"learning_rate": 3.798600311041991e-05,
"loss": 0.2735,
"step": 410
},
{
"epoch": 0.9090909090909091,
"grad_norm": 1.303183674812317,
"learning_rate": 3.759720062208398e-05,
"loss": 0.249,
"step": 420
},
{
"epoch": 0.9307359307359307,
"grad_norm": 1.1007519960403442,
"learning_rate": 3.7208398133748056e-05,
"loss": 0.2749,
"step": 430
},
{
"epoch": 0.9523809523809523,
"grad_norm": 1.306304931640625,
"learning_rate": 3.681959564541213e-05,
"loss": 0.2643,
"step": 440
},
{
"epoch": 0.974025974025974,
"grad_norm": 1.3814477920532227,
"learning_rate": 3.6430793157076207e-05,
"loss": 0.2601,
"step": 450
},
{
"epoch": 0.9956709956709957,
"grad_norm": 1.2352524995803833,
"learning_rate": 3.604199066874028e-05,
"loss": 0.2669,
"step": 460
},
{
"epoch": 1.0173160173160174,
"grad_norm": 1.3061290979385376,
"learning_rate": 3.565318818040436e-05,
"loss": 0.2395,
"step": 470
},
{
"epoch": 1.0389610389610389,
"grad_norm": 1.3227717876434326,
"learning_rate": 3.526438569206843e-05,
"loss": 0.2439,
"step": 480
},
{
"epoch": 1.0606060606060606,
"grad_norm": 1.2254371643066406,
"learning_rate": 3.487558320373251e-05,
"loss": 0.2421,
"step": 490
},
{
"epoch": 1.0822510822510822,
"grad_norm": 1.0040128231048584,
"learning_rate": 3.448678071539658e-05,
"loss": 0.2691,
"step": 500
},
{
"epoch": 1.0822510822510822,
"eval_loss": 0.12204229831695557,
"eval_runtime": 24.5509,
"eval_samples_per_second": 20.366,
"eval_steps_per_second": 0.652,
"step": 500
},
{
"epoch": 1.103896103896104,
"grad_norm": 1.2335394620895386,
"learning_rate": 3.409797822706066e-05,
"loss": 0.2047,
"step": 510
},
{
"epoch": 1.1255411255411256,
"grad_norm": 1.1710796356201172,
"learning_rate": 3.370917573872473e-05,
"loss": 0.23,
"step": 520
},
{
"epoch": 1.1471861471861473,
"grad_norm": 1.1290990114212036,
"learning_rate": 3.33203732503888e-05,
"loss": 0.2323,
"step": 530
},
{
"epoch": 1.1688311688311688,
"grad_norm": 1.1899147033691406,
"learning_rate": 3.2931570762052876e-05,
"loss": 0.2414,
"step": 540
},
{
"epoch": 1.1904761904761905,
"grad_norm": 1.129758596420288,
"learning_rate": 3.254276827371696e-05,
"loss": 0.2549,
"step": 550
},
{
"epoch": 1.2121212121212122,
"grad_norm": 1.4995540380477905,
"learning_rate": 3.215396578538103e-05,
"loss": 0.2094,
"step": 560
},
{
"epoch": 1.2337662337662338,
"grad_norm": 1.4586797952651978,
"learning_rate": 3.17651632970451e-05,
"loss": 0.24,
"step": 570
},
{
"epoch": 1.2554112554112553,
"grad_norm": 1.256712555885315,
"learning_rate": 3.1376360808709176e-05,
"loss": 0.2069,
"step": 580
},
{
"epoch": 1.277056277056277,
"grad_norm": 1.3815618753433228,
"learning_rate": 3.098755832037325e-05,
"loss": 0.2282,
"step": 590
},
{
"epoch": 1.2987012987012987,
"grad_norm": 1.2242530584335327,
"learning_rate": 3.0598755832037326e-05,
"loss": 0.2318,
"step": 600
},
{
"epoch": 1.2987012987012987,
"eval_loss": 0.12561704218387604,
"eval_runtime": 23.9001,
"eval_samples_per_second": 20.92,
"eval_steps_per_second": 0.669,
"step": 600
},
{
"epoch": 1.3203463203463204,
"grad_norm": 1.352359414100647,
"learning_rate": 3.02099533437014e-05,
"loss": 0.2032,
"step": 610
},
{
"epoch": 1.341991341991342,
"grad_norm": 1.3257685899734497,
"learning_rate": 2.9821150855365476e-05,
"loss": 0.243,
"step": 620
},
{
"epoch": 1.3636363636363638,
"grad_norm": 1.1021738052368164,
"learning_rate": 2.943234836702955e-05,
"loss": 0.2093,
"step": 630
},
{
"epoch": 1.3852813852813852,
"grad_norm": 1.2614463567733765,
"learning_rate": 2.9043545878693623e-05,
"loss": 0.2268,
"step": 640
},
{
"epoch": 1.406926406926407,
"grad_norm": 1.1275025606155396,
"learning_rate": 2.86547433903577e-05,
"loss": 0.2457,
"step": 650
},
{
"epoch": 1.4285714285714286,
"grad_norm": 1.2248728275299072,
"learning_rate": 2.8265940902021777e-05,
"loss": 0.2235,
"step": 660
},
{
"epoch": 1.4502164502164503,
"grad_norm": 1.260707974433899,
"learning_rate": 2.7877138413685848e-05,
"loss": 0.216,
"step": 670
},
{
"epoch": 1.4718614718614718,
"grad_norm": 1.3132363557815552,
"learning_rate": 2.7488335925349923e-05,
"loss": 0.2197,
"step": 680
},
{
"epoch": 1.4935064935064934,
"grad_norm": 1.1333873271942139,
"learning_rate": 2.7099533437013995e-05,
"loss": 0.2319,
"step": 690
},
{
"epoch": 1.5151515151515151,
"grad_norm": 1.1966497898101807,
"learning_rate": 2.6710730948678077e-05,
"loss": 0.225,
"step": 700
},
{
"epoch": 1.5151515151515151,
"eval_loss": 0.12175622582435608,
"eval_runtime": 24.8733,
"eval_samples_per_second": 20.102,
"eval_steps_per_second": 0.643,
"step": 700
},
{
"epoch": 1.5367965367965368,
"grad_norm": 1.1184797286987305,
"learning_rate": 2.632192846034215e-05,
"loss": 0.218,
"step": 710
},
{
"epoch": 1.5584415584415585,
"grad_norm": 1.0531848669052124,
"learning_rate": 2.5933125972006224e-05,
"loss": 0.1902,
"step": 720
},
{
"epoch": 1.5800865800865802,
"grad_norm": 1.3125649690628052,
"learning_rate": 2.5544323483670295e-05,
"loss": 0.2248,
"step": 730
},
{
"epoch": 1.601731601731602,
"grad_norm": 1.2626018524169922,
"learning_rate": 2.515552099533437e-05,
"loss": 0.2111,
"step": 740
},
{
"epoch": 1.6233766233766234,
"grad_norm": 1.2592939138412476,
"learning_rate": 2.4766718506998446e-05,
"loss": 0.2251,
"step": 750
},
{
"epoch": 1.645021645021645,
"grad_norm": 1.3215529918670654,
"learning_rate": 2.437791601866252e-05,
"loss": 0.2089,
"step": 760
},
{
"epoch": 1.6666666666666665,
"grad_norm": 1.471236228942871,
"learning_rate": 2.3989113530326596e-05,
"loss": 0.2352,
"step": 770
},
{
"epoch": 1.6883116883116882,
"grad_norm": 1.332468867301941,
"learning_rate": 2.360031104199067e-05,
"loss": 0.2075,
"step": 780
},
{
"epoch": 1.70995670995671,
"grad_norm": 1.2207419872283936,
"learning_rate": 2.3211508553654746e-05,
"loss": 0.2329,
"step": 790
},
{
"epoch": 1.7316017316017316,
"grad_norm": 1.2343429327011108,
"learning_rate": 2.2822706065318818e-05,
"loss": 0.2029,
"step": 800
},
{
"epoch": 1.7316017316017316,
"eval_loss": 0.112111896276474,
"eval_runtime": 24.4556,
"eval_samples_per_second": 20.445,
"eval_steps_per_second": 0.654,
"step": 800
},
{
"epoch": 1.7532467532467533,
"grad_norm": 1.0487161874771118,
"learning_rate": 2.2433903576982893e-05,
"loss": 0.2268,
"step": 810
},
{
"epoch": 1.774891774891775,
"grad_norm": 1.0433166027069092,
"learning_rate": 2.2045101088646968e-05,
"loss": 0.2176,
"step": 820
},
{
"epoch": 1.7965367965367967,
"grad_norm": 1.1807918548583984,
"learning_rate": 2.1656298600311043e-05,
"loss": 0.225,
"step": 830
},
{
"epoch": 1.8181818181818183,
"grad_norm": 1.3814514875411987,
"learning_rate": 2.1267496111975118e-05,
"loss": 0.239,
"step": 840
},
{
"epoch": 1.8398268398268398,
"grad_norm": 1.3248578310012817,
"learning_rate": 2.0878693623639193e-05,
"loss": 0.1726,
"step": 850
},
{
"epoch": 1.8614718614718615,
"grad_norm": 1.1932059526443481,
"learning_rate": 2.0489891135303265e-05,
"loss": 0.2201,
"step": 860
},
{
"epoch": 1.883116883116883,
"grad_norm": 2.034952402114868,
"learning_rate": 2.0101088646967343e-05,
"loss": 0.2238,
"step": 870
},
{
"epoch": 1.9047619047619047,
"grad_norm": 1.3041192293167114,
"learning_rate": 1.9712286158631415e-05,
"loss": 0.2018,
"step": 880
},
{
"epoch": 1.9264069264069263,
"grad_norm": 1.185820460319519,
"learning_rate": 1.9323483670295493e-05,
"loss": 0.2236,
"step": 890
},
{
"epoch": 1.948051948051948,
"grad_norm": 1.0928832292556763,
"learning_rate": 1.8934681181959565e-05,
"loss": 0.2138,
"step": 900
},
{
"epoch": 1.948051948051948,
"eval_loss": 0.10634125024080276,
"eval_runtime": 24.1737,
"eval_samples_per_second": 20.684,
"eval_steps_per_second": 0.662,
"step": 900
},
{
"epoch": 1.9696969696969697,
"grad_norm": 1.2616679668426514,
"learning_rate": 1.854587869362364e-05,
"loss": 0.2109,
"step": 910
},
{
"epoch": 1.9913419913419914,
"grad_norm": 1.5013710260391235,
"learning_rate": 1.8157076205287715e-05,
"loss": 0.196,
"step": 920
},
{
"epoch": 2.012987012987013,
"grad_norm": 1.1472351551055908,
"learning_rate": 1.776827371695179e-05,
"loss": 0.1839,
"step": 930
},
{
"epoch": 2.034632034632035,
"grad_norm": 1.5114259719848633,
"learning_rate": 1.7379471228615865e-05,
"loss": 0.2003,
"step": 940
},
{
"epoch": 2.0562770562770565,
"grad_norm": 1.0763689279556274,
"learning_rate": 1.6990668740279937e-05,
"loss": 0.2029,
"step": 950
},
{
"epoch": 2.0779220779220777,
"grad_norm": 1.5134332180023193,
"learning_rate": 1.6601866251944012e-05,
"loss": 0.2028,
"step": 960
},
{
"epoch": 2.0995670995670994,
"grad_norm": 1.4813941717147827,
"learning_rate": 1.6213063763608087e-05,
"loss": 0.1657,
"step": 970
},
{
"epoch": 2.121212121212121,
"grad_norm": 1.3589473962783813,
"learning_rate": 1.5824261275272162e-05,
"loss": 0.2153,
"step": 980
},
{
"epoch": 2.142857142857143,
"grad_norm": 1.2208274602890015,
"learning_rate": 1.5435458786936237e-05,
"loss": 0.1889,
"step": 990
},
{
"epoch": 2.1645021645021645,
"grad_norm": 1.466425895690918,
"learning_rate": 1.5046656298600313e-05,
"loss": 0.1786,
"step": 1000
},
{
"epoch": 2.1645021645021645,
"eval_loss": 0.11710453778505325,
"eval_runtime": 23.4898,
"eval_samples_per_second": 21.286,
"eval_steps_per_second": 0.681,
"step": 1000
},
{
"epoch": 2.186147186147186,
"grad_norm": 1.1942178010940552,
"learning_rate": 1.4657853810264386e-05,
"loss": 0.1597,
"step": 1010
},
{
"epoch": 2.207792207792208,
"grad_norm": 1.6706466674804688,
"learning_rate": 1.4269051321928461e-05,
"loss": 0.1746,
"step": 1020
},
{
"epoch": 2.2294372294372296,
"grad_norm": 1.4485812187194824,
"learning_rate": 1.3880248833592534e-05,
"loss": 0.1874,
"step": 1030
},
{
"epoch": 2.2510822510822512,
"grad_norm": 1.2479631900787354,
"learning_rate": 1.3491446345256611e-05,
"loss": 0.1755,
"step": 1040
},
{
"epoch": 2.2727272727272725,
"grad_norm": 1.4938236474990845,
"learning_rate": 1.3102643856920685e-05,
"loss": 0.1765,
"step": 1050
},
{
"epoch": 2.2943722943722946,
"grad_norm": 1.1002484560012817,
"learning_rate": 1.2713841368584758e-05,
"loss": 0.1823,
"step": 1060
},
{
"epoch": 2.316017316017316,
"grad_norm": 1.3851666450500488,
"learning_rate": 1.2325038880248835e-05,
"loss": 0.199,
"step": 1070
},
{
"epoch": 2.3376623376623376,
"grad_norm": 1.0471409559249878,
"learning_rate": 1.1936236391912908e-05,
"loss": 0.1707,
"step": 1080
},
{
"epoch": 2.3593073593073592,
"grad_norm": 1.4738085269927979,
"learning_rate": 1.1547433903576983e-05,
"loss": 0.1716,
"step": 1090
},
{
"epoch": 2.380952380952381,
"grad_norm": 1.499088168144226,
"learning_rate": 1.1158631415241058e-05,
"loss": 0.1773,
"step": 1100
},
{
"epoch": 2.380952380952381,
"eval_loss": 0.11423339694738388,
"eval_runtime": 25.1041,
"eval_samples_per_second": 19.917,
"eval_steps_per_second": 0.637,
"step": 1100
},
{
"epoch": 2.4025974025974026,
"grad_norm": 1.1052640676498413,
"learning_rate": 1.0769828926905133e-05,
"loss": 0.164,
"step": 1110
},
{
"epoch": 2.4242424242424243,
"grad_norm": 1.096550464630127,
"learning_rate": 1.0381026438569208e-05,
"loss": 0.1904,
"step": 1120
},
{
"epoch": 2.445887445887446,
"grad_norm": 1.2570807933807373,
"learning_rate": 9.992223950233282e-06,
"loss": 0.172,
"step": 1130
},
{
"epoch": 2.4675324675324677,
"grad_norm": 1.1647895574569702,
"learning_rate": 9.603421461897357e-06,
"loss": 0.1816,
"step": 1140
},
{
"epoch": 2.4891774891774894,
"grad_norm": 1.2219709157943726,
"learning_rate": 9.214618973561432e-06,
"loss": 0.1883,
"step": 1150
},
{
"epoch": 2.5108225108225106,
"grad_norm": 1.1570717096328735,
"learning_rate": 8.825816485225505e-06,
"loss": 0.1864,
"step": 1160
},
{
"epoch": 2.5324675324675323,
"grad_norm": 1.3424004316329956,
"learning_rate": 8.43701399688958e-06,
"loss": 0.1662,
"step": 1170
},
{
"epoch": 2.554112554112554,
"grad_norm": 1.32485032081604,
"learning_rate": 8.048211508553654e-06,
"loss": 0.1593,
"step": 1180
},
{
"epoch": 2.5757575757575757,
"grad_norm": 1.309839129447937,
"learning_rate": 7.659409020217729e-06,
"loss": 0.1811,
"step": 1190
},
{
"epoch": 2.5974025974025974,
"grad_norm": 1.4496508836746216,
"learning_rate": 7.270606531881805e-06,
"loss": 0.1953,
"step": 1200
},
{
"epoch": 2.5974025974025974,
"eval_loss": 0.11794831603765488,
"eval_runtime": 24.811,
"eval_samples_per_second": 20.152,
"eval_steps_per_second": 0.645,
"step": 1200
},
{
"epoch": 2.619047619047619,
"grad_norm": 1.12656569480896,
"learning_rate": 6.881804043545879e-06,
"loss": 0.1845,
"step": 1210
},
{
"epoch": 2.6406926406926408,
"grad_norm": 1.2158961296081543,
"learning_rate": 6.493001555209954e-06,
"loss": 0.1755,
"step": 1220
},
{
"epoch": 2.6623376623376624,
"grad_norm": 0.9884421229362488,
"learning_rate": 6.1041990668740285e-06,
"loss": 0.1611,
"step": 1230
},
{
"epoch": 2.683982683982684,
"grad_norm": 1.1807091236114502,
"learning_rate": 5.715396578538103e-06,
"loss": 0.1955,
"step": 1240
},
{
"epoch": 2.7056277056277054,
"grad_norm": 1.504586935043335,
"learning_rate": 5.326594090202178e-06,
"loss": 0.1999,
"step": 1250
},
{
"epoch": 2.7272727272727275,
"grad_norm": 1.1582438945770264,
"learning_rate": 4.937791601866252e-06,
"loss": 0.1815,
"step": 1260
},
{
"epoch": 2.7489177489177488,
"grad_norm": 1.0870862007141113,
"learning_rate": 4.548989113530327e-06,
"loss": 0.1563,
"step": 1270
},
{
"epoch": 2.7705627705627704,
"grad_norm": 1.4519983530044556,
"learning_rate": 4.160186625194401e-06,
"loss": 0.171,
"step": 1280
},
{
"epoch": 2.792207792207792,
"grad_norm": 1.3172404766082764,
"learning_rate": 3.771384136858476e-06,
"loss": 0.1807,
"step": 1290
},
{
"epoch": 2.813852813852814,
"grad_norm": 1.5675866603851318,
"learning_rate": 3.382581648522551e-06,
"loss": 0.1732,
"step": 1300
},
{
"epoch": 2.813852813852814,
"eval_loss": 0.11541545391082764,
"eval_runtime": 25.6413,
"eval_samples_per_second": 19.5,
"eval_steps_per_second": 0.624,
"step": 1300
},
{
"epoch": 2.8354978354978355,
"grad_norm": 1.2589536905288696,
"learning_rate": 2.9937791601866253e-06,
"loss": 0.1823,
"step": 1310
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.9841740727424622,
"learning_rate": 2.6049766718507e-06,
"loss": 0.1649,
"step": 1320
},
{
"epoch": 2.878787878787879,
"grad_norm": 1.3231135606765747,
"learning_rate": 2.2161741835147746e-06,
"loss": 0.1784,
"step": 1330
},
{
"epoch": 2.9004329004329006,
"grad_norm": 1.3610490560531616,
"learning_rate": 1.827371695178849e-06,
"loss": 0.1759,
"step": 1340
},
{
"epoch": 2.9220779220779223,
"grad_norm": 1.0712759494781494,
"learning_rate": 1.438569206842924e-06,
"loss": 0.1848,
"step": 1350
},
{
"epoch": 2.9437229437229435,
"grad_norm": 1.3472720384597778,
"learning_rate": 1.0497667185069984e-06,
"loss": 0.2005,
"step": 1360
},
{
"epoch": 2.965367965367965,
"grad_norm": 1.5114219188690186,
"learning_rate": 6.609642301710731e-07,
"loss": 0.1808,
"step": 1370
},
{
"epoch": 2.987012987012987,
"grad_norm": 1.4632176160812378,
"learning_rate": 2.721617418351478e-07,
"loss": 0.1878,
"step": 1380
}
],
"logging_steps": 10,
"max_steps": 1386,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.3171098953286943e+18,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}