AKILM-reason / trainer_state.json
ini's picture
Upload folder using huggingface_hub
53d57c3 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 7.9816091954022985,
"eval_steps": 500,
"global_step": 1736,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04597701149425287,
"grad_norm": 19.236724341828868,
"learning_rate": 2.8735632183908047e-07,
"loss": 0.9813,
"step": 10
},
{
"epoch": 0.09195402298850575,
"grad_norm": 22.46256737380087,
"learning_rate": 5.747126436781609e-07,
"loss": 0.9645,
"step": 20
},
{
"epoch": 0.13793103448275862,
"grad_norm": 7.740951848649134,
"learning_rate": 8.620689655172415e-07,
"loss": 0.8355,
"step": 30
},
{
"epoch": 0.1839080459770115,
"grad_norm": 6.373158348765807,
"learning_rate": 1.1494252873563219e-06,
"loss": 0.7287,
"step": 40
},
{
"epoch": 0.22988505747126436,
"grad_norm": 4.553132194954343,
"learning_rate": 1.4367816091954023e-06,
"loss": 0.6077,
"step": 50
},
{
"epoch": 0.27586206896551724,
"grad_norm": 4.217339152427201,
"learning_rate": 1.724137931034483e-06,
"loss": 0.5534,
"step": 60
},
{
"epoch": 0.3218390804597701,
"grad_norm": 4.295308772185913,
"learning_rate": 2.0114942528735633e-06,
"loss": 0.5322,
"step": 70
},
{
"epoch": 0.367816091954023,
"grad_norm": 4.0641475269520155,
"learning_rate": 2.2988505747126437e-06,
"loss": 0.5003,
"step": 80
},
{
"epoch": 0.41379310344827586,
"grad_norm": 13.886675789706416,
"learning_rate": 2.5862068965517246e-06,
"loss": 0.4883,
"step": 90
},
{
"epoch": 0.45977011494252873,
"grad_norm": 4.673192920611484,
"learning_rate": 2.8735632183908046e-06,
"loss": 0.4584,
"step": 100
},
{
"epoch": 0.5057471264367817,
"grad_norm": 3.674033307691337,
"learning_rate": 3.1609195402298854e-06,
"loss": 0.447,
"step": 110
},
{
"epoch": 0.5517241379310345,
"grad_norm": 3.919248171882691,
"learning_rate": 3.448275862068966e-06,
"loss": 0.4556,
"step": 120
},
{
"epoch": 0.5977011494252874,
"grad_norm": 3.5061949892253623,
"learning_rate": 3.7356321839080462e-06,
"loss": 0.4354,
"step": 130
},
{
"epoch": 0.6436781609195402,
"grad_norm": 3.7489900095069073,
"learning_rate": 4.022988505747127e-06,
"loss": 0.4228,
"step": 140
},
{
"epoch": 0.6896551724137931,
"grad_norm": 3.6759677856673103,
"learning_rate": 4.310344827586207e-06,
"loss": 0.4299,
"step": 150
},
{
"epoch": 0.735632183908046,
"grad_norm": 3.6923883409669456,
"learning_rate": 4.5977011494252875e-06,
"loss": 0.4193,
"step": 160
},
{
"epoch": 0.7816091954022989,
"grad_norm": 3.6202856036202427,
"learning_rate": 4.885057471264369e-06,
"loss": 0.4155,
"step": 170
},
{
"epoch": 0.8275862068965517,
"grad_norm": 3.6300778901007864,
"learning_rate": 4.999817969178238e-06,
"loss": 0.4083,
"step": 180
},
{
"epoch": 0.8735632183908046,
"grad_norm": 3.587161182258164,
"learning_rate": 4.998705654596035e-06,
"loss": 0.4225,
"step": 190
},
{
"epoch": 0.9195402298850575,
"grad_norm": 3.3270245057939376,
"learning_rate": 4.996582603056429e-06,
"loss": 0.4019,
"step": 200
},
{
"epoch": 0.9655172413793104,
"grad_norm": 3.8111046706601623,
"learning_rate": 4.9934496733427066e-06,
"loss": 0.3965,
"step": 210
},
{
"epoch": 1.0114942528735633,
"grad_norm": 3.3652791442634187,
"learning_rate": 4.989308132738127e-06,
"loss": 0.3966,
"step": 220
},
{
"epoch": 1.0574712643678161,
"grad_norm": 3.1325910691416747,
"learning_rate": 4.9841596565133e-06,
"loss": 0.3407,
"step": 230
},
{
"epoch": 1.103448275862069,
"grad_norm": 3.167830493874506,
"learning_rate": 4.978006327248537e-06,
"loss": 0.3519,
"step": 240
},
{
"epoch": 1.1494252873563218,
"grad_norm": 3.2716290396557377,
"learning_rate": 4.970850633991432e-06,
"loss": 0.3596,
"step": 250
},
{
"epoch": 1.1954022988505748,
"grad_norm": 3.1954803751416962,
"learning_rate": 4.962695471250033e-06,
"loss": 0.343,
"step": 260
},
{
"epoch": 1.2413793103448276,
"grad_norm": 3.479098208871561,
"learning_rate": 4.953544137822006e-06,
"loss": 0.3369,
"step": 270
},
{
"epoch": 1.2873563218390804,
"grad_norm": 3.3457647349425104,
"learning_rate": 4.9434003354602515e-06,
"loss": 0.3627,
"step": 280
},
{
"epoch": 1.3333333333333333,
"grad_norm": 3.282832685188253,
"learning_rate": 4.932268167375532e-06,
"loss": 0.3528,
"step": 290
},
{
"epoch": 1.3793103448275863,
"grad_norm": 2.6789350907177822,
"learning_rate": 4.920152136576706e-06,
"loss": 0.3406,
"step": 300
},
{
"epoch": 1.4252873563218391,
"grad_norm": 3.3658438451363577,
"learning_rate": 4.9070571440492435e-06,
"loss": 0.3643,
"step": 310
},
{
"epoch": 1.471264367816092,
"grad_norm": 3.103704408419176,
"learning_rate": 4.892988486772756e-06,
"loss": 0.3434,
"step": 320
},
{
"epoch": 1.5172413793103448,
"grad_norm": 2.9669630699191405,
"learning_rate": 4.877951855578342e-06,
"loss": 0.3322,
"step": 330
},
{
"epoch": 1.5632183908045976,
"grad_norm": 3.486652210473939,
"learning_rate": 4.86195333284663e-06,
"loss": 0.3529,
"step": 340
},
{
"epoch": 1.6091954022988506,
"grad_norm": 3.3652573778754253,
"learning_rate": 4.844999390047419e-06,
"loss": 0.3547,
"step": 350
},
{
"epoch": 1.6551724137931034,
"grad_norm": 3.5532322465589385,
"learning_rate": 4.827096885121954e-06,
"loss": 0.3408,
"step": 360
},
{
"epoch": 1.7011494252873565,
"grad_norm": 3.143444966292411,
"learning_rate": 4.808253059708849e-06,
"loss": 0.3506,
"step": 370
},
{
"epoch": 1.7471264367816093,
"grad_norm": 2.8493219432318875,
"learning_rate": 4.788475536214822e-06,
"loss": 0.3368,
"step": 380
},
{
"epoch": 1.793103448275862,
"grad_norm": 2.8628419910614316,
"learning_rate": 4.767772314731394e-06,
"loss": 0.3424,
"step": 390
},
{
"epoch": 1.839080459770115,
"grad_norm": 3.0446997203794344,
"learning_rate": 4.746151769798818e-06,
"loss": 0.3549,
"step": 400
},
{
"epoch": 1.8850574712643677,
"grad_norm": 2.7372586769671337,
"learning_rate": 4.7236226470185505e-06,
"loss": 0.3247,
"step": 410
},
{
"epoch": 1.9310344827586206,
"grad_norm": 3.044988365851536,
"learning_rate": 4.700194059515606e-06,
"loss": 0.3373,
"step": 420
},
{
"epoch": 1.9770114942528736,
"grad_norm": 2.9343723129428643,
"learning_rate": 4.67587548425227e-06,
"loss": 0.3393,
"step": 430
},
{
"epoch": 2.0229885057471266,
"grad_norm": 2.4482722232022764,
"learning_rate": 4.650676758194624e-06,
"loss": 0.3095,
"step": 440
},
{
"epoch": 2.0689655172413794,
"grad_norm": 2.9276614481022127,
"learning_rate": 4.624608074333448e-06,
"loss": 0.256,
"step": 450
},
{
"epoch": 2.1149425287356323,
"grad_norm": 2.6846087529478955,
"learning_rate": 4.597679977561122e-06,
"loss": 0.2471,
"step": 460
},
{
"epoch": 2.160919540229885,
"grad_norm": 2.610365319899241,
"learning_rate": 4.569903360406163e-06,
"loss": 0.2554,
"step": 470
},
{
"epoch": 2.206896551724138,
"grad_norm": 2.948862836398902,
"learning_rate": 4.541289458627155e-06,
"loss": 0.2527,
"step": 480
},
{
"epoch": 2.2528735632183907,
"grad_norm": 2.637472882710502,
"learning_rate": 4.511849846667839e-06,
"loss": 0.2504,
"step": 490
},
{
"epoch": 2.2988505747126435,
"grad_norm": 2.8982721464809016,
"learning_rate": 4.481596432975202e-06,
"loss": 0.2608,
"step": 500
},
{
"epoch": 2.344827586206897,
"grad_norm": 2.852823004859079,
"learning_rate": 4.4505414551824536e-06,
"loss": 0.2606,
"step": 510
},
{
"epoch": 2.3908045977011496,
"grad_norm": 3.197376130493875,
"learning_rate": 4.418697475158861e-06,
"loss": 0.2677,
"step": 520
},
{
"epoch": 2.4367816091954024,
"grad_norm": 2.9868581965061494,
"learning_rate": 4.386077373928413e-06,
"loss": 0.2628,
"step": 530
},
{
"epoch": 2.4827586206896552,
"grad_norm": 2.5165223021619756,
"learning_rate": 4.352694346459397e-06,
"loss": 0.2739,
"step": 540
},
{
"epoch": 2.528735632183908,
"grad_norm": 2.5532695939024723,
"learning_rate": 4.318561896326973e-06,
"loss": 0.2587,
"step": 550
},
{
"epoch": 2.574712643678161,
"grad_norm": 2.80784537094022,
"learning_rate": 4.283693830250926e-06,
"loss": 0.271,
"step": 560
},
{
"epoch": 2.6206896551724137,
"grad_norm": 2.834008814039304,
"learning_rate": 4.248104252510786e-06,
"loss": 0.2596,
"step": 570
},
{
"epoch": 2.6666666666666665,
"grad_norm": 2.856340507047602,
"learning_rate": 4.211807559240588e-06,
"loss": 0.2607,
"step": 580
},
{
"epoch": 2.7126436781609193,
"grad_norm": 2.400155413142861,
"learning_rate": 4.174818432605579e-06,
"loss": 0.2714,
"step": 590
},
{
"epoch": 2.7586206896551726,
"grad_norm": 2.9121726603653446,
"learning_rate": 4.137151834863213e-06,
"loss": 0.267,
"step": 600
},
{
"epoch": 2.8045977011494254,
"grad_norm": 2.4158062858535243,
"learning_rate": 4.098823002310864e-06,
"loss": 0.2637,
"step": 610
},
{
"epoch": 2.8505747126436782,
"grad_norm": 2.710372669455237,
"learning_rate": 4.059847439122672e-06,
"loss": 0.2591,
"step": 620
},
{
"epoch": 2.896551724137931,
"grad_norm": 2.694528049244444,
"learning_rate": 4.020240911078041e-06,
"loss": 0.2597,
"step": 630
},
{
"epoch": 2.942528735632184,
"grad_norm": 2.7003321508943077,
"learning_rate": 3.98001943918432e-06,
"loss": 0.2694,
"step": 640
},
{
"epoch": 2.9885057471264367,
"grad_norm": 2.9274287857580212,
"learning_rate": 3.939199293196231e-06,
"loss": 0.2704,
"step": 650
},
{
"epoch": 3.0344827586206895,
"grad_norm": 2.232026779235866,
"learning_rate": 3.897796985034687e-06,
"loss": 0.1997,
"step": 660
},
{
"epoch": 3.0804597701149423,
"grad_norm": 2.7014305023370415,
"learning_rate": 3.855829262107653e-06,
"loss": 0.1716,
"step": 670
},
{
"epoch": 3.1264367816091956,
"grad_norm": 2.4848170071887035,
"learning_rate": 3.813313100535747e-06,
"loss": 0.1803,
"step": 680
},
{
"epoch": 3.1724137931034484,
"grad_norm": 2.7346292529745027,
"learning_rate": 3.770265698285328e-06,
"loss": 0.1754,
"step": 690
},
{
"epoch": 3.218390804597701,
"grad_norm": 2.705054730805242,
"learning_rate": 3.726704468211844e-06,
"loss": 0.1835,
"step": 700
},
{
"epoch": 3.264367816091954,
"grad_norm": 2.6584866622780603,
"learning_rate": 3.6826470310162645e-06,
"loss": 0.1792,
"step": 710
},
{
"epoch": 3.310344827586207,
"grad_norm": 2.6514881876130225,
"learning_rate": 3.6381112081174254e-06,
"loss": 0.1765,
"step": 720
},
{
"epoch": 3.3563218390804597,
"grad_norm": 2.516694208524121,
"learning_rate": 3.593115014443195e-06,
"loss": 0.1817,
"step": 730
},
{
"epoch": 3.4022988505747125,
"grad_norm": 2.4579414740197065,
"learning_rate": 3.547676651143361e-06,
"loss": 0.1849,
"step": 740
},
{
"epoch": 3.4482758620689653,
"grad_norm": 2.6311823995451364,
"learning_rate": 3.5018144982271814e-06,
"loss": 0.1769,
"step": 750
},
{
"epoch": 3.4942528735632186,
"grad_norm": 2.3725394123707026,
"learning_rate": 3.455547107128602e-06,
"loss": 0.1848,
"step": 760
},
{
"epoch": 3.5402298850574714,
"grad_norm": 2.4539717154314933,
"learning_rate": 3.4088931932021193e-06,
"loss": 0.1892,
"step": 770
},
{
"epoch": 3.586206896551724,
"grad_norm": 2.4899658946067933,
"learning_rate": 3.3618716281523384e-06,
"loss": 0.1807,
"step": 780
},
{
"epoch": 3.632183908045977,
"grad_norm": 2.550071627944622,
"learning_rate": 3.3145014324002945e-06,
"loss": 0.1852,
"step": 790
},
{
"epoch": 3.67816091954023,
"grad_norm": 2.675827164092041,
"learning_rate": 3.266801767389608e-06,
"loss": 0.1885,
"step": 800
},
{
"epoch": 3.7241379310344827,
"grad_norm": 2.5251713436347227,
"learning_rate": 3.2187919278356027e-06,
"loss": 0.1835,
"step": 810
},
{
"epoch": 3.7701149425287355,
"grad_norm": 2.7315545503128638,
"learning_rate": 3.1704913339205107e-06,
"loss": 0.1863,
"step": 820
},
{
"epoch": 3.8160919540229887,
"grad_norm": 2.7536607881256665,
"learning_rate": 3.121919523437927e-06,
"loss": 0.1921,
"step": 830
},
{
"epoch": 3.862068965517241,
"grad_norm": 2.598851663508124,
"learning_rate": 3.073096143889689e-06,
"loss": 0.1868,
"step": 840
},
{
"epoch": 3.9080459770114944,
"grad_norm": 2.523389034722214,
"learning_rate": 3.0240409445383835e-06,
"loss": 0.1855,
"step": 850
},
{
"epoch": 3.954022988505747,
"grad_norm": 2.7756855701720404,
"learning_rate": 2.97477376841868e-06,
"loss": 0.1791,
"step": 860
},
{
"epoch": 4.0,
"grad_norm": 2.489766938283244,
"learning_rate": 2.9253145443107455e-06,
"loss": 0.1756,
"step": 870
},
{
"epoch": 4.045977011494253,
"grad_norm": 2.7317574118504044,
"learning_rate": 2.8756832786789667e-06,
"loss": 0.1081,
"step": 880
},
{
"epoch": 4.091954022988506,
"grad_norm": 2.1106005588504626,
"learning_rate": 2.825900047579251e-06,
"loss": 0.1052,
"step": 890
},
{
"epoch": 4.137931034482759,
"grad_norm": 2.40365919767509,
"learning_rate": 2.775984988538175e-06,
"loss": 0.1032,
"step": 900
},
{
"epoch": 4.183908045977011,
"grad_norm": 2.3074631869946507,
"learning_rate": 2.725958292407276e-06,
"loss": 0.1049,
"step": 910
},
{
"epoch": 4.2298850574712645,
"grad_norm": 2.5166276182004714,
"learning_rate": 2.6758401951957625e-06,
"loss": 0.1051,
"step": 920
},
{
"epoch": 4.275862068965517,
"grad_norm": 2.3612573716815093,
"learning_rate": 2.6256509698849652e-06,
"loss": 0.1071,
"step": 930
},
{
"epoch": 4.32183908045977,
"grad_norm": 2.377295836801757,
"learning_rate": 2.5754109182278298e-06,
"loss": 0.1077,
"step": 940
},
{
"epoch": 4.3678160919540225,
"grad_norm": 2.5390227361500823,
"learning_rate": 2.525140362536775e-06,
"loss": 0.1085,
"step": 950
},
{
"epoch": 4.413793103448276,
"grad_norm": 2.233377317311712,
"learning_rate": 2.474859637463226e-06,
"loss": 0.1082,
"step": 960
},
{
"epoch": 4.459770114942529,
"grad_norm": 2.551052323312396,
"learning_rate": 2.42458908177217e-06,
"loss": 0.1102,
"step": 970
},
{
"epoch": 4.505747126436781,
"grad_norm": 2.6483774815598804,
"learning_rate": 2.374349030115036e-06,
"loss": 0.1105,
"step": 980
},
{
"epoch": 4.551724137931035,
"grad_norm": 2.429849103852719,
"learning_rate": 2.3241598048042383e-06,
"loss": 0.1082,
"step": 990
},
{
"epoch": 4.597701149425287,
"grad_norm": 2.486141654635446,
"learning_rate": 2.2740417075927244e-06,
"loss": 0.109,
"step": 1000
},
{
"epoch": 4.64367816091954,
"grad_norm": 2.5918016937723527,
"learning_rate": 2.2240150114618262e-06,
"loss": 0.1068,
"step": 1010
},
{
"epoch": 4.689655172413794,
"grad_norm": 2.663853206431205,
"learning_rate": 2.17409995242075e-06,
"loss": 0.1058,
"step": 1020
},
{
"epoch": 4.735632183908046,
"grad_norm": 2.657379315743761,
"learning_rate": 2.1243167213210337e-06,
"loss": 0.1015,
"step": 1030
},
{
"epoch": 4.781609195402299,
"grad_norm": 2.357887989241257,
"learning_rate": 2.0746854556892545e-06,
"loss": 0.1032,
"step": 1040
},
{
"epoch": 4.827586206896552,
"grad_norm": 2.1971439440232134,
"learning_rate": 2.0252262315813213e-06,
"loss": 0.1033,
"step": 1050
},
{
"epoch": 4.873563218390805,
"grad_norm": 2.3384701132936683,
"learning_rate": 1.9759590554616177e-06,
"loss": 0.1075,
"step": 1060
},
{
"epoch": 4.919540229885057,
"grad_norm": 2.611584410768705,
"learning_rate": 1.9269038561103114e-06,
"loss": 0.1075,
"step": 1070
},
{
"epoch": 4.9655172413793105,
"grad_norm": 2.679036350683068,
"learning_rate": 1.8780804765620747e-06,
"loss": 0.1033,
"step": 1080
},
{
"epoch": 5.011494252873563,
"grad_norm": 1.5576689531522379,
"learning_rate": 1.8295086660794903e-06,
"loss": 0.0939,
"step": 1090
},
{
"epoch": 5.057471264367816,
"grad_norm": 2.4254209848134027,
"learning_rate": 1.7812080721643977e-06,
"loss": 0.0524,
"step": 1100
},
{
"epoch": 5.103448275862069,
"grad_norm": 2.2936105257318675,
"learning_rate": 1.7331982326103922e-06,
"loss": 0.0531,
"step": 1110
},
{
"epoch": 5.149425287356322,
"grad_norm": 2.2180450680938013,
"learning_rate": 1.6854985675997065e-06,
"loss": 0.0518,
"step": 1120
},
{
"epoch": 5.195402298850575,
"grad_norm": 1.9813446486569681,
"learning_rate": 1.6381283718476622e-06,
"loss": 0.0521,
"step": 1130
},
{
"epoch": 5.241379310344827,
"grad_norm": 1.9952422258717124,
"learning_rate": 1.591106806797882e-06,
"loss": 0.0534,
"step": 1140
},
{
"epoch": 5.287356321839081,
"grad_norm": 1.910457789638185,
"learning_rate": 1.5444528928713987e-06,
"loss": 0.0529,
"step": 1150
},
{
"epoch": 5.333333333333333,
"grad_norm": 2.247598889328078,
"learning_rate": 1.4981855017728197e-06,
"loss": 0.054,
"step": 1160
},
{
"epoch": 5.379310344827586,
"grad_norm": 2.355104769702966,
"learning_rate": 1.4523233488566394e-06,
"loss": 0.0583,
"step": 1170
},
{
"epoch": 5.425287356321839,
"grad_norm": 2.2631373957215732,
"learning_rate": 1.4068849855568042e-06,
"loss": 0.0513,
"step": 1180
},
{
"epoch": 5.471264367816092,
"grad_norm": 4.183885604965178,
"learning_rate": 1.3618887918825752e-06,
"loss": 0.0547,
"step": 1190
},
{
"epoch": 5.517241379310345,
"grad_norm": 2.149593926911987,
"learning_rate": 1.3173529689837355e-06,
"loss": 0.0543,
"step": 1200
},
{
"epoch": 5.563218390804598,
"grad_norm": 2.17719523603525,
"learning_rate": 1.2732955317881563e-06,
"loss": 0.0544,
"step": 1210
},
{
"epoch": 5.609195402298851,
"grad_norm": 2.247756297696003,
"learning_rate": 1.2297343017146727e-06,
"loss": 0.0529,
"step": 1220
},
{
"epoch": 5.655172413793103,
"grad_norm": 1.7927074781175658,
"learning_rate": 1.1866868994642535e-06,
"loss": 0.0514,
"step": 1230
},
{
"epoch": 5.7011494252873565,
"grad_norm": 2.4235617167647816,
"learning_rate": 1.1441707378923475e-06,
"loss": 0.0529,
"step": 1240
},
{
"epoch": 5.747126436781609,
"grad_norm": 1.7438808513899655,
"learning_rate": 1.1022030149653134e-06,
"loss": 0.0524,
"step": 1250
},
{
"epoch": 5.793103448275862,
"grad_norm": 2.080026881638302,
"learning_rate": 1.0608007068037702e-06,
"loss": 0.0528,
"step": 1260
},
{
"epoch": 5.8390804597701145,
"grad_norm": 1.9309457262833183,
"learning_rate": 1.0199805608156802e-06,
"loss": 0.0504,
"step": 1270
},
{
"epoch": 5.885057471264368,
"grad_norm": 2.1120867508468493,
"learning_rate": 9.79759088921959e-07,
"loss": 0.0524,
"step": 1280
},
{
"epoch": 5.931034482758621,
"grad_norm": 1.803440728603622,
"learning_rate": 9.401525608773293e-07,
"loss": 0.0453,
"step": 1290
},
{
"epoch": 5.977011494252873,
"grad_norm": 1.756476820082346,
"learning_rate": 9.011769976891368e-07,
"loss": 0.0483,
"step": 1300
},
{
"epoch": 6.022988505747127,
"grad_norm": 1.14195571634091,
"learning_rate": 8.628481651367876e-07,
"loss": 0.0373,
"step": 1310
},
{
"epoch": 6.068965517241379,
"grad_norm": 1.2823038089726142,
"learning_rate": 8.25181567394422e-07,
"loss": 0.0257,
"step": 1320
},
{
"epoch": 6.114942528735632,
"grad_norm": 1.9765342255678002,
"learning_rate": 7.88192440759413e-07,
"loss": 0.0243,
"step": 1330
},
{
"epoch": 6.160919540229885,
"grad_norm": 1.4924174584280798,
"learning_rate": 7.51895747489215e-07,
"loss": 0.0244,
"step": 1340
},
{
"epoch": 6.206896551724138,
"grad_norm": 1.4740924405566593,
"learning_rate": 7.163061697490742e-07,
"loss": 0.0253,
"step": 1350
},
{
"epoch": 6.252873563218391,
"grad_norm": 1.2039837293309412,
"learning_rate": 6.814381036730275e-07,
"loss": 0.0237,
"step": 1360
},
{
"epoch": 6.2988505747126435,
"grad_norm": 1.5435771614300633,
"learning_rate": 6.473056535406036e-07,
"loss": 0.0247,
"step": 1370
},
{
"epoch": 6.344827586206897,
"grad_norm": 1.6972735405981059,
"learning_rate": 6.139226260715872e-07,
"loss": 0.0232,
"step": 1380
},
{
"epoch": 6.390804597701149,
"grad_norm": 1.7603825909556183,
"learning_rate": 5.813025248411397e-07,
"loss": 0.0209,
"step": 1390
},
{
"epoch": 6.436781609195402,
"grad_norm": 1.7734840776368426,
"learning_rate": 5.494585448175474e-07,
"loss": 0.0248,
"step": 1400
},
{
"epoch": 6.482758620689655,
"grad_norm": 2.155524841148084,
"learning_rate": 5.184035670247989e-07,
"loss": 0.0251,
"step": 1410
},
{
"epoch": 6.528735632183908,
"grad_norm": 2.1248857415061315,
"learning_rate": 4.881501533321605e-07,
"loss": 0.0238,
"step": 1420
},
{
"epoch": 6.574712643678161,
"grad_norm": 1.2638686532353145,
"learning_rate": 4.587105413728457e-07,
"loss": 0.0227,
"step": 1430
},
{
"epoch": 6.620689655172414,
"grad_norm": 1.3539825062665019,
"learning_rate": 4.3009663959383776e-07,
"loss": 0.0229,
"step": 1440
},
{
"epoch": 6.666666666666667,
"grad_norm": 1.2318013380818635,
"learning_rate": 4.0232002243887873e-07,
"loss": 0.0244,
"step": 1450
},
{
"epoch": 6.712643678160919,
"grad_norm": 2.1272364662366887,
"learning_rate": 3.7539192566655254e-07,
"loss": 0.0234,
"step": 1460
},
{
"epoch": 6.758620689655173,
"grad_norm": 1.4894449105223861,
"learning_rate": 3.493232418053774e-07,
"loss": 0.0223,
"step": 1470
},
{
"epoch": 6.804597701149425,
"grad_norm": 1.5981961713791006,
"learning_rate": 3.24124515747731e-07,
"loss": 0.0218,
"step": 1480
},
{
"epoch": 6.850574712643678,
"grad_norm": 1.6361128000262934,
"learning_rate": 2.9980594048439477e-07,
"loss": 0.0226,
"step": 1490
},
{
"epoch": 6.896551724137931,
"grad_norm": 2.3404616936998455,
"learning_rate": 2.7637735298145064e-07,
"loss": 0.0212,
"step": 1500
},
{
"epoch": 6.942528735632184,
"grad_norm": 1.7100444245019304,
"learning_rate": 2.538482302011822e-07,
"loss": 0.0225,
"step": 1510
},
{
"epoch": 6.988505747126437,
"grad_norm": 1.4325323759545483,
"learning_rate": 2.3222768526860701e-07,
"loss": 0.0227,
"step": 1520
},
{
"epoch": 7.0344827586206895,
"grad_norm": 0.8626586832251956,
"learning_rate": 2.115244637851782e-07,
"loss": 0.0151,
"step": 1530
},
{
"epoch": 7.080459770114943,
"grad_norm": 0.650488570824418,
"learning_rate": 1.9174694029115148e-07,
"loss": 0.0123,
"step": 1540
},
{
"epoch": 7.126436781609195,
"grad_norm": 2.091704655001972,
"learning_rate": 1.7290311487804689e-07,
"loss": 0.0126,
"step": 1550
},
{
"epoch": 7.172413793103448,
"grad_norm": 0.8790217013924919,
"learning_rate": 1.5500060995258136e-07,
"loss": 0.0128,
"step": 1560
},
{
"epoch": 7.218390804597701,
"grad_norm": 0.7303846115721658,
"learning_rate": 1.3804666715337117e-07,
"loss": 0.0122,
"step": 1570
},
{
"epoch": 7.264367816091954,
"grad_norm": 1.0662215851718453,
"learning_rate": 1.2204814442165814e-07,
"loss": 0.0124,
"step": 1580
},
{
"epoch": 7.310344827586207,
"grad_norm": 0.8739271592863969,
"learning_rate": 1.0701151322724451e-07,
"loss": 0.0117,
"step": 1590
},
{
"epoch": 7.35632183908046,
"grad_norm": 0.7737781041692757,
"learning_rate": 9.294285595075669e-08,
"loss": 0.0122,
"step": 1600
},
{
"epoch": 7.402298850574713,
"grad_norm": 0.7170467686655235,
"learning_rate": 7.984786342329493e-08,
"loss": 0.0133,
"step": 1610
},
{
"epoch": 7.448275862068965,
"grad_norm": 0.6837027766335966,
"learning_rate": 6.773183262446914e-08,
"loss": 0.0112,
"step": 1620
},
{
"epoch": 7.494252873563219,
"grad_norm": 0.9449591802350379,
"learning_rate": 5.65996645397493e-08,
"loss": 0.0139,
"step": 1630
},
{
"epoch": 7.540229885057471,
"grad_norm": 1.999894266793329,
"learning_rate": 4.645586217799453e-08,
"loss": 0.0131,
"step": 1640
},
{
"epoch": 7.586206896551724,
"grad_norm": 0.8351980509392402,
"learning_rate": 3.730452874996737e-08,
"loss": 0.0124,
"step": 1650
},
{
"epoch": 7.6321839080459775,
"grad_norm": 0.7193434252067797,
"learning_rate": 2.914936600856899e-08,
"loss": 0.0112,
"step": 1660
},
{
"epoch": 7.67816091954023,
"grad_norm": 0.8238436973186618,
"learning_rate": 2.199367275146358e-08,
"loss": 0.0112,
"step": 1670
},
{
"epoch": 7.724137931034483,
"grad_norm": 1.05150261931409,
"learning_rate": 1.5840343486700216e-08,
"loss": 0.0133,
"step": 1680
},
{
"epoch": 7.7701149425287355,
"grad_norm": 0.8116973247797554,
"learning_rate": 1.0691867261874155e-08,
"loss": 0.0116,
"step": 1690
},
{
"epoch": 7.816091954022989,
"grad_norm": 0.8080179165425967,
"learning_rate": 6.550326657293882e-09,
"loss": 0.0111,
"step": 1700
},
{
"epoch": 7.862068965517241,
"grad_norm": 0.8576475297914047,
"learning_rate": 3.4173969435710717e-09,
"loss": 0.0121,
"step": 1710
},
{
"epoch": 7.908045977011494,
"grad_norm": 1.1647467814066779,
"learning_rate": 1.2943454039654467e-09,
"loss": 0.0122,
"step": 1720
},
{
"epoch": 7.954022988505747,
"grad_norm": 0.6589417835112291,
"learning_rate": 1.8203082176287967e-10,
"loss": 0.0125,
"step": 1730
},
{
"epoch": 7.9816091954022985,
"step": 1736,
"total_flos": 118981985435648.0,
"train_loss": 0.19023791693305503,
"train_runtime": 9688.5007,
"train_samples_per_second": 1.436,
"train_steps_per_second": 0.179
}
],
"logging_steps": 10,
"max_steps": 1736,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 118981985435648.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}