{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.3932878867330886,
"eval_steps": 500,
"global_step": 1500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00026219192448872575,
"grad_norm": 22.20619010925293,
"learning_rate": 0.0,
"loss": 10.5131,
"step": 1
},
{
"epoch": 0.0026219192448872575,
"grad_norm": 22.429588317871094,
"learning_rate": 4.4999999999999996e-05,
"loss": 10.4662,
"step": 10
},
{
"epoch": 0.005243838489774515,
"grad_norm": 22.83245086669922,
"learning_rate": 9.5e-05,
"loss": 10.1612,
"step": 20
},
{
"epoch": 0.007865757734661772,
"grad_norm": 23.247602462768555,
"learning_rate": 0.000145,
"loss": 9.5256,
"step": 30
},
{
"epoch": 0.01048767697954903,
"grad_norm": 23.51291275024414,
"learning_rate": 0.00019500000000000002,
"loss": 8.5708,
"step": 40
},
{
"epoch": 0.013109596224436287,
"grad_norm": 22.496492385864258,
"learning_rate": 0.000245,
"loss": 7.3388,
"step": 50
},
{
"epoch": 0.015731515469323543,
"grad_norm": 16.345460891723633,
"learning_rate": 0.000295,
"loss": 5.9703,
"step": 60
},
{
"epoch": 0.018353434714210803,
"grad_norm": 3.921259880065918,
"learning_rate": 0.000345,
"loss": 4.9478,
"step": 70
},
{
"epoch": 0.02097535395909806,
"grad_norm": 7.0385589599609375,
"learning_rate": 0.000395,
"loss": 4.6803,
"step": 80
},
{
"epoch": 0.023597273203985317,
"grad_norm": 2.6207873821258545,
"learning_rate": 0.00044500000000000003,
"loss": 4.4974,
"step": 90
},
{
"epoch": 0.026219192448872573,
"grad_norm": 1.9961260557174683,
"learning_rate": 0.000495,
"loss": 4.3314,
"step": 100
},
{
"epoch": 0.028841111693759833,
"grad_norm": 1.6183704137802124,
"learning_rate": 0.000545,
"loss": 4.1959,
"step": 110
},
{
"epoch": 0.03146303093864709,
"grad_norm": 1.331021785736084,
"learning_rate": 0.0005949999999999999,
"loss": 4.0158,
"step": 120
},
{
"epoch": 0.03408495018353435,
"grad_norm": 1.14554762840271,
"learning_rate": 0.0006450000000000001,
"loss": 3.9321,
"step": 130
},
{
"epoch": 0.03670686942842161,
"grad_norm": 0.9175837635993958,
"learning_rate": 0.000695,
"loss": 3.802,
"step": 140
},
{
"epoch": 0.03932878867330886,
"grad_norm": 0.7335033416748047,
"learning_rate": 0.000745,
"loss": 3.6618,
"step": 150
},
{
"epoch": 0.04195070791819612,
"grad_norm": 0.5916274785995483,
"learning_rate": 0.000795,
"loss": 3.5341,
"step": 160
},
{
"epoch": 0.04457262716308338,
"grad_norm": 0.4947799742221832,
"learning_rate": 0.0008449999999999999,
"loss": 3.5311,
"step": 170
},
{
"epoch": 0.04719454640797063,
"grad_norm": 0.40263015031814575,
"learning_rate": 0.0008950000000000001,
"loss": 3.4709,
"step": 180
},
{
"epoch": 0.04981646565285789,
"grad_norm": 0.32677406072616577,
"learning_rate": 0.000945,
"loss": 3.2973,
"step": 190
},
{
"epoch": 0.05243838489774515,
"grad_norm": 0.3071628212928772,
"learning_rate": 0.000995,
"loss": 3.28,
"step": 200
},
{
"epoch": 0.05506030414263241,
"grad_norm": 0.3233015835285187,
"learning_rate": 0.001045,
"loss": 3.2038,
"step": 210
},
{
"epoch": 0.05768222338751967,
"grad_norm": 0.39402100443840027,
"learning_rate": 0.001095,
"loss": 3.1627,
"step": 220
},
{
"epoch": 0.060304142632406924,
"grad_norm": 0.5528343915939331,
"learning_rate": 0.001145,
"loss": 3.1341,
"step": 230
},
{
"epoch": 0.06292606187729417,
"grad_norm": 0.4888489842414856,
"learning_rate": 0.001195,
"loss": 3.0192,
"step": 240
},
{
"epoch": 0.06554798112218144,
"grad_norm": 0.5662292838096619,
"learning_rate": 0.0012450000000000002,
"loss": 2.991,
"step": 250
},
{
"epoch": 0.0681699003670687,
"grad_norm": 0.5800466537475586,
"learning_rate": 0.001295,
"loss": 2.992,
"step": 260
},
{
"epoch": 0.07079181961195595,
"grad_norm": 0.5511091947555542,
"learning_rate": 0.001345,
"loss": 2.9246,
"step": 270
},
{
"epoch": 0.07341373885684321,
"grad_norm": 0.7486537098884583,
"learning_rate": 0.001395,
"loss": 2.8996,
"step": 280
},
{
"epoch": 0.07603565810173046,
"grad_norm": 0.6995801329612732,
"learning_rate": 0.001445,
"loss": 2.7945,
"step": 290
},
{
"epoch": 0.07865757734661773,
"grad_norm": 0.7938666939735413,
"learning_rate": 0.0014950000000000002,
"loss": 2.7632,
"step": 300
},
{
"epoch": 0.08127949659150498,
"grad_norm": 0.7555065155029297,
"learning_rate": 0.001545,
"loss": 2.7513,
"step": 310
},
{
"epoch": 0.08390141583639224,
"grad_norm": 0.7714865803718567,
"learning_rate": 0.001595,
"loss": 2.6165,
"step": 320
},
{
"epoch": 0.08652333508127949,
"grad_norm": 0.7604843974113464,
"learning_rate": 0.001645,
"loss": 2.6391,
"step": 330
},
{
"epoch": 0.08914525432616675,
"grad_norm": 0.7840315699577332,
"learning_rate": 0.0016950000000000001,
"loss": 2.5818,
"step": 340
},
{
"epoch": 0.09176717357105402,
"grad_norm": 1.0126832723617554,
"learning_rate": 0.0017450000000000002,
"loss": 2.5417,
"step": 350
},
{
"epoch": 0.09438909281594127,
"grad_norm": 1.0092129707336426,
"learning_rate": 0.001795,
"loss": 2.4844,
"step": 360
},
{
"epoch": 0.09701101206082853,
"grad_norm": 1.1585489511489868,
"learning_rate": 0.001845,
"loss": 2.4645,
"step": 370
},
{
"epoch": 0.09963293130571578,
"grad_norm": 1.0778034925460815,
"learning_rate": 0.001895,
"loss": 2.4003,
"step": 380
},
{
"epoch": 0.10225485055060304,
"grad_norm": 1.146636962890625,
"learning_rate": 0.0019450000000000001,
"loss": 2.3466,
"step": 390
},
{
"epoch": 0.1048767697954903,
"grad_norm": 0.9742526412010193,
"learning_rate": 0.0019950000000000002,
"loss": 2.3088,
"step": 400
},
{
"epoch": 0.10749868904037756,
"grad_norm": 1.3035728931427002,
"learning_rate": 0.0019999657054386192,
"loss": 2.2834,
"step": 410
},
{
"epoch": 0.11012060828526482,
"grad_norm": 1.0689384937286377,
"learning_rate": 0.0019998471593574603,
"loss": 2.2473,
"step": 420
},
{
"epoch": 0.11274252753015207,
"grad_norm": 1.1519441604614258,
"learning_rate": 0.001999643948402709,
"loss": 2.1925,
"step": 430
},
{
"epoch": 0.11536444677503933,
"grad_norm": 0.9427940249443054,
"learning_rate": 0.0019993560897818255,
"loss": 2.1774,
"step": 440
},
{
"epoch": 0.11798636601992658,
"grad_norm": 0.9017934203147888,
"learning_rate": 0.0019989836078700496,
"loss": 2.152,
"step": 450
},
{
"epoch": 0.12060828526481385,
"grad_norm": 1.018966555595398,
"learning_rate": 0.001998526534208335,
"loss": 2.0825,
"step": 460
},
{
"epoch": 0.1232302045097011,
"grad_norm": 1.0533466339111328,
"learning_rate": 0.0019979849075006813,
"loss": 2.1358,
"step": 470
},
{
"epoch": 0.12585212375458835,
"grad_norm": 0.941605806350708,
"learning_rate": 0.001997358773610856,
"loss": 2.0524,
"step": 480
},
{
"epoch": 0.12847404299947562,
"grad_norm": 0.8877449035644531,
"learning_rate": 0.0019966481855585075,
"loss": 2.0308,
"step": 490
},
{
"epoch": 0.13109596224436287,
"grad_norm": 0.8652307391166687,
"learning_rate": 0.001995853203514682,
"loss": 2.012,
"step": 500
},
{
"epoch": 0.13371788148925012,
"grad_norm": 0.8943641781806946,
"learning_rate": 0.0019949738947967217,
"loss": 1.9729,
"step": 510
},
{
"epoch": 0.1363398007341374,
"grad_norm": 0.9359736442565918,
"learning_rate": 0.001994010333862568,
"loss": 1.9997,
"step": 520
},
{
"epoch": 0.13896171997902465,
"grad_norm": 1.0085017681121826,
"learning_rate": 0.001992962602304456,
"loss": 1.937,
"step": 530
},
{
"epoch": 0.1415836392239119,
"grad_norm": 0.7549618482589722,
"learning_rate": 0.0019918307888420065,
"loss": 1.9268,
"step": 540
},
{
"epoch": 0.14420555846879915,
"grad_norm": 0.8932085037231445,
"learning_rate": 0.0019906149893147104,
"loss": 1.9014,
"step": 550
},
{
"epoch": 0.14682747771368643,
"grad_norm": 0.8130724430084229,
"learning_rate": 0.001989315306673817,
"loss": 1.8577,
"step": 560
},
{
"epoch": 0.14944939695857368,
"grad_norm": 0.8497139811515808,
"learning_rate": 0.0019879318509736137,
"loss": 1.8185,
"step": 570
},
{
"epoch": 0.15207131620346093,
"grad_norm": 0.6299962997436523,
"learning_rate": 0.001986464739362106,
"loss": 1.811,
"step": 580
},
{
"epoch": 0.1546932354483482,
"grad_norm": 0.7180768251419067,
"learning_rate": 0.0019849140960711024,
"loss": 1.7944,
"step": 590
},
{
"epoch": 0.15731515469323545,
"grad_norm": 0.8082334399223328,
"learning_rate": 0.0019832800524056888,
"loss": 1.8333,
"step": 600
},
{
"epoch": 0.1599370739381227,
"grad_norm": 0.8284159302711487,
"learning_rate": 0.0019815627467331142,
"loss": 1.811,
"step": 610
},
{
"epoch": 0.16255899318300995,
"grad_norm": 0.7332941293716431,
"learning_rate": 0.0019797623244710715,
"loss": 1.7704,
"step": 620
},
{
"epoch": 0.16518091242789723,
"grad_norm": 0.7234723567962646,
"learning_rate": 0.0019778789380753862,
"loss": 1.7558,
"step": 630
},
{
"epoch": 0.16780283167278448,
"grad_norm": 0.693242073059082,
"learning_rate": 0.001975912747027104,
"loss": 1.742,
"step": 640
},
{
"epoch": 0.17042475091767173,
"grad_norm": 0.8523733019828796,
"learning_rate": 0.0019738639178189885,
"loss": 1.7438,
"step": 650
},
{
"epoch": 0.17304667016255898,
"grad_norm": 0.7505561709403992,
"learning_rate": 0.001971732623941422,
"loss": 1.7251,
"step": 660
},
{
"epoch": 0.17566858940744626,
"grad_norm": 0.7338821887969971,
"learning_rate": 0.0019695190458677144,
"loss": 1.7281,
"step": 670
},
{
"epoch": 0.1782905086523335,
"grad_norm": 0.8278585076332092,
"learning_rate": 0.001967223371038823,
"loss": 1.6983,
"step": 680
},
{
"epoch": 0.18091242789722076,
"grad_norm": 0.6785498261451721,
"learning_rate": 0.0019648457938474776,
"loss": 1.7018,
"step": 690
},
{
"epoch": 0.18353434714210803,
"grad_norm": 0.7954968810081482,
"learning_rate": 0.0019623865156217215,
"loss": 1.6978,
"step": 700
},
{
"epoch": 0.18615626638699528,
"grad_norm": 0.6877925992012024,
"learning_rate": 0.001959845744607864,
"loss": 1.6693,
"step": 710
},
{
"epoch": 0.18877818563188253,
"grad_norm": 0.6183112859725952,
"learning_rate": 0.001957223695952844,
"loss": 1.656,
"step": 720
},
{
"epoch": 0.19140010487676978,
"grad_norm": 0.6864896416664124,
"learning_rate": 0.0019545205916860152,
"loss": 1.6188,
"step": 730
},
{
"epoch": 0.19402202412165706,
"grad_norm": 0.6678555011749268,
"learning_rate": 0.0019517366607003429,
"loss": 1.6195,
"step": 740
},
{
"epoch": 0.1966439433665443,
"grad_norm": 0.724320113658905,
"learning_rate": 0.0019488721387330222,
"loss": 1.6067,
"step": 750
},
{
"epoch": 0.19926586261143156,
"grad_norm": 0.6665757298469543,
"learning_rate": 0.0019459272683455162,
"loss": 1.5781,
"step": 760
},
{
"epoch": 0.20188778185631884,
"grad_norm": 0.7139772772789001,
"learning_rate": 0.0019429022989030176,
"loss": 1.5647,
"step": 770
},
{
"epoch": 0.2045097011012061,
"grad_norm": 0.6505457758903503,
"learning_rate": 0.0019397974865533315,
"loss": 1.5869,
"step": 780
},
{
"epoch": 0.20713162034609334,
"grad_norm": 0.6815754175186157,
"learning_rate": 0.001936613094205186,
"loss": 1.5848,
"step": 790
},
{
"epoch": 0.2097535395909806,
"grad_norm": 0.6977171897888184,
"learning_rate": 0.00193334939150597,
"loss": 1.5284,
"step": 800
},
{
"epoch": 0.21237545883586786,
"grad_norm": 0.5965753197669983,
"learning_rate": 0.0019300066548188998,
"loss": 1.5468,
"step": 810
},
{
"epoch": 0.2149973780807551,
"grad_norm": 0.596052885055542,
"learning_rate": 0.001926585167199616,
"loss": 1.5579,
"step": 820
},
{
"epoch": 0.21761929732564236,
"grad_norm": 0.6821017861366272,
"learning_rate": 0.001923085218372218,
"loss": 1.4984,
"step": 830
},
{
"epoch": 0.22024121657052964,
"grad_norm": 0.6523297429084778,
"learning_rate": 0.0019195071047047277,
"loss": 1.537,
"step": 840
},
{
"epoch": 0.2228631358154169,
"grad_norm": 0.648935079574585,
"learning_rate": 0.0019158511291839945,
"loss": 1.5192,
"step": 850
},
{
"epoch": 0.22548505506030414,
"grad_norm": 0.6102792620658875,
"learning_rate": 0.0019121176013900407,
"loss": 1.5209,
"step": 860
},
{
"epoch": 0.2281069743051914,
"grad_norm": 0.6573307514190674,
"learning_rate": 0.0019083068374698448,
"loss": 1.49,
"step": 870
},
{
"epoch": 0.23072889355007867,
"grad_norm": 0.6355723738670349,
"learning_rate": 0.0019044191601105727,
"loss": 1.4929,
"step": 880
},
{
"epoch": 0.23335081279496592,
"grad_norm": 0.5931225419044495,
"learning_rate": 0.0019004548985122511,
"loss": 1.4813,
"step": 890
},
{
"epoch": 0.23597273203985317,
"grad_norm": 0.6640650629997253,
"learning_rate": 0.0018964143883598936,
"loss": 1.4808,
"step": 900
},
{
"epoch": 0.23859465128474042,
"grad_norm": 0.6377866268157959,
"learning_rate": 0.0018922979717950748,
"loss": 1.4901,
"step": 910
},
{
"epoch": 0.2412165705296277,
"grad_norm": 0.6502982378005981,
"learning_rate": 0.0018881059973869581,
"loss": 1.4501,
"step": 920
},
{
"epoch": 0.24383848977451494,
"grad_norm": 0.602969765663147,
"learning_rate": 0.0018838388201027805,
"loss": 1.4661,
"step": 930
},
{
"epoch": 0.2464604090194022,
"grad_norm": 0.6061879396438599,
"learning_rate": 0.001879496801277794,
"loss": 1.4408,
"step": 940
},
{
"epoch": 0.24908232826428947,
"grad_norm": 0.8049127459526062,
"learning_rate": 0.001875080308584669,
"loss": 1.4466,
"step": 950
},
{
"epoch": 0.2517042475091767,
"grad_norm": 0.46771517395973206,
"learning_rate": 0.00187058971600236,
"loss": 1.4382,
"step": 960
},
{
"epoch": 0.254326166754064,
"grad_norm": 0.6081333756446838,
"learning_rate": 0.001866025403784439,
"loss": 1.4518,
"step": 970
},
{
"epoch": 0.25694808599895125,
"grad_norm": 0.6247040033340454,
"learning_rate": 0.0018613877584268944,
"loss": 1.4639,
"step": 980
},
{
"epoch": 0.2595700052438385,
"grad_norm": 0.5699506998062134,
"learning_rate": 0.0018566771726354063,
"loss": 1.4218,
"step": 990
},
{
"epoch": 0.26219192448872575,
"grad_norm": 0.5360729694366455,
"learning_rate": 0.0018518940452920906,
"loss": 1.4189,
"step": 1000
},
{
"epoch": 0.264813843733613,
"grad_norm": 0.5921474695205688,
"learning_rate": 0.0018470387814217232,
"loss": 1.424,
"step": 1010
},
{
"epoch": 0.26743576297850025,
"grad_norm": 0.6162559986114502,
"learning_rate": 0.0018421117921574438,
"loss": 1.4307,
"step": 1020
},
{
"epoch": 0.2700576822233875,
"grad_norm": 0.5530286431312561,
"learning_rate": 0.001837113494705942,
"loss": 1.4158,
"step": 1030
},
{
"epoch": 0.2726796014682748,
"grad_norm": 0.5585499405860901,
"learning_rate": 0.0018320443123121283,
"loss": 1.3861,
"step": 1040
},
{
"epoch": 0.27530152071316205,
"grad_norm": 0.6225973963737488,
"learning_rate": 0.0018269046742232966,
"loss": 1.3942,
"step": 1050
},
{
"epoch": 0.2779234399580493,
"grad_norm": 0.49642321467399597,
"learning_rate": 0.0018216950156527737,
"loss": 1.3912,
"step": 1060
},
{
"epoch": 0.28054535920293655,
"grad_norm": 0.6089576482772827,
"learning_rate": 0.0018164157777430681,
"loss": 1.3732,
"step": 1070
},
{
"epoch": 0.2831672784478238,
"grad_norm": 0.5753847360610962,
"learning_rate": 0.0018110674075285157,
"loss": 1.398,
"step": 1080
},
{
"epoch": 0.28578919769271105,
"grad_norm": 0.5357734560966492,
"learning_rate": 0.0018056503578974242,
"loss": 1.3851,
"step": 1090
},
{
"epoch": 0.2884111169375983,
"grad_norm": 0.5319791436195374,
"learning_rate": 0.001800165087553724,
"loss": 1.3804,
"step": 1100
},
{
"epoch": 0.2910330361824856,
"grad_norm": 0.5765709280967712,
"learning_rate": 0.0017946120609781276,
"loss": 1.3534,
"step": 1110
},
{
"epoch": 0.29365495542737285,
"grad_norm": 0.48765453696250916,
"learning_rate": 0.001788991748388796,
"loss": 1.3693,
"step": 1120
},
{
"epoch": 0.2962768746722601,
"grad_norm": 0.5916075110435486,
"learning_rate": 0.001783304625701524,
"loss": 1.3697,
"step": 1130
},
{
"epoch": 0.29889879391714735,
"grad_norm": 0.411699503660202,
"learning_rate": 0.0017775511744894384,
"loss": 1.3588,
"step": 1140
},
{
"epoch": 0.3015207131620346,
"grad_norm": 0.5155631899833679,
"learning_rate": 0.0017717318819422214,
"loss": 1.3697,
"step": 1150
},
{
"epoch": 0.30414263240692185,
"grad_norm": 0.5687488913536072,
"learning_rate": 0.0017658472408248551,
"loss": 1.3558,
"step": 1160
},
{
"epoch": 0.3067645516518091,
"grad_norm": 0.5609891414642334,
"learning_rate": 0.0017598977494358967,
"loss": 1.3376,
"step": 1170
},
{
"epoch": 0.3093864708966964,
"grad_norm": 0.5137512683868408,
"learning_rate": 0.0017538839115652817,
"loss": 1.3534,
"step": 1180
},
{
"epoch": 0.31200839014158366,
"grad_norm": 0.5840641260147095,
"learning_rate": 0.001747806236451666,
"loss": 1.3394,
"step": 1190
},
{
"epoch": 0.3146303093864709,
"grad_norm": 0.5758949518203735,
"learning_rate": 0.0017416652387393027,
"loss": 1.3417,
"step": 1200
},
{
"epoch": 0.31725222863135816,
"grad_norm": 0.5121742486953735,
"learning_rate": 0.0017354614384344658,
"loss": 1.341,
"step": 1210
},
{
"epoch": 0.3198741478762454,
"grad_norm": 0.5056650638580322,
"learning_rate": 0.001729195360861414,
"loss": 1.316,
"step": 1220
},
{
"epoch": 0.32249606712113266,
"grad_norm": 0.4782615602016449,
"learning_rate": 0.0017228675366179106,
"loss": 1.3226,
"step": 1230
},
{
"epoch": 0.3251179863660199,
"grad_norm": 0.49403342604637146,
"learning_rate": 0.0017164785015302906,
"loss": 1.37,
"step": 1240
},
{
"epoch": 0.3277399056109072,
"grad_norm": 0.4836321175098419,
"learning_rate": 0.0017100287966080906,
"loss": 1.3272,
"step": 1250
},
{
"epoch": 0.33036182485579446,
"grad_norm": 0.48174890875816345,
"learning_rate": 0.001703518967998236,
"loss": 1.3148,
"step": 1260
},
{
"epoch": 0.3329837441006817,
"grad_norm": 0.4627121090888977,
"learning_rate": 0.001696949566938795,
"loss": 1.3161,
"step": 1270
},
{
"epoch": 0.33560566334556896,
"grad_norm": 0.470414936542511,
"learning_rate": 0.0016903211497123003,
"loss": 1.3313,
"step": 1280
},
{
"epoch": 0.3382275825904562,
"grad_norm": 0.4437310993671417,
"learning_rate": 0.0016836342775986446,
"loss": 1.3073,
"step": 1290
},
{
"epoch": 0.34084950183534346,
"grad_norm": 0.47688329219818115,
"learning_rate": 0.0016768895168275534,
"loss": 1.3128,
"step": 1300
},
{
"epoch": 0.3434714210802307,
"grad_norm": 0.5143507122993469,
"learning_rate": 0.0016700874385306363,
"loss": 1.3357,
"step": 1310
},
{
"epoch": 0.34609334032511796,
"grad_norm": 0.4100657105445862,
"learning_rate": 0.0016632286186930275,
"loss": 1.3061,
"step": 1320
},
{
"epoch": 0.34871525957000526,
"grad_norm": 0.4421868920326233,
"learning_rate": 0.0016563136381046088,
"loss": 1.3158,
"step": 1330
},
{
"epoch": 0.3513371788148925,
"grad_norm": 0.4668099582195282,
"learning_rate": 0.0016493430823108332,
"loss": 1.3088,
"step": 1340
},
{
"epoch": 0.35395909805977976,
"grad_norm": 0.5451709032058716,
"learning_rate": 0.0016423175415631404,
"loss": 1.3344,
"step": 1350
},
{
"epoch": 0.356581017304667,
"grad_norm": 0.45294106006622314,
"learning_rate": 0.0016352376107689754,
"loss": 1.2778,
"step": 1360
},
{
"epoch": 0.35920293654955426,
"grad_norm": 0.4404051601886749,
"learning_rate": 0.0016281038894414143,
"loss": 1.2871,
"step": 1370
},
{
"epoch": 0.3618248557944415,
"grad_norm": 0.45863279700279236,
"learning_rate": 0.0016209169816483971,
"loss": 1.3286,
"step": 1380
},
{
"epoch": 0.36444677503932876,
"grad_norm": 0.45011425018310547,
"learning_rate": 0.0016136774959615784,
"loss": 1.2979,
"step": 1390
},
{
"epoch": 0.36706869428421607,
"grad_norm": 0.5113876461982727,
"learning_rate": 0.0016063860454047943,
"loss": 1.3088,
"step": 1400
},
{
"epoch": 0.3696906135291033,
"grad_norm": 0.40740302205085754,
"learning_rate": 0.001599043247402151,
"loss": 1.2703,
"step": 1410
},
{
"epoch": 0.37231253277399057,
"grad_norm": 0.4261358976364136,
"learning_rate": 0.0015916497237257455,
"loss": 1.2681,
"step": 1420
},
{
"epoch": 0.3749344520188778,
"grad_norm": 0.4349290132522583,
"learning_rate": 0.0015842061004430145,
"loss": 1.317,
"step": 1430
},
{
"epoch": 0.37755637126376507,
"grad_norm": 0.4363626539707184,
"learning_rate": 0.0015767130078637183,
"loss": 1.2707,
"step": 1440
},
{
"epoch": 0.3801782905086523,
"grad_norm": 0.41238006949424744,
"learning_rate": 0.0015691710804865706,
"loss": 1.2763,
"step": 1450
},
{
"epoch": 0.38280020975353957,
"grad_norm": 0.476226270198822,
"learning_rate": 0.0015615809569455089,
"loss": 1.3037,
"step": 1460
},
{
"epoch": 0.38542212899842687,
"grad_norm": 0.45900896191596985,
"learning_rate": 0.0015539432799556159,
"loss": 1.287,
"step": 1470
},
{
"epoch": 0.3880440482433141,
"grad_norm": 0.3873949348926544,
"learning_rate": 0.0015462586962586972,
"loss": 1.2793,
"step": 1480
},
{
"epoch": 0.39066596748820137,
"grad_norm": 0.4380306601524353,
"learning_rate": 0.001538527856568515,
"loss": 1.2916,
"step": 1490
},
{
"epoch": 0.3932878867330886,
"grad_norm": 0.39479300379753113,
"learning_rate": 0.0015307514155156895,
"loss": 1.272,
"step": 1500
}
],
"logging_steps": 10,
"max_steps": 3814,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.0095396499845284e+18,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}