{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.3932878867330886,
"eval_steps": 500,
"global_step": 1500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00026219192448872575,
"grad_norm": 22.20619010925293,
"learning_rate": 0.0,
"loss": 10.5131,
"step": 1
},
{
"epoch": 0.0026219192448872575,
"grad_norm": 22.429588317871094,
"learning_rate": 4.4999999999999996e-05,
"loss": 10.4662,
"step": 10
},
{
"epoch": 0.005243838489774515,
"grad_norm": 22.83245086669922,
"learning_rate": 9.5e-05,
"loss": 10.1612,
"step": 20
},
{
"epoch": 0.007865757734661772,
"grad_norm": 23.247602462768555,
"learning_rate": 0.000145,
"loss": 9.5256,
"step": 30
},
{
"epoch": 0.01048767697954903,
"grad_norm": 23.51291275024414,
"learning_rate": 0.00019500000000000002,
"loss": 8.5708,
"step": 40
},
{
"epoch": 0.013109596224436287,
"grad_norm": 22.496492385864258,
"learning_rate": 0.000245,
"loss": 7.3388,
"step": 50
},
{
"epoch": 0.015731515469323543,
"grad_norm": 16.345460891723633,
"learning_rate": 0.000295,
"loss": 5.9703,
"step": 60
},
{
"epoch": 0.018353434714210803,
"grad_norm": 3.921259880065918,
"learning_rate": 0.000345,
"loss": 4.9478,
"step": 70
},
{
"epoch": 0.02097535395909806,
"grad_norm": 7.0385589599609375,
"learning_rate": 0.000395,
"loss": 4.6803,
"step": 80
},
{
"epoch": 0.023597273203985317,
"grad_norm": 2.6207873821258545,
"learning_rate": 0.00044500000000000003,
"loss": 4.4974,
"step": 90
},
{
"epoch": 0.026219192448872573,
"grad_norm": 1.9961260557174683,
"learning_rate": 0.000495,
"loss": 4.3314,
"step": 100
},
{
"epoch": 0.028841111693759833,
"grad_norm": 1.6183704137802124,
"learning_rate": 0.000545,
"loss": 4.1959,
"step": 110
},
{
"epoch": 0.03146303093864709,
"grad_norm": 1.331021785736084,
"learning_rate": 0.0005949999999999999,
"loss": 4.0158,
"step": 120
},
{
"epoch": 0.03408495018353435,
"grad_norm": 1.14554762840271,
"learning_rate": 0.0006450000000000001,
"loss": 3.9321,
"step": 130
},
{
"epoch": 0.03670686942842161,
"grad_norm": 0.9175837635993958,
"learning_rate": 0.000695,
"loss": 3.802,
"step": 140
},
{
"epoch": 0.03932878867330886,
"grad_norm": 0.7335033416748047,
"learning_rate": 0.000745,
"loss": 3.6618,
"step": 150
},
{
"epoch": 0.04195070791819612,
"grad_norm": 0.5916274785995483,
"learning_rate": 0.000795,
"loss": 3.5341,
"step": 160
},
{
"epoch": 0.04457262716308338,
"grad_norm": 0.4947799742221832,
"learning_rate": 0.0008449999999999999,
"loss": 3.5311,
"step": 170
},
{
"epoch": 0.04719454640797063,
"grad_norm": 0.40263015031814575,
"learning_rate": 0.0008950000000000001,
"loss": 3.4709,
"step": 180
},
{
"epoch": 0.04981646565285789,
"grad_norm": 0.32677406072616577,
"learning_rate": 0.000945,
"loss": 3.2973,
"step": 190
},
{
"epoch": 0.05243838489774515,
"grad_norm": 0.3071628212928772,
"learning_rate": 0.000995,
"loss": 3.28,
"step": 200
},
{
"epoch": 0.05506030414263241,
"grad_norm": 0.3233015835285187,
"learning_rate": 0.001045,
"loss": 3.2038,
"step": 210
},
{
"epoch": 0.05768222338751967,
"grad_norm": 0.39402100443840027,
"learning_rate": 0.001095,
"loss": 3.1627,
"step": 220
},
{
"epoch": 0.060304142632406924,
"grad_norm": 0.5528343915939331,
"learning_rate": 0.001145,
"loss": 3.1341,
"step": 230
},
{
"epoch": 0.06292606187729417,
"grad_norm": 0.4888489842414856,
"learning_rate": 0.001195,
"loss": 3.0192,
"step": 240
},
{
"epoch": 0.06554798112218144,
"grad_norm": 0.5662292838096619,
"learning_rate": 0.0012450000000000002,
"loss": 2.991,
"step": 250
},
{
"epoch": 0.0681699003670687,
"grad_norm": 0.5800466537475586,
"learning_rate": 0.001295,
"loss": 2.992,
"step": 260
},
{
"epoch": 0.07079181961195595,
"grad_norm": 0.5511091947555542,
"learning_rate": 0.001345,
"loss": 2.9246,
"step": 270
},
{
"epoch": 0.07341373885684321,
"grad_norm": 0.7486537098884583,
"learning_rate": 0.001395,
"loss": 2.8996,
"step": 280
},
{
"epoch": 0.07603565810173046,
"grad_norm": 0.6995801329612732,
"learning_rate": 0.001445,
"loss": 2.7945,
"step": 290
},
{
"epoch": 0.07865757734661773,
"grad_norm": 0.7938666939735413,
"learning_rate": 0.0014950000000000002,
"loss": 2.7632,
"step": 300
},
{
"epoch": 0.08127949659150498,
"grad_norm": 0.7555065155029297,
"learning_rate": 0.001545,
"loss": 2.7513,
"step": 310
},
{
"epoch": 0.08390141583639224,
"grad_norm": 0.7714865803718567,
"learning_rate": 0.001595,
"loss": 2.6165,
"step": 320
},
{
"epoch": 0.08652333508127949,
"grad_norm": 0.7604843974113464,
"learning_rate": 0.001645,
"loss": 2.6391,
"step": 330
},
{
"epoch": 0.08914525432616675,
"grad_norm": 0.7840315699577332,
"learning_rate": 0.0016950000000000001,
"loss": 2.5818,
"step": 340
},
{
"epoch": 0.09176717357105402,
"grad_norm": 1.0126832723617554,
"learning_rate": 0.0017450000000000002,
"loss": 2.5417,
"step": 350
},
{
"epoch": 0.09438909281594127,
"grad_norm": 1.0092129707336426,
"learning_rate": 0.001795,
"loss": 2.4844,
"step": 360
},
{
"epoch": 0.09701101206082853,
"grad_norm": 1.1585489511489868,
"learning_rate": 0.001845,
"loss": 2.4645,
"step": 370
},
{
"epoch": 0.09963293130571578,
"grad_norm": 1.0778034925460815,
"learning_rate": 0.001895,
"loss": 2.4003,
"step": 380
},
{
"epoch": 0.10225485055060304,
"grad_norm": 1.146636962890625,
"learning_rate": 0.0019450000000000001,
"loss": 2.3466,
"step": 390
},
{
"epoch": 0.1048767697954903,
"grad_norm": 0.9742526412010193,
"learning_rate": 0.0019950000000000002,
"loss": 2.3088,
"step": 400
},
{
"epoch": 0.10749868904037756,
"grad_norm": 1.3035728931427002,
"learning_rate": 0.0019999657054386192,
"loss": 2.2834,
"step": 410
},
{
"epoch": 0.11012060828526482,
"grad_norm": 1.0689384937286377,
"learning_rate": 0.0019998471593574603,
"loss": 2.2473,
"step": 420
},
{
"epoch": 0.11274252753015207,
"grad_norm": 1.1519441604614258,
"learning_rate": 0.001999643948402709,
"loss": 2.1925,
"step": 430
},
{
"epoch": 0.11536444677503933,
"grad_norm": 0.9427940249443054,
"learning_rate": 0.0019993560897818255,
"loss": 2.1774,
"step": 440
},
{
"epoch": 0.11798636601992658,
"grad_norm": 0.9017934203147888,
"learning_rate": 0.0019989836078700496,
"loss": 2.152,
"step": 450
},
{
"epoch": 0.12060828526481385,
"grad_norm": 1.018966555595398,
"learning_rate": 0.001998526534208335,
"loss": 2.0825,
"step": 460
},
{
"epoch": 0.1232302045097011,
"grad_norm": 1.0533466339111328,
"learning_rate": 0.0019979849075006813,
"loss": 2.1358,
"step": 470
},
{
"epoch": 0.12585212375458835,
"grad_norm": 0.941605806350708,
"learning_rate": 0.001997358773610856,
"loss": 2.0524,
"step": 480
},
{
"epoch": 0.12847404299947562,
"grad_norm": 0.8877449035644531,
"learning_rate": 0.0019966481855585075,
"loss": 2.0308,
"step": 490
},
{
"epoch": 0.13109596224436287,
"grad_norm": 0.8652307391166687,
"learning_rate": 0.001995853203514682,
"loss": 2.012,
"step": 500
},
{
"epoch": 0.13371788148925012,
"grad_norm": 0.8943641781806946,
"learning_rate": 0.0019949738947967217,
"loss": 1.9729,
"step": 510
},
{
"epoch": 0.1363398007341374,
"grad_norm": 0.9359736442565918,
"learning_rate": 0.001994010333862568,
"loss": 1.9997,
"step": 520
},
{
"epoch": 0.13896171997902465,
"grad_norm": 1.0085017681121826,
"learning_rate": 0.001992962602304456,
"loss": 1.937,
"step": 530
},
{
"epoch": 0.1415836392239119,
"grad_norm": 0.7549618482589722,
"learning_rate": 0.0019918307888420065,
"loss": 1.9268,
"step": 540
},
{
"epoch": 0.14420555846879915,
"grad_norm": 0.8932085037231445,
"learning_rate": 0.0019906149893147104,
"loss": 1.9014,
"step": 550
},
{
"epoch": 0.14682747771368643,
"grad_norm": 0.8130724430084229,
"learning_rate": 0.001989315306673817,
"loss": 1.8577,
"step": 560
},
{
"epoch": 0.14944939695857368,
"grad_norm": 0.8497139811515808,
"learning_rate": 0.0019879318509736137,
"loss": 1.8185,
"step": 570
},
{
"epoch": 0.15207131620346093,
"grad_norm": 0.6299962997436523,
"learning_rate": 0.001986464739362106,
"loss": 1.811,
"step": 580
},
{
"epoch": 0.1546932354483482,
"grad_norm": 0.7180768251419067,
"learning_rate": 0.0019849140960711024,
"loss": 1.7944,
"step": 590
},
{
"epoch": 0.15731515469323545,
"grad_norm": 0.8082334399223328,
"learning_rate": 0.0019832800524056888,
"loss": 1.8333,
"step": 600
},
{
"epoch": 0.1599370739381227,
"grad_norm": 0.8284159302711487,
"learning_rate": 0.0019815627467331142,
"loss": 1.811,
"step": 610
},
{
"epoch": 0.16255899318300995,
"grad_norm": 0.7332941293716431,
"learning_rate": 0.0019797623244710715,
"loss": 1.7704,
"step": 620
},
{
"epoch": 0.16518091242789723,
"grad_norm": 0.7234723567962646,
"learning_rate": 0.0019778789380753862,
"loss": 1.7558,
"step": 630
},
{
"epoch": 0.16780283167278448,
"grad_norm": 0.693242073059082,
"learning_rate": 0.001975912747027104,
"loss": 1.742,
"step": 640
},
{
"epoch": 0.17042475091767173,
"grad_norm": 0.8523733019828796,
"learning_rate": 0.0019738639178189885,
"loss": 1.7438,
"step": 650
},
{
"epoch": 0.17304667016255898,
"grad_norm": 0.7505561709403992,
"learning_rate": 0.001971732623941422,
"loss": 1.7251,
"step": 660
},
{
"epoch": 0.17566858940744626,
"grad_norm": 0.7338821887969971,
"learning_rate": 0.0019695190458677144,
"loss": 1.7281,
"step": 670
},
{
"epoch": 0.1782905086523335,
"grad_norm": 0.8278585076332092,
"learning_rate": 0.001967223371038823,
"loss": 1.6983,
"step": 680
},
{
"epoch": 0.18091242789722076,
"grad_norm": 0.6785498261451721,
"learning_rate": 0.0019648457938474776,
"loss": 1.7018,
"step": 690
},
{
"epoch": 0.18353434714210803,
"grad_norm": 0.7954968810081482,
"learning_rate": 0.0019623865156217215,
"loss": 1.6978,
"step": 700
},
{
"epoch": 0.18615626638699528,
"grad_norm": 0.6877925992012024,
"learning_rate": 0.001959845744607864,
"loss": 1.6693,
"step": 710
},
{
"epoch": 0.18877818563188253,
"grad_norm": 0.6183112859725952,
"learning_rate": 0.001957223695952844,
"loss": 1.656,
"step": 720
},
{
"epoch": 0.19140010487676978,
"grad_norm": 0.6864896416664124,
"learning_rate": 0.0019545205916860152,
"loss": 1.6188,
"step": 730
},
{
"epoch": 0.19402202412165706,
"grad_norm": 0.6678555011749268,
"learning_rate": 0.0019517366607003429,
"loss": 1.6195,
"step": 740
},
{
"epoch": 0.1966439433665443,
"grad_norm": 0.724320113658905,
"learning_rate": 0.0019488721387330222,
"loss": 1.6067,
"step": 750
},
{
"epoch": 0.19926586261143156,
"grad_norm": 0.6665757298469543,
"learning_rate": 0.0019459272683455162,
"loss": 1.5781,
"step": 760
},
{
"epoch": 0.20188778185631884,
"grad_norm": 0.7139772772789001,
"learning_rate": 0.0019429022989030176,
"loss": 1.5647,
"step": 770
},
{
"epoch": 0.2045097011012061,
"grad_norm": 0.6505457758903503,
"learning_rate": 0.0019397974865533315,
"loss": 1.5869,
"step": 780
},
{
"epoch": 0.20713162034609334,
"grad_norm": 0.6815754175186157,
"learning_rate": 0.001936613094205186,
"loss": 1.5848,
"step": 790
},
{
"epoch": 0.2097535395909806,
"grad_norm": 0.6977171897888184,
"learning_rate": 0.00193334939150597,
"loss": 1.5284,
"step": 800
},
{
"epoch": 0.21237545883586786,
"grad_norm": 0.5965753197669983,
"learning_rate": 0.0019300066548188998,
"loss": 1.5468,
"step": 810
},
{
"epoch": 0.2149973780807551,
"grad_norm": 0.596052885055542,
"learning_rate": 0.001926585167199616,
"loss": 1.5579,
"step": 820
},
{
"epoch": 0.21761929732564236,
"grad_norm": 0.6821017861366272,
"learning_rate": 0.001923085218372218,
"loss": 1.4984,
"step": 830
},
{
"epoch": 0.22024121657052964,
"grad_norm": 0.6523297429084778,
"learning_rate": 0.0019195071047047277,
"loss": 1.537,
"step": 840
},
{
"epoch": 0.2228631358154169,
"grad_norm": 0.648935079574585,
"learning_rate": 0.0019158511291839945,
"loss": 1.5192,
"step": 850
},
{
"epoch": 0.22548505506030414,
"grad_norm": 0.6102792620658875,
"learning_rate": 0.0019121176013900407,
"loss": 1.5209,
"step": 860
},
{
"epoch": 0.2281069743051914,
"grad_norm": 0.6573307514190674,
"learning_rate": 0.0019083068374698448,
"loss": 1.49,
"step": 870
},
{
"epoch": 0.23072889355007867,
"grad_norm": 0.6355723738670349,
"learning_rate": 0.0019044191601105727,
"loss": 1.4929,
"step": 880
},
{
"epoch": 0.23335081279496592,
"grad_norm": 0.5931225419044495,
"learning_rate": 0.0019004548985122511,
"loss": 1.4813,
"step": 890
},
{
"epoch": 0.23597273203985317,
"grad_norm": 0.6640650629997253,
"learning_rate": 0.0018964143883598936,
"loss": 1.4808,
"step": 900
},
{
"epoch": 0.23859465128474042,
"grad_norm": 0.6377866268157959,
"learning_rate": 0.0018922979717950748,
"loss": 1.4901,
"step": 910
},
{
"epoch": 0.2412165705296277,
"grad_norm": 0.6502982378005981,
"learning_rate": 0.0018881059973869581,
"loss": 1.4501,
"step": 920
},
{
"epoch": 0.24383848977451494,
"grad_norm": 0.602969765663147,
"learning_rate": 0.0018838388201027805,
"loss": 1.4661,
"step": 930
},
{
"epoch": 0.2464604090194022,
"grad_norm": 0.6061879396438599,
"learning_rate": 0.001879496801277794,
"loss": 1.4408,
"step": 940
},
{
"epoch": 0.24908232826428947,
"grad_norm": 0.8049127459526062,
"learning_rate": 0.001875080308584669,
"loss": 1.4466,
"step": 950
},
{
"epoch": 0.2517042475091767,
"grad_norm": 0.46771517395973206,
"learning_rate": 0.00187058971600236,
"loss": 1.4382,
"step": 960
},
{
"epoch": 0.254326166754064,
"grad_norm": 0.6081333756446838,
"learning_rate": 0.001866025403784439,
"loss": 1.4518,
"step": 970
},
{
"epoch": 0.25694808599895125,
"grad_norm": 0.6247040033340454,
"learning_rate": 0.0018613877584268944,
"loss": 1.4639,
"step": 980
},
{
"epoch": 0.2595700052438385,
"grad_norm": 0.5699506998062134,
"learning_rate": 0.0018566771726354063,
"loss": 1.4218,
"step": 990
},
{
"epoch": 0.26219192448872575,
"grad_norm": 0.5360729694366455,
"learning_rate": 0.0018518940452920906,
"loss": 1.4189,
"step": 1000
},
{
"epoch": 0.264813843733613,
"grad_norm": 0.5921474695205688,
"learning_rate": 0.0018470387814217232,
"loss": 1.424,
"step": 1010
},
{
"epoch": 0.26743576297850025,
"grad_norm": 0.6162559986114502,
"learning_rate": 0.0018421117921574438,
"loss": 1.4307,
"step": 1020
},
{
"epoch": 0.2700576822233875,
"grad_norm": 0.5530286431312561,
"learning_rate": 0.001837113494705942,
"loss": 1.4158,
"step": 1030
},
{
"epoch": 0.2726796014682748,
"grad_norm": 0.5585499405860901,
"learning_rate": 0.0018320443123121283,
"loss": 1.3861,
"step": 1040
},
{
"epoch": 0.27530152071316205,
"grad_norm": 0.6225973963737488,
"learning_rate": 0.0018269046742232966,
"loss": 1.3942,
"step": 1050
},
{
"epoch": 0.2779234399580493,
"grad_norm": 0.49642321467399597,
"learning_rate": 0.0018216950156527737,
"loss": 1.3912,
"step": 1060
},
{
"epoch": 0.28054535920293655,
"grad_norm": 0.6089576482772827,
"learning_rate": 0.0018164157777430681,
"loss": 1.3732,
"step": 1070
},
{
"epoch": 0.2831672784478238,
"grad_norm": 0.5753847360610962,
"learning_rate": 0.0018110674075285157,
"loss": 1.398,
"step": 1080
},
{
"epoch": 0.28578919769271105,
"grad_norm": 0.5357734560966492,
"learning_rate": 0.0018056503578974242,
"loss": 1.3851,
"step": 1090
},
{
"epoch": 0.2884111169375983,
"grad_norm": 0.5319791436195374,
"learning_rate": 0.001800165087553724,
"loss": 1.3804,
"step": 1100
},
{
"epoch": 0.2910330361824856,
"grad_norm": 0.5765709280967712,
"learning_rate": 0.0017946120609781276,
"loss": 1.3534,
"step": 1110
},
{
"epoch": 0.29365495542737285,
"grad_norm": 0.48765453696250916,
"learning_rate": 0.001788991748388796,
"loss": 1.3693,
"step": 1120
},
{
"epoch": 0.2962768746722601,
"grad_norm": 0.5916075110435486,
"learning_rate": 0.001783304625701524,
"loss": 1.3697,
"step": 1130
},
{
"epoch": 0.29889879391714735,
"grad_norm": 0.411699503660202,
"learning_rate": 0.0017775511744894384,
"loss": 1.3588,
"step": 1140
},
{
"epoch": 0.3015207131620346,
"grad_norm": 0.5155631899833679,
"learning_rate": 0.0017717318819422214,
"loss": 1.3697,
"step": 1150
},
{
"epoch": 0.30414263240692185,
"grad_norm": 0.5687488913536072,
"learning_rate": 0.0017658472408248551,
"loss": 1.3558,
"step": 1160
},
{
"epoch": 0.3067645516518091,
"grad_norm": 0.5609891414642334,
"learning_rate": 0.0017598977494358967,
"loss": 1.3376,
"step": 1170
},
{
"epoch": 0.3093864708966964,
"grad_norm": 0.5137512683868408,
"learning_rate": 0.0017538839115652817,
"loss": 1.3534,
"step": 1180
},
{
"epoch": 0.31200839014158366,
"grad_norm": 0.5840641260147095,
"learning_rate": 0.001747806236451666,
"loss": 1.3394,
"step": 1190
},
{
"epoch": 0.3146303093864709,
"grad_norm": 0.5758949518203735,
"learning_rate": 0.0017416652387393027,
"loss": 1.3417,
"step": 1200
},
{
"epoch": 0.31725222863135816,
"grad_norm": 0.5121742486953735,
"learning_rate": 0.0017354614384344658,
"loss": 1.341,
"step": 1210
},
{
"epoch": 0.3198741478762454,
"grad_norm": 0.5056650638580322,
"learning_rate": 0.001729195360861414,
"loss": 1.316,
"step": 1220
},
{
"epoch": 0.32249606712113266,
"grad_norm": 0.4782615602016449,
"learning_rate": 0.0017228675366179106,
"loss": 1.3226,
"step": 1230
},
{
"epoch": 0.3251179863660199,
"grad_norm": 0.49403342604637146,
"learning_rate": 0.0017164785015302906,
"loss": 1.37,
"step": 1240
},
{
"epoch": 0.3277399056109072,
"grad_norm": 0.4836321175098419,
"learning_rate": 0.0017100287966080906,
"loss": 1.3272,
"step": 1250
},
{
"epoch": 0.33036182485579446,
"grad_norm": 0.48174890875816345,
"learning_rate": 0.001703518967998236,
"loss": 1.3148,
"step": 1260
},
{
"epoch": 0.3329837441006817,
"grad_norm": 0.4627121090888977,
"learning_rate": 0.001696949566938795,
"loss": 1.3161,
"step": 1270
},
{
"epoch": 0.33560566334556896,
"grad_norm": 0.470414936542511,
"learning_rate": 0.0016903211497123003,
"loss": 1.3313,
"step": 1280
},
{
"epoch": 0.3382275825904562,
"grad_norm": 0.4437310993671417,
"learning_rate": 0.0016836342775986446,
"loss": 1.3073,
"step": 1290
},
{
"epoch": 0.34084950183534346,
"grad_norm": 0.47688329219818115,
"learning_rate": 0.0016768895168275534,
"loss": 1.3128,
"step": 1300
},
{
"epoch": 0.3434714210802307,
"grad_norm": 0.5143507122993469,
"learning_rate": 0.0016700874385306363,
"loss": 1.3357,
"step": 1310
},
{
"epoch": 0.34609334032511796,
"grad_norm": 0.4100657105445862,
"learning_rate": 0.0016632286186930275,
"loss": 1.3061,
"step": 1320
},
{
"epoch": 0.34871525957000526,
"grad_norm": 0.4421868920326233,
"learning_rate": 0.0016563136381046088,
"loss": 1.3158,
"step": 1330
},
{
"epoch": 0.3513371788148925,
"grad_norm": 0.4668099582195282,
"learning_rate": 0.0016493430823108332,
"loss": 1.3088,
"step": 1340
},
{
"epoch": 0.35395909805977976,
"grad_norm": 0.5451709032058716,
"learning_rate": 0.0016423175415631404,
"loss": 1.3344,
"step": 1350
},
{
"epoch": 0.356581017304667,
"grad_norm": 0.45294106006622314,
"learning_rate": 0.0016352376107689754,
"loss": 1.2778,
"step": 1360
},
{
"epoch": 0.35920293654955426,
"grad_norm": 0.4404051601886749,
"learning_rate": 0.0016281038894414143,
"loss": 1.2871,
"step": 1370
},
{
"epoch": 0.3618248557944415,
"grad_norm": 0.45863279700279236,
"learning_rate": 0.0016209169816483971,
"loss": 1.3286,
"step": 1380
},
{
"epoch": 0.36444677503932876,
"grad_norm": 0.45011425018310547,
"learning_rate": 0.0016136774959615784,
"loss": 1.2979,
"step": 1390
},
{
"epoch": 0.36706869428421607,
"grad_norm": 0.5113876461982727,
"learning_rate": 0.0016063860454047943,
"loss": 1.3088,
"step": 1400
},
{
"epoch": 0.3696906135291033,
"grad_norm": 0.40740302205085754,
"learning_rate": 0.001599043247402151,
"loss": 1.2703,
"step": 1410
},
{
"epoch": 0.37231253277399057,
"grad_norm": 0.4261358976364136,
"learning_rate": 0.0015916497237257455,
"loss": 1.2681,
"step": 1420
},
{
"epoch": 0.3749344520188778,
"grad_norm": 0.4349290132522583,
"learning_rate": 0.0015842061004430145,
"loss": 1.317,
"step": 1430
},
{
"epoch": 0.37755637126376507,
"grad_norm": 0.4363626539707184,
"learning_rate": 0.0015767130078637183,
"loss": 1.2707,
"step": 1440
},
{
"epoch": 0.3801782905086523,
"grad_norm": 0.41238006949424744,
"learning_rate": 0.0015691710804865706,
"loss": 1.2763,
"step": 1450
},
{
"epoch": 0.38280020975353957,
"grad_norm": 0.476226270198822,
"learning_rate": 0.0015615809569455089,
"loss": 1.3037,
"step": 1460
},
{
"epoch": 0.38542212899842687,
"grad_norm": 0.45900896191596985,
"learning_rate": 0.0015539432799556159,
"loss": 1.287,
"step": 1470
},
{
"epoch": 0.3880440482433141,
"grad_norm": 0.3873949348926544,
"learning_rate": 0.0015462586962586972,
"loss": 1.2793,
"step": 1480
},
{
"epoch": 0.39066596748820137,
"grad_norm": 0.4380306601524353,
"learning_rate": 0.001538527856568515,
"loss": 1.2916,
"step": 1490
},
{
"epoch": 0.3932878867330886,
"grad_norm": 0.39479300379753113,
"learning_rate": 0.0015307514155156895,
"loss": 1.272,
"step": 1500
}
],
"logging_steps": 10,
"max_steps": 3814,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.0095396499845284e+18,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}