SykoLLM-v6.6 / trainer_state.json
SykoSLM's picture
SykoLLM v6.6
ad43b15 verified
Raw
History Blame Contribute Delete
49.6 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.105875,
"eval_steps": 500,
"global_step": 2800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00125,
"grad_norm": 0.3465445339679718,
"learning_rate": 7.4204999999999995e-06,
"loss": 2.7873512268066407,
"step": 10
},
{
"epoch": 0.0025,
"grad_norm": 0.32606860995292664,
"learning_rate": 1.56655e-05,
"loss": 2.760304069519043,
"step": 20
},
{
"epoch": 0.00375,
"grad_norm": 0.33225658535957336,
"learning_rate": 2.3910499999999997e-05,
"loss": 2.7759071350097657,
"step": 30
},
{
"epoch": 0.005,
"grad_norm": 0.31996211409568787,
"learning_rate": 3.21555e-05,
"loss": 2.7292430877685545,
"step": 40
},
{
"epoch": 0.00625,
"grad_norm": 0.3153120279312134,
"learning_rate": 4.04005e-05,
"loss": 2.733371353149414,
"step": 50
},
{
"epoch": 0.0075,
"grad_norm": 0.3135412037372589,
"learning_rate": 4.8645499999999994e-05,
"loss": 2.7492229461669924,
"step": 60
},
{
"epoch": 0.00875,
"grad_norm": 0.3155956268310547,
"learning_rate": 5.6890499999999993e-05,
"loss": 2.7486228942871094,
"step": 70
},
{
"epoch": 0.01,
"grad_norm": 0.3149002194404602,
"learning_rate": 6.51355e-05,
"loss": 2.760879898071289,
"step": 80
},
{
"epoch": 0.01125,
"grad_norm": 0.3194095492362976,
"learning_rate": 7.33805e-05,
"loss": 2.734035873413086,
"step": 90
},
{
"epoch": 0.0125,
"grad_norm": 0.3121851980686188,
"learning_rate": 8.16255e-05,
"loss": 2.7368759155273437,
"step": 100
},
{
"epoch": 0.01375,
"grad_norm": 0.3184032440185547,
"learning_rate": 8.98705e-05,
"loss": 2.736837387084961,
"step": 110
},
{
"epoch": 0.015,
"grad_norm": 0.3170839250087738,
"learning_rate": 9.81155e-05,
"loss": 2.7051807403564454,
"step": 120
},
{
"epoch": 0.01625,
"grad_norm": 0.3180184066295624,
"learning_rate": 0.0001063605,
"loss": 2.7603172302246093,
"step": 130
},
{
"epoch": 0.0175,
"grad_norm": 0.31468942761421204,
"learning_rate": 0.00011460549999999999,
"loss": 2.7113197326660154,
"step": 140
},
{
"epoch": 0.01875,
"grad_norm": 0.31729385256767273,
"learning_rate": 0.00012285049999999999,
"loss": 2.7222190856933595,
"step": 150
},
{
"epoch": 0.02,
"grad_norm": 0.3197655975818634,
"learning_rate": 0.0001310955,
"loss": 2.7241241455078127,
"step": 160
},
{
"epoch": 0.02125,
"grad_norm": 0.3256337642669678,
"learning_rate": 0.00013934049999999998,
"loss": 2.7403392791748047,
"step": 170
},
{
"epoch": 0.0225,
"grad_norm": 0.3098828196525574,
"learning_rate": 0.0001475855,
"loss": 2.7496837615966796,
"step": 180
},
{
"epoch": 0.02375,
"grad_norm": 0.3134927749633789,
"learning_rate": 0.00015583049999999998,
"loss": 2.750768280029297,
"step": 190
},
{
"epoch": 0.025,
"grad_norm": 0.3353506922721863,
"learning_rate": 0.0001640755,
"loss": 2.7532047271728515,
"step": 200
},
{
"epoch": 0.02625,
"grad_norm": 0.3203900456428528,
"learning_rate": 0.0001648994583038516,
"loss": 2.7489036560058593,
"step": 210
},
{
"epoch": 0.0275,
"grad_norm": 0.3266359865665436,
"learning_rate": 0.00016489758578309418,
"loss": 2.7316030502319335,
"step": 220
},
{
"epoch": 0.02875,
"grad_norm": 0.31027814745903015,
"learning_rate": 0.00016489437578049018,
"loss": 2.7714206695556642,
"step": 230
},
{
"epoch": 0.03,
"grad_norm": 0.325736939907074,
"learning_rate": 0.0001648898283481129,
"loss": 2.7439931869506835,
"step": 240
},
{
"epoch": 0.03125,
"grad_norm": 0.30528295040130615,
"learning_rate": 0.00016488394355973176,
"loss": 2.766144943237305,
"step": 250
},
{
"epoch": 0.0325,
"grad_norm": 0.31271758675575256,
"learning_rate": 0.000164876721510811,
"loss": 2.7013065338134767,
"step": 260
},
{
"epoch": 0.03375,
"grad_norm": 0.3198724687099457,
"learning_rate": 0.0001648681623185082,
"loss": 2.7379714965820314,
"step": 270
},
{
"epoch": 0.035,
"grad_norm": 0.33557799458503723,
"learning_rate": 0.00016485826612167237,
"loss": 2.76102352142334,
"step": 280
},
{
"epoch": 0.03625,
"grad_norm": 0.3212190568447113,
"learning_rate": 0.00016484703308084162,
"loss": 2.7475757598876953,
"step": 290
},
{
"epoch": 0.0375,
"grad_norm": 0.31533411145210266,
"learning_rate": 0.00016483446337824071,
"loss": 2.747650718688965,
"step": 300
},
{
"epoch": 0.03875,
"grad_norm": 0.33507952094078064,
"learning_rate": 0.00016482055721777798,
"loss": 2.739873504638672,
"step": 310
},
{
"epoch": 0.04,
"grad_norm": 0.31843748688697815,
"learning_rate": 0.00016480531482504198,
"loss": 2.7478389739990234,
"step": 320
},
{
"epoch": 0.04125,
"grad_norm": 0.3268890380859375,
"learning_rate": 0.00016478873644729805,
"loss": 2.7712429046630858,
"step": 330
},
{
"epoch": 0.0425,
"grad_norm": 0.315518319606781,
"learning_rate": 0.00016477082235348404,
"loss": 2.7189746856689454,
"step": 340
},
{
"epoch": 0.04375,
"grad_norm": 0.31345367431640625,
"learning_rate": 0.0001647515728342061,
"loss": 2.7359670639038085,
"step": 350
},
{
"epoch": 0.045,
"grad_norm": 0.325610876083374,
"learning_rate": 0.0001647309882017339,
"loss": 2.748139572143555,
"step": 360
},
{
"epoch": 0.04625,
"grad_norm": 0.3194393813610077,
"learning_rate": 0.00016470906878999564,
"loss": 2.7462692260742188,
"step": 370
},
{
"epoch": 0.0475,
"grad_norm": 0.3070792555809021,
"learning_rate": 0.0001646858149545726,
"loss": 2.757720184326172,
"step": 380
},
{
"epoch": 0.04875,
"grad_norm": 0.32982465624809265,
"learning_rate": 0.00016466122707269328,
"loss": 2.7279708862304686,
"step": 390
},
{
"epoch": 0.05,
"grad_norm": 0.309640496969223,
"learning_rate": 0.0001646353055432274,
"loss": 2.724739837646484,
"step": 400
},
{
"epoch": 0.05125,
"grad_norm": 0.31954118609428406,
"learning_rate": 0.00016460805078667945,
"loss": 2.7295236587524414,
"step": 410
},
{
"epoch": 0.0525,
"grad_norm": 0.30906999111175537,
"learning_rate": 0.00016457946324518165,
"loss": 2.734362030029297,
"step": 420
},
{
"epoch": 0.05375,
"grad_norm": 0.3177924156188965,
"learning_rate": 0.00016454954338248712,
"loss": 2.7312435150146483,
"step": 430
},
{
"epoch": 0.055,
"grad_norm": 0.3104606866836548,
"learning_rate": 0.00016451829168396203,
"loss": 2.7339248657226562,
"step": 440
},
{
"epoch": 0.05625,
"grad_norm": 0.31935980916023254,
"learning_rate": 0.0001644857086565779,
"loss": 2.762462043762207,
"step": 450
},
{
"epoch": 0.0575,
"grad_norm": 0.321206659078598,
"learning_rate": 0.0001644517948289035,
"loss": 2.7401878356933596,
"step": 460
},
{
"epoch": 0.05875,
"grad_norm": 0.31553006172180176,
"learning_rate": 0.00016441655075109576,
"loss": 2.7154884338378906,
"step": 470
},
{
"epoch": 0.06,
"grad_norm": 0.3156311810016632,
"learning_rate": 0.0001643799769948916,
"loss": 2.731028747558594,
"step": 480
},
{
"epoch": 0.06125,
"grad_norm": 0.31830424070358276,
"learning_rate": 0.00016434207415359802,
"loss": 2.748556137084961,
"step": 490
},
{
"epoch": 0.0625,
"grad_norm": 0.3151983320713043,
"learning_rate": 0.0001643028428420828,
"loss": 2.7336639404296874,
"step": 500
},
{
"epoch": 0.06375,
"grad_norm": 0.32100728154182434,
"learning_rate": 0.00016426228369676436,
"loss": 2.733713150024414,
"step": 510
},
{
"epoch": 0.065,
"grad_norm": 0.3150577247142792,
"learning_rate": 0.00016422039737560163,
"loss": 2.747536849975586,
"step": 520
},
{
"epoch": 0.06625,
"grad_norm": 0.32159915566444397,
"learning_rate": 0.0001641771845580832,
"loss": 2.7145294189453124,
"step": 530
},
{
"epoch": 0.0675,
"grad_norm": 0.32887545228004456,
"learning_rate": 0.0001641326459452163,
"loss": 2.7391708374023436,
"step": 540
},
{
"epoch": 0.06875,
"grad_norm": 0.3189705014228821,
"learning_rate": 0.00016408678225951563,
"loss": 2.724725341796875,
"step": 550
},
{
"epoch": 0.07,
"grad_norm": 0.3386867940425873,
"learning_rate": 0.0001640395942449914,
"loss": 2.7544118881225588,
"step": 560
},
{
"epoch": 0.07125,
"grad_norm": 0.30630990862846375,
"learning_rate": 0.00016399108266713735,
"loss": 2.746489715576172,
"step": 570
},
{
"epoch": 0.0725,
"grad_norm": 0.3187973201274872,
"learning_rate": 0.00016394124831291837,
"loss": 2.7217391967773437,
"step": 580
},
{
"epoch": 0.07375,
"grad_norm": 0.316847562789917,
"learning_rate": 0.00016389009199075774,
"loss": 2.7319801330566404,
"step": 590
},
{
"epoch": 0.075,
"grad_norm": 0.3210533559322357,
"learning_rate": 0.00016383761453052384,
"loss": 2.7253528594970704,
"step": 600
},
{
"epoch": 0.07625,
"grad_norm": 0.30917614698410034,
"learning_rate": 0.00016378381678351702,
"loss": 2.7291168212890624,
"step": 610
},
{
"epoch": 0.0775,
"grad_norm": 0.3088016211986542,
"learning_rate": 0.0001637286996224554,
"loss": 2.696218490600586,
"step": 620
},
{
"epoch": 0.07875,
"grad_norm": 0.32467445731163025,
"learning_rate": 0.0001636722639414611,
"loss": 2.7149139404296876,
"step": 630
},
{
"epoch": 0.08,
"grad_norm": 0.3138329088687897,
"learning_rate": 0.0001636145106560454,
"loss": 2.73681755065918,
"step": 640
},
{
"epoch": 0.08125,
"grad_norm": 0.3167824447154999,
"learning_rate": 0.0001635554407030941,
"loss": 2.7229454040527346,
"step": 650
},
{
"epoch": 0.0825,
"grad_norm": 0.3144330680370331,
"learning_rate": 0.0001634950550408522,
"loss": 2.6987558364868165,
"step": 660
},
{
"epoch": 0.08375,
"grad_norm": 0.311829149723053,
"learning_rate": 0.00016343335464890846,
"loss": 2.706182861328125,
"step": 670
},
{
"epoch": 0.085,
"grad_norm": 0.3265558183193207,
"learning_rate": 0.00016337034052817947,
"loss": 2.7086441040039064,
"step": 680
},
{
"epoch": 0.08625,
"grad_norm": 0.3073708117008209,
"learning_rate": 0.00016330601370089334,
"loss": 2.7448238372802733,
"step": 690
},
{
"epoch": 0.0875,
"grad_norm": 0.30871179699897766,
"learning_rate": 0.0001632403752105732,
"loss": 2.7313838958740235,
"step": 700
},
{
"epoch": 0.08875,
"grad_norm": 0.31331929564476013,
"learning_rate": 0.00016317342612202036,
"loss": 2.7109472274780275,
"step": 710
},
{
"epoch": 0.09,
"grad_norm": 0.3069651424884796,
"learning_rate": 0.0001631051675212967,
"loss": 2.698355865478516,
"step": 720
},
{
"epoch": 0.09125,
"grad_norm": 0.3077262341976166,
"learning_rate": 0.00016303560051570746,
"loss": 2.707406997680664,
"step": 730
},
{
"epoch": 0.0925,
"grad_norm": 0.3193919062614441,
"learning_rate": 0.00016296472623378308,
"loss": 2.709014129638672,
"step": 740
},
{
"epoch": 0.09375,
"grad_norm": 0.31028079986572266,
"learning_rate": 0.0001628925458252608,
"loss": 2.7283496856689453,
"step": 750
},
{
"epoch": 0.095,
"grad_norm": 0.3203696310520172,
"learning_rate": 0.00016281906046106622,
"loss": 2.723176193237305,
"step": 760
},
{
"epoch": 0.09625,
"grad_norm": 0.31216055154800415,
"learning_rate": 0.0001627442713332942,
"loss": 2.740637016296387,
"step": 770
},
{
"epoch": 0.0975,
"grad_norm": 0.3120918869972229,
"learning_rate": 0.00016266817965518942,
"loss": 2.720622444152832,
"step": 780
},
{
"epoch": 0.09875,
"grad_norm": 0.3088921308517456,
"learning_rate": 0.00016259078666112692,
"loss": 2.714591217041016,
"step": 790
},
{
"epoch": 0.1,
"grad_norm": 0.30949750542640686,
"learning_rate": 0.00016251209360659192,
"loss": 2.7191795349121093,
"step": 800
},
{
"epoch": 0.10125,
"grad_norm": 0.32115787267684937,
"learning_rate": 0.00016243210176815944,
"loss": 2.6966245651245115,
"step": 810
},
{
"epoch": 0.1025,
"grad_norm": 0.307424396276474,
"learning_rate": 0.00016235081244347373,
"loss": 2.730236625671387,
"step": 820
},
{
"epoch": 0.10375,
"grad_norm": 0.31429022550582886,
"learning_rate": 0.00016226822695122704,
"loss": 2.691334533691406,
"step": 830
},
{
"epoch": 0.105,
"grad_norm": 0.30951419472694397,
"learning_rate": 0.00016218434663113843,
"loss": 2.690280532836914,
"step": 840
},
{
"epoch": 0.10625,
"grad_norm": 0.31636205315589905,
"learning_rate": 0.00016209917284393176,
"loss": 2.7146608352661135,
"step": 850
},
{
"epoch": 0.1075,
"grad_norm": 0.31698304414749146,
"learning_rate": 0.00016201270697131396,
"loss": 2.739955520629883,
"step": 860
},
{
"epoch": 0.10875,
"grad_norm": 0.30591675639152527,
"learning_rate": 0.00016192495041595235,
"loss": 2.725113868713379,
"step": 870
},
{
"epoch": 0.11,
"grad_norm": 0.3064011037349701,
"learning_rate": 0.00016183590460145194,
"loss": 2.7186939239501955,
"step": 880
},
{
"epoch": 0.11125,
"grad_norm": 0.30639246106147766,
"learning_rate": 0.00016174557097233246,
"loss": 2.713937187194824,
"step": 890
},
{
"epoch": 0.1125,
"grad_norm": 0.3199147880077362,
"learning_rate": 0.00016165395099400478,
"loss": 2.7232639312744142,
"step": 900
},
{
"epoch": 0.11375,
"grad_norm": 0.3103027045726776,
"learning_rate": 0.00016156104615274719,
"loss": 2.7207107543945312,
"step": 910
},
{
"epoch": 0.115,
"grad_norm": 0.32256069779396057,
"learning_rate": 0.0001614668579556813,
"loss": 2.7164112091064454,
"step": 920
},
{
"epoch": 0.11625,
"grad_norm": 0.31795644760131836,
"learning_rate": 0.0001613713879307476,
"loss": 2.704681396484375,
"step": 930
},
{
"epoch": 0.1175,
"grad_norm": 0.32402339577674866,
"learning_rate": 0.00016127463762668064,
"loss": 2.733686065673828,
"step": 940
},
{
"epoch": 0.11875,
"grad_norm": 0.33603930473327637,
"learning_rate": 0.00016117660861298395,
"loss": 2.736924743652344,
"step": 950
},
{
"epoch": 0.12,
"grad_norm": 0.325527161359787,
"learning_rate": 0.0001610773024799045,
"loss": 2.7135137557983398,
"step": 960
},
{
"epoch": 0.12125,
"grad_norm": 0.31715628504753113,
"learning_rate": 0.000160976720838407,
"loss": 2.702963638305664,
"step": 970
},
{
"epoch": 0.1225,
"grad_norm": 0.3281555771827698,
"learning_rate": 0.0001608748653201477,
"loss": 2.718802261352539,
"step": 980
},
{
"epoch": 0.12375,
"grad_norm": 0.3280923366546631,
"learning_rate": 0.00016077173757744805,
"loss": 2.722803497314453,
"step": 990
},
{
"epoch": 0.125,
"grad_norm": 0.3167899549007416,
"learning_rate": 0.00016066733928326755,
"loss": 2.7145980834960937,
"step": 1000
},
{
"epoch": 0.12625,
"grad_norm": 0.3199998438358307,
"learning_rate": 0.0001605616721311771,
"loss": 2.713690185546875,
"step": 1010
},
{
"epoch": 0.1275,
"grad_norm": 0.33201882243156433,
"learning_rate": 0.00016045473783533111,
"loss": 2.7083156585693358,
"step": 1020
},
{
"epoch": 0.12875,
"grad_norm": 0.321409672498703,
"learning_rate": 0.00016034653813043993,
"loss": 2.6916542053222656,
"step": 1030
},
{
"epoch": 0.13,
"grad_norm": 0.3114752769470215,
"learning_rate": 0.00016023707477174167,
"loss": 2.7114416122436524,
"step": 1040
},
{
"epoch": 0.13125,
"grad_norm": 0.3244589567184448,
"learning_rate": 0.0001601263495349736,
"loss": 2.678660202026367,
"step": 1050
},
{
"epoch": 0.1325,
"grad_norm": 0.3137204945087433,
"learning_rate": 0.0001600143642163435,
"loss": 2.7046539306640627,
"step": 1060
},
{
"epoch": 0.13375,
"grad_norm": 0.3140222430229187,
"learning_rate": 0.0001599011206325005,
"loss": 2.7146488189697267,
"step": 1070
},
{
"epoch": 0.135,
"grad_norm": 0.31908106803894043,
"learning_rate": 0.0001597866206205054,
"loss": 2.713479995727539,
"step": 1080
},
{
"epoch": 0.13625,
"grad_norm": 0.3061647415161133,
"learning_rate": 0.00015967086603780128,
"loss": 2.714076805114746,
"step": 1090
},
{
"epoch": 0.1375,
"grad_norm": 0.3262089490890503,
"learning_rate": 0.00015955385876218297,
"loss": 2.709738540649414,
"step": 1100
},
{
"epoch": 0.13875,
"grad_norm": 0.3090061545372009,
"learning_rate": 0.0001594356006917667,
"loss": 2.682490921020508,
"step": 1110
},
{
"epoch": 0.14,
"grad_norm": 0.3089563548564911,
"learning_rate": 0.00015931609374495955,
"loss": 2.707094192504883,
"step": 1120
},
{
"epoch": 0.14125,
"grad_norm": 0.3150913417339325,
"learning_rate": 0.00015919533986042794,
"loss": 2.6884944915771483,
"step": 1130
},
{
"epoch": 0.1425,
"grad_norm": 0.3184945285320282,
"learning_rate": 0.00015907334099706644,
"loss": 2.668732833862305,
"step": 1140
},
{
"epoch": 0.14375,
"grad_norm": 0.3181245028972626,
"learning_rate": 0.00015895009913396594,
"loss": 2.699263000488281,
"step": 1150
},
{
"epoch": 0.145,
"grad_norm": 0.3286084234714508,
"learning_rate": 0.00015882561627038154,
"loss": 2.6974639892578125,
"step": 1160
},
{
"epoch": 0.14625,
"grad_norm": 0.30604103207588196,
"learning_rate": 0.00015869989442570008,
"loss": 2.691238212585449,
"step": 1170
},
{
"epoch": 0.1475,
"grad_norm": 0.31512096524238586,
"learning_rate": 0.0001585729356394074,
"loss": 2.6900882720947266,
"step": 1180
},
{
"epoch": 0.14875,
"grad_norm": 0.324313759803772,
"learning_rate": 0.0001584447419710553,
"loss": 2.6862293243408204,
"step": 1190
},
{
"epoch": 0.15,
"grad_norm": 0.32386448979377747,
"learning_rate": 0.00015831531550022804,
"loss": 2.7286815643310547,
"step": 1200
},
{
"epoch": 0.15125,
"grad_norm": 0.3133200705051422,
"learning_rate": 0.0001581846583265087,
"loss": 2.697834014892578,
"step": 1210
},
{
"epoch": 0.1525,
"grad_norm": 0.30789715051651,
"learning_rate": 0.00015805277256944507,
"loss": 2.6866151809692385,
"step": 1220
},
{
"epoch": 0.15375,
"grad_norm": 0.3052247166633606,
"learning_rate": 0.00015791966036851529,
"loss": 2.7111629486083983,
"step": 1230
},
{
"epoch": 0.155,
"grad_norm": 0.312637597322464,
"learning_rate": 0.00015778532388309308,
"loss": 2.6961734771728514,
"step": 1240
},
{
"epoch": 0.15625,
"grad_norm": 0.3095453977584839,
"learning_rate": 0.0001576497652924128,
"loss": 2.6890350341796876,
"step": 1250
},
{
"epoch": 0.1575,
"grad_norm": 0.31984061002731323,
"learning_rate": 0.00015751298679553402,
"loss": 2.6957382202148437,
"step": 1260
},
{
"epoch": 0.15875,
"grad_norm": 0.3066132366657257,
"learning_rate": 0.00015737499061130596,
"loss": 2.721820068359375,
"step": 1270
},
{
"epoch": 0.16,
"grad_norm": 0.31295251846313477,
"learning_rate": 0.00015723577897833128,
"loss": 2.688478469848633,
"step": 1280
},
{
"epoch": 0.16125,
"grad_norm": 0.326561838388443,
"learning_rate": 0.00015709535415493002,
"loss": 2.72012939453125,
"step": 1290
},
{
"epoch": 0.1625,
"grad_norm": 0.31419870257377625,
"learning_rate": 0.0001569537184191028,
"loss": 2.697279167175293,
"step": 1300
},
{
"epoch": 0.16375,
"grad_norm": 0.3069676160812378,
"learning_rate": 0.00015681087406849395,
"loss": 2.6784629821777344,
"step": 1310
},
{
"epoch": 0.165,
"grad_norm": 0.3102596402168274,
"learning_rate": 0.00015666682342035414,
"loss": 2.7019378662109377,
"step": 1320
},
{
"epoch": 0.16625,
"grad_norm": 0.33090364933013916,
"learning_rate": 0.00015652156881150288,
"loss": 2.698979949951172,
"step": 1330
},
{
"epoch": 0.1675,
"grad_norm": 0.3196777105331421,
"learning_rate": 0.00015637511259829055,
"loss": 2.670425796508789,
"step": 1340
},
{
"epoch": 0.16875,
"grad_norm": 0.3207469582557678,
"learning_rate": 0.0001562274571565603,
"loss": 2.687581443786621,
"step": 1350
},
{
"epoch": 0.17,
"grad_norm": 0.30899399518966675,
"learning_rate": 0.00015607860488160927,
"loss": 2.703385925292969,
"step": 1360
},
{
"epoch": 0.17125,
"grad_norm": 0.32463735342025757,
"learning_rate": 0.00015592855818815003,
"loss": 2.7129638671875,
"step": 1370
},
{
"epoch": 0.1725,
"grad_norm": 0.29863590002059937,
"learning_rate": 0.00015577731951027114,
"loss": 2.6898262023925783,
"step": 1380
},
{
"epoch": 0.17375,
"grad_norm": 0.30260539054870605,
"learning_rate": 0.00015562489130139783,
"loss": 2.696180725097656,
"step": 1390
},
{
"epoch": 0.175,
"grad_norm": 0.30247101187705994,
"learning_rate": 0.0001554712760342521,
"loss": 2.667018508911133,
"step": 1400
},
{
"epoch": 0.17625,
"grad_norm": 0.3163856565952301,
"learning_rate": 0.0001553164762008128,
"loss": 2.7117942810058593,
"step": 1410
},
{
"epoch": 0.1775,
"grad_norm": 0.31918948888778687,
"learning_rate": 0.0001551604943122748,
"loss": 2.6868515014648438,
"step": 1420
},
{
"epoch": 0.17875,
"grad_norm": 0.3069145083427429,
"learning_rate": 0.00015500333289900878,
"loss": 2.665867042541504,
"step": 1430
},
{
"epoch": 0.18,
"grad_norm": 0.3310893774032593,
"learning_rate": 0.00015484499451051976,
"loss": 2.6680227279663087,
"step": 1440
},
{
"epoch": 0.18125,
"grad_norm": 0.32211220264434814,
"learning_rate": 0.00015468548171540595,
"loss": 2.7012916564941407,
"step": 1450
},
{
"epoch": 0.1825,
"grad_norm": 0.3143543303012848,
"learning_rate": 0.00015452479710131699,
"loss": 2.711798667907715,
"step": 1460
},
{
"epoch": 0.18375,
"grad_norm": 0.33350202441215515,
"learning_rate": 0.00015436294327491207,
"loss": 2.692435455322266,
"step": 1470
},
{
"epoch": 0.185,
"grad_norm": 0.3231949508190155,
"learning_rate": 0.00015419992286181756,
"loss": 2.6712711334228514,
"step": 1480
},
{
"epoch": 0.18625,
"grad_norm": 0.3143308758735657,
"learning_rate": 0.00015403573850658438,
"loss": 2.6955425262451174,
"step": 1490
},
{
"epoch": 0.1875,
"grad_norm": 0.3118044137954712,
"learning_rate": 0.0001538703928726452,
"loss": 2.6801069259643553,
"step": 1500
},
{
"epoch": 0.18875,
"grad_norm": 0.3099926710128784,
"learning_rate": 0.00015370388864227133,
"loss": 2.669751739501953,
"step": 1510
},
{
"epoch": 0.19,
"grad_norm": 0.31752023100852966,
"learning_rate": 0.0001535362285165288,
"loss": 2.6963922500610353,
"step": 1520
},
{
"epoch": 0.19125,
"grad_norm": 0.3166843056678772,
"learning_rate": 0.00015336741521523506,
"loss": 2.6759317398071287,
"step": 1530
},
{
"epoch": 0.1925,
"grad_norm": 0.30386143922805786,
"learning_rate": 0.0001531974514769145,
"loss": 2.663748359680176,
"step": 1540
},
{
"epoch": 0.19375,
"grad_norm": 0.3149690628051758,
"learning_rate": 0.0001530263400587541,
"loss": 2.672575759887695,
"step": 1550
},
{
"epoch": 0.195,
"grad_norm": 0.32157933712005615,
"learning_rate": 0.0001528540837365589,
"loss": 2.7002744674682617,
"step": 1560
},
{
"epoch": 0.19625,
"grad_norm": 0.31378722190856934,
"learning_rate": 0.0001526806853047066,
"loss": 2.7025676727294923,
"step": 1570
},
{
"epoch": 0.1975,
"grad_norm": 0.313424676656723,
"learning_rate": 0.00015250614757610258,
"loss": 2.7100372314453125,
"step": 1580
},
{
"epoch": 0.19875,
"grad_norm": 0.32746565341949463,
"learning_rate": 0.00015233047338213414,
"loss": 2.721282196044922,
"step": 1590
},
{
"epoch": 0.2,
"grad_norm": 0.3191785216331482,
"learning_rate": 0.00015215366557262444,
"loss": 2.6832775115966796,
"step": 1600
},
{
"epoch": 0.20125,
"grad_norm": 0.3307384252548218,
"learning_rate": 0.00015197572701578654,
"loss": 2.683314323425293,
"step": 1610
},
{
"epoch": 0.2025,
"grad_norm": 0.3074938952922821,
"learning_rate": 0.00015179666059817658,
"loss": 2.6983566284179688,
"step": 1620
},
{
"epoch": 0.20375,
"grad_norm": 0.31642141938209534,
"learning_rate": 0.00015161646922464713,
"loss": 2.67681770324707,
"step": 1630
},
{
"epoch": 0.205,
"grad_norm": 0.3204726576805115,
"learning_rate": 0.0001514351558183001,
"loss": 2.673402786254883,
"step": 1640
},
{
"epoch": 0.20625,
"grad_norm": 0.31102851033210754,
"learning_rate": 0.00015125272332043916,
"loss": 2.6676706314086913,
"step": 1650
},
{
"epoch": 0.2075,
"grad_norm": 0.31576183438301086,
"learning_rate": 0.00015106917469052215,
"loss": 2.691006088256836,
"step": 1660
},
{
"epoch": 0.20875,
"grad_norm": 0.3049616515636444,
"learning_rate": 0.00015088451290611304,
"loss": 2.6852401733398437,
"step": 1670
},
{
"epoch": 0.21,
"grad_norm": 0.32038211822509766,
"learning_rate": 0.00015069874096283362,
"loss": 2.6850494384765624,
"step": 1680
},
{
"epoch": 0.21125,
"grad_norm": 0.31499341130256653,
"learning_rate": 0.00015051186187431495,
"loss": 2.685712432861328,
"step": 1690
},
{
"epoch": 0.2125,
"grad_norm": 0.3252309262752533,
"learning_rate": 0.0001503238786721483,
"loss": 2.6800838470458985,
"step": 1700
},
{
"epoch": 0.21375,
"grad_norm": 0.33030372858047485,
"learning_rate": 0.00015013479440583626,
"loss": 2.6957000732421874,
"step": 1710
},
{
"epoch": 0.215,
"grad_norm": 0.31104838848114014,
"learning_rate": 0.00014994461214274302,
"loss": 2.6724735260009767,
"step": 1720
},
{
"epoch": 0.21625,
"grad_norm": 0.31927284598350525,
"learning_rate": 0.00014975333496804468,
"loss": 2.6581308364868166,
"step": 1730
},
{
"epoch": 0.2175,
"grad_norm": 0.3242516815662384,
"learning_rate": 0.00014956096598467932,
"loss": 2.6579944610595705,
"step": 1740
},
{
"epoch": 0.21875,
"grad_norm": 0.3098279535770416,
"learning_rate": 0.00014936750831329645,
"loss": 2.6656078338623046,
"step": 1750
},
{
"epoch": 0.22,
"grad_norm": 0.309610515832901,
"learning_rate": 0.0001491729650922066,
"loss": 2.6563575744628904,
"step": 1760
},
{
"epoch": 0.22125,
"grad_norm": 0.31657662987709045,
"learning_rate": 0.00014897733947733031,
"loss": 2.6570175170898436,
"step": 1770
},
{
"epoch": 0.2225,
"grad_norm": 0.31096142530441284,
"learning_rate": 0.00014878063464214683,
"loss": 2.6638370513916017,
"step": 1780
},
{
"epoch": 0.22375,
"grad_norm": 0.3048711121082306,
"learning_rate": 0.00014858285377764284,
"loss": 2.6526607513427733,
"step": 1790
},
{
"epoch": 0.225,
"grad_norm": 0.32042643427848816,
"learning_rate": 0.0001483840000922606,
"loss": 2.6601219177246094,
"step": 1800
},
{
"epoch": 0.22625,
"grad_norm": 0.324494332075119,
"learning_rate": 0.00014818407681184585,
"loss": 2.6538795471191405,
"step": 1810
},
{
"epoch": 0.2275,
"grad_norm": 0.3241287171840668,
"learning_rate": 0.00014798308717959552,
"loss": 2.678963851928711,
"step": 1820
},
{
"epoch": 0.22875,
"grad_norm": 0.31064486503601074,
"learning_rate": 0.00014778103445600512,
"loss": 2.6616994857788088,
"step": 1830
},
{
"epoch": 0.23,
"grad_norm": 0.31154972314834595,
"learning_rate": 0.0001475779219188159,
"loss": 2.6822179794311523,
"step": 1840
},
{
"epoch": 0.23125,
"grad_norm": 0.32366329431533813,
"learning_rate": 0.00014737375286296158,
"loss": 2.689762496948242,
"step": 1850
},
{
"epoch": 0.2325,
"grad_norm": 0.3157241642475128,
"learning_rate": 0.00014716853060051493,
"loss": 2.6725814819335936,
"step": 1860
},
{
"epoch": 0.23375,
"grad_norm": 0.31811729073524475,
"learning_rate": 0.0001469622584606341,
"loss": 2.6730297088623045,
"step": 1870
},
{
"epoch": 0.235,
"grad_norm": 0.3240484893321991,
"learning_rate": 0.00014675493978950855,
"loss": 2.6649261474609376,
"step": 1880
},
{
"epoch": 0.23625,
"grad_norm": 0.3145361542701721,
"learning_rate": 0.0001465465779503048,
"loss": 2.6716739654541017,
"step": 1890
},
{
"epoch": 0.2375,
"grad_norm": 0.30439531803131104,
"learning_rate": 0.0001463371763231118,
"loss": 2.6668254852294924,
"step": 1900
},
{
"epoch": 0.23875,
"grad_norm": 0.3104805052280426,
"learning_rate": 0.00014612673830488625,
"loss": 2.6472827911376955,
"step": 1910
},
{
"epoch": 0.24,
"grad_norm": 0.3249180316925049,
"learning_rate": 0.00014591526730939734,
"loss": 2.6549278259277345,
"step": 1920
},
{
"epoch": 0.24125,
"grad_norm": 0.31549057364463806,
"learning_rate": 0.00014570276676717145,
"loss": 2.672433853149414,
"step": 1930
},
{
"epoch": 0.2425,
"grad_norm": 0.32735103368759155,
"learning_rate": 0.00014548924012543646,
"loss": 2.6650619506835938,
"step": 1940
},
{
"epoch": 0.24375,
"grad_norm": 0.3208616375923157,
"learning_rate": 0.00014527469084806585,
"loss": 2.6924251556396483,
"step": 1950
},
{
"epoch": 1.000875,
"grad_norm": 0.3361559808254242,
"learning_rate": 0.00014505912241552255,
"loss": 2.918643760681152,
"step": 1960
},
{
"epoch": 1.002125,
"grad_norm": 0.32232168316841125,
"learning_rate": 0.00014484253832480244,
"loss": 2.6152179718017576,
"step": 1970
},
{
"epoch": 1.003375,
"grad_norm": 0.32902058959007263,
"learning_rate": 0.0001446249420893775,
"loss": 2.6433155059814455,
"step": 1980
},
{
"epoch": 1.004625,
"grad_norm": 0.31211215257644653,
"learning_rate": 0.0001444063372391391,
"loss": 2.5884145736694335,
"step": 1990
},
{
"epoch": 1.005875,
"grad_norm": 0.32412853837013245,
"learning_rate": 0.00014418672732034043,
"loss": 2.5942047119140623,
"step": 2000
},
{
"epoch": 1.007125,
"grad_norm": 0.32079222798347473,
"learning_rate": 0.0001439661158955392,
"loss": 2.5999183654785156,
"step": 2010
},
{
"epoch": 1.008375,
"grad_norm": 0.3363247811794281,
"learning_rate": 0.00014374450654353968,
"loss": 2.5693603515625,
"step": 2020
},
{
"epoch": 1.009625,
"grad_norm": 0.3330596685409546,
"learning_rate": 0.00014352190285933487,
"loss": 2.577710723876953,
"step": 2030
},
{
"epoch": 1.010875,
"grad_norm": 0.31830593943595886,
"learning_rate": 0.00014329830845404782,
"loss": 2.580182647705078,
"step": 2040
},
{
"epoch": 1.012125,
"grad_norm": 0.3276713490486145,
"learning_rate": 0.00014307372695487343,
"loss": 2.5742984771728517,
"step": 2050
},
{
"epoch": 1.013375,
"grad_norm": 0.32609084248542786,
"learning_rate": 0.00014284816200501937,
"loss": 2.5697860717773438,
"step": 2060
},
{
"epoch": 1.014625,
"grad_norm": 0.32425832748413086,
"learning_rate": 0.00014262161726364707,
"loss": 2.5537353515625,
"step": 2070
},
{
"epoch": 1.015875,
"grad_norm": 0.3417907953262329,
"learning_rate": 0.00014239409640581238,
"loss": 2.5780372619628906,
"step": 2080
},
{
"epoch": 1.017125,
"grad_norm": 0.3302324116230011,
"learning_rate": 0.0001421656031224058,
"loss": 2.5682140350341798,
"step": 2090
},
{
"epoch": 1.018375,
"grad_norm": 0.33167314529418945,
"learning_rate": 0.00014193614112009283,
"loss": 2.545709228515625,
"step": 2100
},
{
"epoch": 1.019625,
"grad_norm": 0.3396015763282776,
"learning_rate": 0.00014170571412125367,
"loss": 2.544954299926758,
"step": 2110
},
{
"epoch": 1.020875,
"grad_norm": 0.33836308121681213,
"learning_rate": 0.00014147432586392297,
"loss": 2.5545772552490233,
"step": 2120
},
{
"epoch": 1.022125,
"grad_norm": 0.3312232196331024,
"learning_rate": 0.00014124198010172898,
"loss": 2.559113883972168,
"step": 2130
},
{
"epoch": 1.023375,
"grad_norm": 0.33059218525886536,
"learning_rate": 0.00014100868060383292,
"loss": 2.533283805847168,
"step": 2140
},
{
"epoch": 1.024625,
"grad_norm": 0.32571902871131897,
"learning_rate": 0.00014077443115486767,
"loss": 2.551566314697266,
"step": 2150
},
{
"epoch": 1.025875,
"grad_norm": 0.3243643045425415,
"learning_rate": 0.00014053923555487638,
"loss": 2.564662551879883,
"step": 2160
},
{
"epoch": 1.027125,
"grad_norm": 0.31755268573760986,
"learning_rate": 0.0001403030976192509,
"loss": 2.522117042541504,
"step": 2170
},
{
"epoch": 1.028375,
"grad_norm": 0.34630700945854187,
"learning_rate": 0.00014006602117866982,
"loss": 2.529287910461426,
"step": 2180
},
{
"epoch": 1.029625,
"grad_norm": 0.33032891154289246,
"learning_rate": 0.0001398280100790363,
"loss": 2.521525192260742,
"step": 2190
},
{
"epoch": 1.030875,
"grad_norm": 0.3408825993537903,
"learning_rate": 0.0001395890681814159,
"loss": 2.5370689392089845,
"step": 2200
},
{
"epoch": 1.032125,
"grad_norm": 0.3269711434841156,
"learning_rate": 0.0001393491993619736,
"loss": 2.5100967407226564,
"step": 2210
},
{
"epoch": 1.033375,
"grad_norm": 0.32242265343666077,
"learning_rate": 0.0001391084075119112,
"loss": 2.5302288055419924,
"step": 2220
},
{
"epoch": 1.034625,
"grad_norm": 0.3222724199295044,
"learning_rate": 0.000138866696537404,
"loss": 2.517455291748047,
"step": 2230
},
{
"epoch": 1.035875,
"grad_norm": 0.3199198246002197,
"learning_rate": 0.0001386240703595377,
"loss": 2.5055145263671874,
"step": 2240
},
{
"epoch": 1.037125,
"grad_norm": 0.33094459772109985,
"learning_rate": 0.0001383805329142444,
"loss": 2.5067977905273438,
"step": 2250
},
{
"epoch": 1.038375,
"grad_norm": 0.33781564235687256,
"learning_rate": 0.00013813608815223914,
"loss": 2.4964527130126952,
"step": 2260
},
{
"epoch": 1.039625,
"grad_norm": 0.34187182784080505,
"learning_rate": 0.00013789074003895557,
"loss": 2.4876964569091795,
"step": 2270
},
{
"epoch": 1.040875,
"grad_norm": 0.3467255234718323,
"learning_rate": 0.00013764449255448166,
"loss": 2.527250862121582,
"step": 2280
},
{
"epoch": 1.042125,
"grad_norm": 0.34287887811660767,
"learning_rate": 0.00013739734969349526,
"loss": 2.5136051177978516,
"step": 2290
},
{
"epoch": 1.043375,
"grad_norm": 0.3415592908859253,
"learning_rate": 0.0001371493154651991,
"loss": 2.5083173751831054,
"step": 2300
},
{
"epoch": 1.044625,
"grad_norm": 0.34434187412261963,
"learning_rate": 0.00013690039389325595,
"loss": 2.491905403137207,
"step": 2310
},
{
"epoch": 1.045875,
"grad_norm": 0.35805854201316833,
"learning_rate": 0.0001366505890157232,
"loss": 2.509074401855469,
"step": 2320
},
{
"epoch": 1.047125,
"grad_norm": 0.3360929787158966,
"learning_rate": 0.00013639990488498738,
"loss": 2.5023418426513673,
"step": 2330
},
{
"epoch": 1.048375,
"grad_norm": 0.33336424827575684,
"learning_rate": 0.00013614834556769853,
"loss": 2.5313945770263673,
"step": 2340
},
{
"epoch": 1.049625,
"grad_norm": 0.3515946567058563,
"learning_rate": 0.00013589591514470408,
"loss": 2.49786491394043,
"step": 2350
},
{
"epoch": 1.050875,
"grad_norm": 0.3500429391860962,
"learning_rate": 0.00013564261771098268,
"loss": 2.501786804199219,
"step": 2360
},
{
"epoch": 1.052125,
"grad_norm": 0.3275656998157501,
"learning_rate": 0.00013538845737557796,
"loss": 2.511077117919922,
"step": 2370
},
{
"epoch": 1.053375,
"grad_norm": 0.3502793610095978,
"learning_rate": 0.00013513343826153157,
"loss": 2.4827537536621094,
"step": 2380
},
{
"epoch": 1.054625,
"grad_norm": 0.3351482152938843,
"learning_rate": 0.0001348775645058165,
"loss": 2.5033424377441404,
"step": 2390
},
{
"epoch": 1.055875,
"grad_norm": 0.3501179814338684,
"learning_rate": 0.00013462084025927,
"loss": 2.4896453857421874,
"step": 2400
},
{
"epoch": 1.057125,
"grad_norm": 0.3435039222240448,
"learning_rate": 0.00013436326968652593,
"loss": 2.5125568389892576,
"step": 2410
},
{
"epoch": 1.058375,
"grad_norm": 0.34035417437553406,
"learning_rate": 0.00013410485696594768,
"loss": 2.4909286499023438,
"step": 2420
},
{
"epoch": 1.059625,
"grad_norm": 0.33129122853279114,
"learning_rate": 0.00013384560628956,
"loss": 2.556411361694336,
"step": 2430
},
{
"epoch": 1.060875,
"grad_norm": 0.3542681932449341,
"learning_rate": 0.0001335855218629812,
"loss": 2.469993782043457,
"step": 2440
},
{
"epoch": 1.062125,
"grad_norm": 0.3372875154018402,
"learning_rate": 0.00013332460790535473,
"loss": 2.4866575241088866,
"step": 2450
},
{
"epoch": 1.063375,
"grad_norm": 0.3469390273094177,
"learning_rate": 0.000133062868649281,
"loss": 2.4757783889770506,
"step": 2460
},
{
"epoch": 1.064625,
"grad_norm": 0.3474292457103729,
"learning_rate": 0.0001328003083407486,
"loss": 2.4724506378173827,
"step": 2470
},
{
"epoch": 1.065875,
"grad_norm": 0.3638366758823395,
"learning_rate": 0.0001325369312390653,
"loss": 2.5047348022460936,
"step": 2480
},
{
"epoch": 1.067125,
"grad_norm": 0.33961430191993713,
"learning_rate": 0.0001322727416167891,
"loss": 2.50977783203125,
"step": 2490
},
{
"epoch": 1.068375,
"grad_norm": 0.34472352266311646,
"learning_rate": 0.00013200774375965883,
"loss": 2.4912172317504884,
"step": 2500
},
{
"epoch": 1.069625,
"grad_norm": 0.3468291461467743,
"learning_rate": 0.00013174194196652477,
"loss": 2.4802589416503906,
"step": 2510
},
{
"epoch": 1.070875,
"grad_norm": 0.3530188500881195,
"learning_rate": 0.00013147534054927878,
"loss": 2.4657310485839843,
"step": 2520
},
{
"epoch": 1.072125,
"grad_norm": 0.3488495647907257,
"learning_rate": 0.00013120794383278438,
"loss": 2.4873979568481444,
"step": 2530
},
{
"epoch": 1.073375,
"grad_norm": 0.3713776171207428,
"learning_rate": 0.0001309397561548066,
"loss": 2.47833137512207,
"step": 2540
},
{
"epoch": 1.074625,
"grad_norm": 0.33978715538978577,
"learning_rate": 0.00013067078186594156,
"loss": 2.4833608627319337,
"step": 2550
},
{
"epoch": 1.075875,
"grad_norm": 0.34602415561676025,
"learning_rate": 0.000130401025329546,
"loss": 2.50838623046875,
"step": 2560
},
{
"epoch": 1.077125,
"grad_norm": 0.33973225951194763,
"learning_rate": 0.00013013049092166652,
"loss": 2.4615432739257814,
"step": 2570
},
{
"epoch": 1.078375,
"grad_norm": 0.35132381319999695,
"learning_rate": 0.00012985918303096833,
"loss": 2.4790775299072267,
"step": 2580
},
{
"epoch": 1.079625,
"grad_norm": 0.3466247618198395,
"learning_rate": 0.00012958710605866436,
"loss": 2.4747478485107424,
"step": 2590
},
{
"epoch": 1.080875,
"grad_norm": 0.3456020951271057,
"learning_rate": 0.00012931426441844374,
"loss": 2.5099910736083983,
"step": 2600
},
{
"epoch": 1.082125,
"grad_norm": 0.3543621301651001,
"learning_rate": 0.00012904066253640017,
"loss": 2.4894287109375,
"step": 2610
},
{
"epoch": 1.083375,
"grad_norm": 0.3563762605190277,
"learning_rate": 0.00012876630485096017,
"loss": 2.476998138427734,
"step": 2620
},
{
"epoch": 1.084625,
"grad_norm": 0.35365182161331177,
"learning_rate": 0.000128491195812811,
"loss": 2.479985809326172,
"step": 2630
},
{
"epoch": 1.085875,
"grad_norm": 0.3529748320579529,
"learning_rate": 0.00012821533988482863,
"loss": 2.4728267669677733,
"step": 2640
},
{
"epoch": 1.087125,
"grad_norm": 0.33992883563041687,
"learning_rate": 0.00012793874154200515,
"loss": 2.4903228759765623,
"step": 2650
},
{
"epoch": 1.088375,
"grad_norm": 0.3493664562702179,
"learning_rate": 0.00012766140527137627,
"loss": 2.4863492965698244,
"step": 2660
},
{
"epoch": 1.089625,
"grad_norm": 0.3460827171802521,
"learning_rate": 0.00012738333557194855,
"loss": 2.449415588378906,
"step": 2670
},
{
"epoch": 1.090875,
"grad_norm": 0.34357935190200806,
"learning_rate": 0.00012710453695462633,
"loss": 2.463718795776367,
"step": 2680
},
{
"epoch": 1.092125,
"grad_norm": 0.3472927510738373,
"learning_rate": 0.00012682501394213866,
"loss": 2.445463943481445,
"step": 2690
},
{
"epoch": 1.093375,
"grad_norm": 0.350157767534256,
"learning_rate": 0.00012654477106896584,
"loss": 2.4972408294677733,
"step": 2700
},
{
"epoch": 1.094625,
"grad_norm": 0.34597107768058777,
"learning_rate": 0.00012626381288126593,
"loss": 2.487579917907715,
"step": 2710
},
{
"epoch": 1.095875,
"grad_norm": 0.3472701609134674,
"learning_rate": 0.00012598214393680097,
"loss": 2.493511199951172,
"step": 2720
},
{
"epoch": 1.097125,
"grad_norm": 0.35611027479171753,
"learning_rate": 0.00012569976880486298,
"loss": 2.4602516174316404,
"step": 2730
},
{
"epoch": 1.098375,
"grad_norm": 0.34450188279151917,
"learning_rate": 0.00012541669206620002,
"loss": 2.47379093170166,
"step": 2740
},
{
"epoch": 1.099625,
"grad_norm": 0.3402315080165863,
"learning_rate": 0.0001251329183129416,
"loss": 2.4741775512695314,
"step": 2750
},
{
"epoch": 1.100875,
"grad_norm": 0.36293742060661316,
"learning_rate": 0.00012484845214852453,
"loss": 2.478403854370117,
"step": 2760
},
{
"epoch": 1.102125,
"grad_norm": 0.34437844157218933,
"learning_rate": 0.00012456329818761794,
"loss": 2.489897918701172,
"step": 2770
},
{
"epoch": 1.103375,
"grad_norm": 0.34425976872444153,
"learning_rate": 0.0001242774610560485,
"loss": 2.484636688232422,
"step": 2780
},
{
"epoch": 1.104625,
"grad_norm": 0.3487798273563385,
"learning_rate": 0.00012399094539072557,
"loss": 2.4807788848876955,
"step": 2790
},
{
"epoch": 1.105875,
"grad_norm": 0.36608222126960754,
"learning_rate": 0.00012370375583956562,
"loss": 2.498831939697266,
"step": 2800
}
],
"logging_steps": 10,
"max_steps": 8000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.7508190343633306e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}