SykoLLM-v6.5 / trainer_state.json
SykoSLM's picture
SykoLLM v6.5
a00cbcc verified
Raw
History Blame Contribute Delete
49.6 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.105875,
"eval_steps": 500,
"global_step": 2800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00125,
"grad_norm": 0.32223036885261536,
"learning_rate": 9.890999999999999e-06,
"loss": 2.9122833251953124,
"step": 10
},
{
"epoch": 0.0025,
"grad_norm": 0.3097129762172699,
"learning_rate": 2.0881000000000002e-05,
"loss": 2.881389617919922,
"step": 20
},
{
"epoch": 0.00375,
"grad_norm": 0.30452761054039,
"learning_rate": 3.1871e-05,
"loss": 2.8967803955078124,
"step": 30
},
{
"epoch": 0.005,
"grad_norm": 0.2955208420753479,
"learning_rate": 4.2861e-05,
"loss": 2.8681930541992187,
"step": 40
},
{
"epoch": 0.00625,
"grad_norm": 0.303114652633667,
"learning_rate": 5.3850999999999997e-05,
"loss": 2.8751144409179688,
"step": 50
},
{
"epoch": 0.0075,
"grad_norm": 0.299868106842041,
"learning_rate": 6.4841e-05,
"loss": 2.8749458312988283,
"step": 60
},
{
"epoch": 0.00875,
"grad_norm": 0.31019559502601624,
"learning_rate": 7.5831e-05,
"loss": 2.860850524902344,
"step": 70
},
{
"epoch": 0.01,
"grad_norm": 0.30683860182762146,
"learning_rate": 8.6821e-05,
"loss": 2.8604888916015625,
"step": 80
},
{
"epoch": 0.01125,
"grad_norm": 0.30718618631362915,
"learning_rate": 9.7811e-05,
"loss": 2.8761545181274415,
"step": 90
},
{
"epoch": 0.0125,
"grad_norm": 0.32211124897003174,
"learning_rate": 0.000108801,
"loss": 2.8468536376953124,
"step": 100
},
{
"epoch": 0.01375,
"grad_norm": 0.31109386682510376,
"learning_rate": 0.000119791,
"loss": 2.8445552825927733,
"step": 110
},
{
"epoch": 0.015,
"grad_norm": 0.3102831542491913,
"learning_rate": 0.000130781,
"loss": 2.864554214477539,
"step": 120
},
{
"epoch": 0.01625,
"grad_norm": 0.3220812976360321,
"learning_rate": 0.000141771,
"loss": 2.8806329727172852,
"step": 130
},
{
"epoch": 0.0175,
"grad_norm": 0.30876684188842773,
"learning_rate": 0.00015276099999999998,
"loss": 2.8452987670898438,
"step": 140
},
{
"epoch": 0.01875,
"grad_norm": 0.31868258118629456,
"learning_rate": 0.000163751,
"loss": 2.8517589569091797,
"step": 150
},
{
"epoch": 0.02,
"grad_norm": 0.3087399899959564,
"learning_rate": 0.000174741,
"loss": 2.8347522735595705,
"step": 160
},
{
"epoch": 0.02125,
"grad_norm": 0.3106062710285187,
"learning_rate": 0.000185731,
"loss": 2.85534553527832,
"step": 170
},
{
"epoch": 0.0225,
"grad_norm": 0.32315531373023987,
"learning_rate": 0.00019672100000000002,
"loss": 2.858936309814453,
"step": 180
},
{
"epoch": 0.02375,
"grad_norm": 0.3293415307998657,
"learning_rate": 0.000207711,
"loss": 2.8992713928222655,
"step": 190
},
{
"epoch": 0.025,
"grad_norm": 0.3309278190135956,
"learning_rate": 0.000218701,
"loss": 2.863359832763672,
"step": 200
},
{
"epoch": 0.02625,
"grad_norm": 0.3089866638183594,
"learning_rate": 0.0002197992779574687,
"loss": 2.8769275665283205,
"step": 210
},
{
"epoch": 0.0275,
"grad_norm": 0.32934558391571045,
"learning_rate": 0.0002197967820201583,
"loss": 2.8595829010009766,
"step": 220
},
{
"epoch": 0.02875,
"grad_norm": 0.3154727518558502,
"learning_rate": 0.00021979250331444358,
"loss": 2.8655704498291015,
"step": 230
},
{
"epoch": 0.03,
"grad_norm": 0.31462976336479187,
"learning_rate": 0.0002197864419097345,
"loss": 2.8554920196533202,
"step": 240
},
{
"epoch": 0.03125,
"grad_norm": 0.3277083933353424,
"learning_rate": 0.00021977859790436047,
"loss": 2.896647262573242,
"step": 250
},
{
"epoch": 0.0325,
"grad_norm": 0.3230266571044922,
"learning_rate": 0.00021976897142556858,
"loss": 2.8914859771728514,
"step": 260
},
{
"epoch": 0.03375,
"grad_norm": 0.3097373843193054,
"learning_rate": 0.00021975756262952153,
"loss": 2.867509460449219,
"step": 270
},
{
"epoch": 0.035,
"grad_norm": 0.31526896357536316,
"learning_rate": 0.00021974437170129525,
"loss": 2.861627388000488,
"step": 280
},
{
"epoch": 0.03625,
"grad_norm": 0.3264971077442169,
"learning_rate": 0.0002197293988548756,
"loss": 2.8434619903564453,
"step": 290
},
{
"epoch": 0.0375,
"grad_norm": 0.31985947489738464,
"learning_rate": 0.00021971264433315533,
"loss": 2.858683776855469,
"step": 300
},
{
"epoch": 0.03875,
"grad_norm": 0.3007575571537018,
"learning_rate": 0.00021969410840792965,
"loss": 2.856831359863281,
"step": 310
},
{
"epoch": 0.04,
"grad_norm": 0.32181960344314575,
"learning_rate": 0.00021967379137989224,
"loss": 2.8669090270996094,
"step": 320
},
{
"epoch": 0.04125,
"grad_norm": 0.3142366111278534,
"learning_rate": 0.00021965169357863014,
"loss": 2.864155578613281,
"step": 330
},
{
"epoch": 0.0425,
"grad_norm": 0.31411442160606384,
"learning_rate": 0.00021962781536261853,
"loss": 2.8719043731689453,
"step": 340
},
{
"epoch": 0.04375,
"grad_norm": 0.3069416880607605,
"learning_rate": 0.00021960215711921467,
"loss": 2.8788784027099608,
"step": 350
},
{
"epoch": 0.045,
"grad_norm": 0.3287704586982727,
"learning_rate": 0.00021957471926465198,
"loss": 2.8686893463134764,
"step": 360
},
{
"epoch": 0.04625,
"grad_norm": 0.32815802097320557,
"learning_rate": 0.00021954550224403304,
"loss": 2.872859573364258,
"step": 370
},
{
"epoch": 0.0475,
"grad_norm": 0.3123241066932678,
"learning_rate": 0.0002195145065313224,
"loss": 2.861919975280762,
"step": 380
},
{
"epoch": 0.04875,
"grad_norm": 0.3143039643764496,
"learning_rate": 0.0002194817326293389,
"loss": 2.8754358291625977,
"step": 390
},
{
"epoch": 0.05,
"grad_norm": 0.32305410504341125,
"learning_rate": 0.00021944718106974763,
"loss": 2.830820083618164,
"step": 400
},
{
"epoch": 0.05125,
"grad_norm": 0.3187738060951233,
"learning_rate": 0.00021941085241305118,
"loss": 2.8469779968261717,
"step": 410
},
{
"epoch": 0.0525,
"grad_norm": 0.3240358829498291,
"learning_rate": 0.00021937274724858052,
"loss": 2.872676467895508,
"step": 420
},
{
"epoch": 0.05375,
"grad_norm": 0.3307654857635498,
"learning_rate": 0.00021933286619448556,
"loss": 2.868929862976074,
"step": 430
},
{
"epoch": 0.055,
"grad_norm": 0.31867194175720215,
"learning_rate": 0.00021929120989772503,
"loss": 2.837067794799805,
"step": 440
},
{
"epoch": 0.05625,
"grad_norm": 0.3109733760356903,
"learning_rate": 0.00021924777903405596,
"loss": 2.8356159210205076,
"step": 450
},
{
"epoch": 0.0575,
"grad_norm": 0.33047595620155334,
"learning_rate": 0.00021920257430802295,
"loss": 2.859963226318359,
"step": 460
},
{
"epoch": 0.05875,
"grad_norm": 0.3140341341495514,
"learning_rate": 0.00021915559645294634,
"loss": 2.864061737060547,
"step": 470
},
{
"epoch": 0.06,
"grad_norm": 0.30880865454673767,
"learning_rate": 0.0002191068462309107,
"loss": 2.8523515701293944,
"step": 480
},
{
"epoch": 0.06125,
"grad_norm": 0.3137487769126892,
"learning_rate": 0.00021905632443275225,
"loss": 2.8639093399047852,
"step": 490
},
{
"epoch": 0.0625,
"grad_norm": 0.340537965297699,
"learning_rate": 0.00021900403187804607,
"loss": 2.8927494049072267,
"step": 500
},
{
"epoch": 0.06375,
"grad_norm": 0.31051260232925415,
"learning_rate": 0.00021894996941509282,
"loss": 2.840711212158203,
"step": 510
},
{
"epoch": 0.065,
"grad_norm": 0.3152431547641754,
"learning_rate": 0.00021889413792090502,
"loss": 2.862700653076172,
"step": 520
},
{
"epoch": 0.06625,
"grad_norm": 0.3119368553161621,
"learning_rate": 0.00021883653830119274,
"loss": 2.8526124954223633,
"step": 530
},
{
"epoch": 0.0675,
"grad_norm": 0.31616318225860596,
"learning_rate": 0.00021877717149034896,
"loss": 2.855159378051758,
"step": 540
},
{
"epoch": 0.06875,
"grad_norm": 0.30254286527633667,
"learning_rate": 0.00021871603845143443,
"loss": 2.854717254638672,
"step": 550
},
{
"epoch": 0.07,
"grad_norm": 0.3120061159133911,
"learning_rate": 0.000218653140176162,
"loss": 2.850946044921875,
"step": 560
},
{
"epoch": 0.07125,
"grad_norm": 0.30754292011260986,
"learning_rate": 0.00021858847768488048,
"loss": 2.8386112213134767,
"step": 570
},
{
"epoch": 0.0725,
"grad_norm": 0.3003309667110443,
"learning_rate": 0.0002185220520265583,
"loss": 2.858784294128418,
"step": 580
},
{
"epoch": 0.07375,
"grad_norm": 0.31817367672920227,
"learning_rate": 0.00021845386427876622,
"loss": 2.8400810241699217,
"step": 590
},
{
"epoch": 0.075,
"grad_norm": 0.31158024072647095,
"learning_rate": 0.00021838391554766004,
"loss": 2.8315425872802735,
"step": 600
},
{
"epoch": 0.07625,
"grad_norm": 0.31356877088546753,
"learning_rate": 0.00021831220696796264,
"loss": 2.85643310546875,
"step": 610
},
{
"epoch": 0.0775,
"grad_norm": 0.3057396411895752,
"learning_rate": 0.00021823873970294543,
"loss": 2.8644752502441406,
"step": 620
},
{
"epoch": 0.07875,
"grad_norm": 0.30540961027145386,
"learning_rate": 0.00021816351494440965,
"loss": 2.840130615234375,
"step": 630
},
{
"epoch": 0.08,
"grad_norm": 0.3201405107975006,
"learning_rate": 0.00021808653391266697,
"loss": 2.81726016998291,
"step": 640
},
{
"epoch": 0.08125,
"grad_norm": 0.31356149911880493,
"learning_rate": 0.0002180077978565196,
"loss": 2.841321563720703,
"step": 650
},
{
"epoch": 0.0825,
"grad_norm": 0.3322322368621826,
"learning_rate": 0.00021792730805324023,
"loss": 2.833037185668945,
"step": 660
},
{
"epoch": 0.08375,
"grad_norm": 0.3101900517940521,
"learning_rate": 0.0002178450658085511,
"loss": 2.8306228637695314,
"step": 670
},
{
"epoch": 0.085,
"grad_norm": 0.31162750720977783,
"learning_rate": 0.00021776107245660307,
"loss": 2.849654769897461,
"step": 680
},
{
"epoch": 0.08625,
"grad_norm": 0.3168909251689911,
"learning_rate": 0.00021767532935995366,
"loss": 2.882074737548828,
"step": 690
},
{
"epoch": 0.0875,
"grad_norm": 0.2994805574417114,
"learning_rate": 0.00021758783790954515,
"loss": 2.834335517883301,
"step": 700
},
{
"epoch": 0.08875,
"grad_norm": 0.3097037672996521,
"learning_rate": 0.0002174985995246821,
"loss": 2.8143672943115234,
"step": 710
},
{
"epoch": 0.09,
"grad_norm": 0.32182541489601135,
"learning_rate": 0.00021740761565300799,
"loss": 2.845683288574219,
"step": 720
},
{
"epoch": 0.09125,
"grad_norm": 0.32514718174934387,
"learning_rate": 0.00021731488777048213,
"loss": 2.8221324920654296,
"step": 730
},
{
"epoch": 0.0925,
"grad_norm": 0.3028743267059326,
"learning_rate": 0.0002172204173813555,
"loss": 2.8356349945068358,
"step": 740
},
{
"epoch": 0.09375,
"grad_norm": 0.3133573830127716,
"learning_rate": 0.0002171242060181463,
"loss": 2.838234710693359,
"step": 750
},
{
"epoch": 0.095,
"grad_norm": 0.3107962906360626,
"learning_rate": 0.00021702625524161527,
"loss": 2.8331020355224608,
"step": 760
},
{
"epoch": 0.09625,
"grad_norm": 0.3266642987728119,
"learning_rate": 0.00021692656664074023,
"loss": 2.847811698913574,
"step": 770
},
{
"epoch": 0.0975,
"grad_norm": 0.3073740303516388,
"learning_rate": 0.00021682514183269034,
"loss": 2.8351299285888674,
"step": 780
},
{
"epoch": 0.09875,
"grad_norm": 0.3130224645137787,
"learning_rate": 0.00021672198246279985,
"loss": 2.7890214920043945,
"step": 790
},
{
"epoch": 0.1,
"grad_norm": 0.3218679130077362,
"learning_rate": 0.00021661709020454157,
"loss": 2.8209762573242188,
"step": 800
},
{
"epoch": 0.10125,
"grad_norm": 0.2967888414859772,
"learning_rate": 0.00021651046675949938,
"loss": 2.819289207458496,
"step": 810
},
{
"epoch": 0.1025,
"grad_norm": 0.32564085721969604,
"learning_rate": 0.000216402113857341,
"loss": 2.8148468017578123,
"step": 820
},
{
"epoch": 0.10375,
"grad_norm": 0.30720430612564087,
"learning_rate": 0.00021629203325578962,
"loss": 2.832720947265625,
"step": 830
},
{
"epoch": 0.105,
"grad_norm": 0.31553003191947937,
"learning_rate": 0.00021618022674059568,
"loss": 2.8313037872314455,
"step": 840
},
{
"epoch": 0.10625,
"grad_norm": 0.2927679121494293,
"learning_rate": 0.0002160666961255076,
"loss": 2.822229766845703,
"step": 850
},
{
"epoch": 0.1075,
"grad_norm": 0.3168841302394867,
"learning_rate": 0.00021595144325224264,
"loss": 2.8234331130981447,
"step": 860
},
{
"epoch": 0.10875,
"grad_norm": 0.3195788860321045,
"learning_rate": 0.0002158344699904568,
"loss": 2.8171760559082033,
"step": 870
},
{
"epoch": 0.11,
"grad_norm": 0.30483055114746094,
"learning_rate": 0.00021571577823771462,
"loss": 2.82617244720459,
"step": 880
},
{
"epoch": 0.11125,
"grad_norm": 0.31678906083106995,
"learning_rate": 0.00021559536991945833,
"loss": 2.8162193298339844,
"step": 890
},
{
"epoch": 0.1125,
"grad_norm": 0.30715152621269226,
"learning_rate": 0.00021547324698897665,
"loss": 2.8252620697021484,
"step": 900
},
{
"epoch": 0.11375,
"grad_norm": 0.303281307220459,
"learning_rate": 0.00021534941142737314,
"loss": 2.8220481872558594,
"step": 910
},
{
"epoch": 0.115,
"grad_norm": 0.3042793571949005,
"learning_rate": 0.00021522386524353395,
"loss": 2.825517272949219,
"step": 920
},
{
"epoch": 0.11625,
"grad_norm": 0.328135222196579,
"learning_rate": 0.00021509661047409534,
"loss": 2.806531524658203,
"step": 930
},
{
"epoch": 0.1175,
"grad_norm": 0.30471575260162354,
"learning_rate": 0.00021496764918341058,
"loss": 2.8206180572509765,
"step": 940
},
{
"epoch": 0.11875,
"grad_norm": 0.3096025884151459,
"learning_rate": 0.0002148369834635165,
"loss": 2.8001310348510744,
"step": 950
},
{
"epoch": 0.12,
"grad_norm": 0.30915719270706177,
"learning_rate": 0.0002147046154340995,
"loss": 2.838936996459961,
"step": 960
},
{
"epoch": 0.12125,
"grad_norm": 0.30633190274238586,
"learning_rate": 0.00021457054724246125,
"loss": 2.8280914306640623,
"step": 970
},
{
"epoch": 0.1225,
"grad_norm": 0.3169943392276764,
"learning_rate": 0.00021443478106348375,
"loss": 2.8208492279052733,
"step": 980
},
{
"epoch": 0.12375,
"grad_norm": 0.31402623653411865,
"learning_rate": 0.00021429731909959417,
"loss": 2.803514099121094,
"step": 990
},
{
"epoch": 0.125,
"grad_norm": 0.31064271926879883,
"learning_rate": 0.00021415816358072898,
"loss": 2.828254508972168,
"step": 1000
},
{
"epoch": 0.12625,
"grad_norm": 0.3190893530845642,
"learning_rate": 0.00021401731676429792,
"loss": 2.814365196228027,
"step": 1010
},
{
"epoch": 0.1275,
"grad_norm": 0.3164026141166687,
"learning_rate": 0.00021387478093514724,
"loss": 2.803851509094238,
"step": 1020
},
{
"epoch": 0.12875,
"grad_norm": 0.3159414529800415,
"learning_rate": 0.00021373055840552275,
"loss": 2.8509082794189453,
"step": 1030
},
{
"epoch": 0.13,
"grad_norm": 0.3224294185638428,
"learning_rate": 0.00021358465151503225,
"loss": 2.789044952392578,
"step": 1040
},
{
"epoch": 0.13125,
"grad_norm": 0.31033849716186523,
"learning_rate": 0.00021343706263060765,
"loss": 2.8226268768310545,
"step": 1050
},
{
"epoch": 0.1325,
"grad_norm": 0.3086431622505188,
"learning_rate": 0.00021328779414646635,
"loss": 2.8077007293701173,
"step": 1060
},
{
"epoch": 0.13375,
"grad_norm": 0.3155769109725952,
"learning_rate": 0.00021313684848407282,
"loss": 2.8190916061401365,
"step": 1070
},
{
"epoch": 0.135,
"grad_norm": 0.3062079846858978,
"learning_rate": 0.0002129842280920988,
"loss": 2.8035049438476562,
"step": 1080
},
{
"epoch": 0.13625,
"grad_norm": 0.3113609552383423,
"learning_rate": 0.000212829935446384,
"loss": 2.808064842224121,
"step": 1090
},
{
"epoch": 0.1375,
"grad_norm": 0.3248916566371918,
"learning_rate": 0.0002126739730498958,
"loss": 2.8036418914794923,
"step": 1100
},
{
"epoch": 0.13875,
"grad_norm": 0.314177930355072,
"learning_rate": 0.00021251634343268845,
"loss": 2.8073291778564453,
"step": 1110
},
{
"epoch": 0.14,
"grad_norm": 0.31667032837867737,
"learning_rate": 0.00021235704915186242,
"loss": 2.8247406005859377,
"step": 1120
},
{
"epoch": 0.14125,
"grad_norm": 0.32587730884552,
"learning_rate": 0.0002121960927915225,
"loss": 2.81424560546875,
"step": 1130
},
{
"epoch": 0.1425,
"grad_norm": 0.3099067509174347,
"learning_rate": 0.00021203347696273621,
"loss": 2.833042526245117,
"step": 1140
},
{
"epoch": 0.14375,
"grad_norm": 0.3176534175872803,
"learning_rate": 0.0002118692043034913,
"loss": 2.8056007385253907,
"step": 1150
},
{
"epoch": 0.145,
"grad_norm": 0.32910725474357605,
"learning_rate": 0.00021170327747865292,
"loss": 2.791951370239258,
"step": 1160
},
{
"epoch": 0.14625,
"grad_norm": 0.31169673800468445,
"learning_rate": 0.00021153569917992042,
"loss": 2.809808540344238,
"step": 1170
},
{
"epoch": 0.1475,
"grad_norm": 0.31293970346450806,
"learning_rate": 0.00021136647212578378,
"loss": 2.7925342559814452,
"step": 1180
},
{
"epoch": 0.14875,
"grad_norm": 0.3170998990535736,
"learning_rate": 0.00021119559906147942,
"loss": 2.809326934814453,
"step": 1190
},
{
"epoch": 0.15,
"grad_norm": 0.30116304755210876,
"learning_rate": 0.00021102308275894555,
"loss": 2.7981502532958986,
"step": 1200
},
{
"epoch": 0.15125,
"grad_norm": 0.30669230222702026,
"learning_rate": 0.0002108489260167775,
"loss": 2.7857837677001953,
"step": 1210
},
{
"epoch": 0.1525,
"grad_norm": 0.30800774693489075,
"learning_rate": 0.00021067313166018209,
"loss": 2.806937408447266,
"step": 1220
},
{
"epoch": 0.15375,
"grad_norm": 0.3087230622768402,
"learning_rate": 0.00021049570254093184,
"loss": 2.8145347595214845,
"step": 1230
},
{
"epoch": 0.155,
"grad_norm": 0.30576276779174805,
"learning_rate": 0.00021031664153731874,
"loss": 2.806387710571289,
"step": 1240
},
{
"epoch": 0.15625,
"grad_norm": 0.3263702392578125,
"learning_rate": 0.00021013595155410756,
"loss": 2.773836135864258,
"step": 1250
},
{
"epoch": 0.1575,
"grad_norm": 0.3177431523799896,
"learning_rate": 0.00020995363552248867,
"loss": 2.7588844299316406,
"step": 1260
},
{
"epoch": 0.15875,
"grad_norm": 0.30336225032806396,
"learning_rate": 0.00020976969640003064,
"loss": 2.8113712310791015,
"step": 1270
},
{
"epoch": 0.16,
"grad_norm": 0.32169830799102783,
"learning_rate": 0.000209584137170632,
"loss": 2.788315773010254,
"step": 1280
},
{
"epoch": 0.16125,
"grad_norm": 0.30413737893104553,
"learning_rate": 0.00020939696084447314,
"loss": 2.7458065032958983,
"step": 1290
},
{
"epoch": 0.1625,
"grad_norm": 0.3089154064655304,
"learning_rate": 0.00020920817045796727,
"loss": 2.7877056121826174,
"step": 1300
},
{
"epoch": 0.16375,
"grad_norm": 0.30705851316452026,
"learning_rate": 0.00020901776907371116,
"loss": 2.773893356323242,
"step": 1310
},
{
"epoch": 0.165,
"grad_norm": 0.3133976459503174,
"learning_rate": 0.00020882575978043566,
"loss": 2.784181594848633,
"step": 1320
},
{
"epoch": 0.16625,
"grad_norm": 0.31440430879592896,
"learning_rate": 0.00020863214569295533,
"loss": 2.8143083572387697,
"step": 1330
},
{
"epoch": 0.1675,
"grad_norm": 0.29583054780960083,
"learning_rate": 0.00020843692995211805,
"loss": 2.7985980987548826,
"step": 1340
},
{
"epoch": 0.16875,
"grad_norm": 0.3040190637111664,
"learning_rate": 0.0002082401157247541,
"loss": 2.774214744567871,
"step": 1350
},
{
"epoch": 0.17,
"grad_norm": 0.30737006664276123,
"learning_rate": 0.00020804170620362475,
"loss": 2.803047943115234,
"step": 1360
},
{
"epoch": 0.17125,
"grad_norm": 0.30594661831855774,
"learning_rate": 0.0002078417046073704,
"loss": 2.7990367889404295,
"step": 1370
},
{
"epoch": 0.1725,
"grad_norm": 0.3074641823768616,
"learning_rate": 0.00020764011418045845,
"loss": 2.770071792602539,
"step": 1380
},
{
"epoch": 0.17375,
"grad_norm": 0.304598331451416,
"learning_rate": 0.00020743693819313063,
"loss": 2.7667999267578125,
"step": 1390
},
{
"epoch": 0.175,
"grad_norm": 0.32464832067489624,
"learning_rate": 0.00020723217994135003,
"loss": 2.8097129821777345,
"step": 1400
},
{
"epoch": 0.17625,
"grad_norm": 0.3164089620113373,
"learning_rate": 0.00020702584274674742,
"loss": 2.7955820083618166,
"step": 1410
},
{
"epoch": 0.1775,
"grad_norm": 0.310068279504776,
"learning_rate": 0.00020681792995656763,
"loss": 2.7704933166503904,
"step": 1420
},
{
"epoch": 0.17875,
"grad_norm": 0.3000030219554901,
"learning_rate": 0.00020660844494361513,
"loss": 2.8106201171875,
"step": 1430
},
{
"epoch": 0.18,
"grad_norm": 0.3196377456188202,
"learning_rate": 0.00020639739110619917,
"loss": 2.7796897888183594,
"step": 1440
},
{
"epoch": 0.18125,
"grad_norm": 0.3006730079650879,
"learning_rate": 0.000206184771868079,
"loss": 2.791950225830078,
"step": 1450
},
{
"epoch": 0.1825,
"grad_norm": 0.3123355805873871,
"learning_rate": 0.000205970590678408,
"loss": 2.7847476959228517,
"step": 1460
},
{
"epoch": 0.18375,
"grad_norm": 0.31853538751602173,
"learning_rate": 0.00020575485101167782,
"loss": 2.7865251541137694,
"step": 1470
},
{
"epoch": 0.185,
"grad_norm": 0.30936041474342346,
"learning_rate": 0.0002055375563676622,
"loss": 2.7906095504760744,
"step": 1480
},
{
"epoch": 0.18625,
"grad_norm": 0.30842670798301697,
"learning_rate": 0.0002053187102713599,
"loss": 2.7754417419433595,
"step": 1490
},
{
"epoch": 0.1875,
"grad_norm": 0.3201216757297516,
"learning_rate": 0.00020509831627293766,
"loss": 2.796547698974609,
"step": 1500
},
{
"epoch": 0.18875,
"grad_norm": 0.3134450316429138,
"learning_rate": 0.00020487637794767275,
"loss": 2.7649627685546876,
"step": 1510
},
{
"epoch": 0.19,
"grad_norm": 0.3114699721336365,
"learning_rate": 0.00020465289889589467,
"loss": 2.8279897689819338,
"step": 1520
},
{
"epoch": 0.19125,
"grad_norm": 0.3171784281730652,
"learning_rate": 0.00020442788274292704,
"loss": 2.776567840576172,
"step": 1530
},
{
"epoch": 0.1925,
"grad_norm": 0.30708587169647217,
"learning_rate": 0.00020420133313902856,
"loss": 2.786650466918945,
"step": 1540
},
{
"epoch": 0.19375,
"grad_norm": 0.3005415201187134,
"learning_rate": 0.00020397325375933387,
"loss": 2.7795650482177736,
"step": 1550
},
{
"epoch": 0.195,
"grad_norm": 0.30447477102279663,
"learning_rate": 0.0002037436483037941,
"loss": 2.7910282135009767,
"step": 1560
},
{
"epoch": 0.19625,
"grad_norm": 0.308108389377594,
"learning_rate": 0.0002035125204971165,
"loss": 2.7864933013916016,
"step": 1570
},
{
"epoch": 0.1975,
"grad_norm": 0.31156831979751587,
"learning_rate": 0.00020327987408870436,
"loss": 2.77624397277832,
"step": 1580
},
{
"epoch": 0.19875,
"grad_norm": 0.30407053232192993,
"learning_rate": 0.00020304571285259602,
"loss": 2.786225509643555,
"step": 1590
},
{
"epoch": 0.2,
"grad_norm": 0.30873724818229675,
"learning_rate": 0.0002028100405874036,
"loss": 2.7831089019775392,
"step": 1600
},
{
"epoch": 0.20125,
"grad_norm": 0.305469274520874,
"learning_rate": 0.00020257286111625156,
"loss": 2.770510673522949,
"step": 1610
},
{
"epoch": 0.2025,
"grad_norm": 0.3133813440799713,
"learning_rate": 0.00020233417828671444,
"loss": 2.7937782287597654,
"step": 1620
},
{
"epoch": 0.20375,
"grad_norm": 0.3113247752189636,
"learning_rate": 0.00020209399597075463,
"loss": 2.811221694946289,
"step": 1630
},
{
"epoch": 0.205,
"grad_norm": 0.29653653502464294,
"learning_rate": 0.00020185231806465958,
"loss": 2.736056900024414,
"step": 1640
},
{
"epoch": 0.20625,
"grad_norm": 0.296674519777298,
"learning_rate": 0.00020160914848897833,
"loss": 2.773727035522461,
"step": 1650
},
{
"epoch": 0.2075,
"grad_norm": 0.3117091953754425,
"learning_rate": 0.00020136449118845828,
"loss": 2.7696605682373048,
"step": 1660
},
{
"epoch": 0.20875,
"grad_norm": 0.3065008819103241,
"learning_rate": 0.00020111835013198088,
"loss": 2.7859319686889648,
"step": 1670
},
{
"epoch": 0.21,
"grad_norm": 0.30614563822746277,
"learning_rate": 0.00020087072931249746,
"loss": 2.761496734619141,
"step": 1680
},
{
"epoch": 0.21125,
"grad_norm": 0.3214632272720337,
"learning_rate": 0.0002006216327469644,
"loss": 2.795328140258789,
"step": 1690
},
{
"epoch": 0.2125,
"grad_norm": 0.3141666054725647,
"learning_rate": 0.00020037106447627772,
"loss": 2.7613990783691404,
"step": 1700
},
{
"epoch": 0.21375,
"grad_norm": 0.32107681035995483,
"learning_rate": 0.00020011902856520807,
"loss": 2.7515789031982423,
"step": 1710
},
{
"epoch": 0.215,
"grad_norm": 0.3231985867023468,
"learning_rate": 0.00019986552910233424,
"loss": 2.7852977752685546,
"step": 1720
},
{
"epoch": 0.21625,
"grad_norm": 0.3149876892566681,
"learning_rate": 0.00019961057019997707,
"loss": 2.754520225524902,
"step": 1730
},
{
"epoch": 0.2175,
"grad_norm": 0.31885862350463867,
"learning_rate": 0.00019935415599413287,
"loss": 2.7804901123046877,
"step": 1740
},
{
"epoch": 0.21875,
"grad_norm": 0.30009323358535767,
"learning_rate": 0.0001990962906444061,
"loss": 2.766156005859375,
"step": 1750
},
{
"epoch": 0.22,
"grad_norm": 0.31249675154685974,
"learning_rate": 0.00019883697833394186,
"loss": 2.779193878173828,
"step": 1760
},
{
"epoch": 0.22125,
"grad_norm": 0.30822932720184326,
"learning_rate": 0.0001985762232693584,
"loss": 2.7579469680786133,
"step": 1770
},
{
"epoch": 0.2225,
"grad_norm": 0.3053094446659088,
"learning_rate": 0.00019831402968067843,
"loss": 2.76893310546875,
"step": 1780
},
{
"epoch": 0.22375,
"grad_norm": 0.31457704305648804,
"learning_rate": 0.00019805040182126077,
"loss": 2.781879425048828,
"step": 1790
},
{
"epoch": 0.225,
"grad_norm": 0.30379778146743774,
"learning_rate": 0.00019778534396773127,
"loss": 2.783489799499512,
"step": 1800
},
{
"epoch": 0.22625,
"grad_norm": 0.31210359930992126,
"learning_rate": 0.0001975188604199134,
"loss": 2.7574298858642576,
"step": 1810
},
{
"epoch": 0.2275,
"grad_norm": 0.3024740219116211,
"learning_rate": 0.00019725095550075862,
"loss": 2.7888748168945314,
"step": 1820
},
{
"epoch": 0.22875,
"grad_norm": 0.3073548376560211,
"learning_rate": 0.0001969816335562761,
"loss": 2.7340553283691404,
"step": 1830
},
{
"epoch": 0.23,
"grad_norm": 0.31958791613578796,
"learning_rate": 0.00019671089895546232,
"loss": 2.804524230957031,
"step": 1840
},
{
"epoch": 0.23125,
"grad_norm": 0.3051760196685791,
"learning_rate": 0.00019643875609023017,
"loss": 2.775598907470703,
"step": 1850
},
{
"epoch": 0.2325,
"grad_norm": 0.3086925148963928,
"learning_rate": 0.0001961652093753377,
"loss": 2.7774431228637697,
"step": 1860
},
{
"epoch": 0.23375,
"grad_norm": 0.3144133388996124,
"learning_rate": 0.00019589026324831643,
"loss": 2.7702011108398437,
"step": 1870
},
{
"epoch": 0.235,
"grad_norm": 0.3036665916442871,
"learning_rate": 0.00019561392216939954,
"loss": 2.7927045822143555,
"step": 1880
},
{
"epoch": 0.23625,
"grad_norm": 0.30784451961517334,
"learning_rate": 0.00019533619062144934,
"loss": 2.741124725341797,
"step": 1890
},
{
"epoch": 0.2375,
"grad_norm": 0.29786407947540283,
"learning_rate": 0.00019505707310988463,
"loss": 2.748614501953125,
"step": 1900
},
{
"epoch": 0.23875,
"grad_norm": 0.30479830503463745,
"learning_rate": 0.00019477657416260764,
"loss": 2.7626161575317383,
"step": 1910
},
{
"epoch": 0.24,
"grad_norm": 0.30530789494514465,
"learning_rate": 0.0001944946983299305,
"loss": 2.7705900192260744,
"step": 1920
},
{
"epoch": 0.24125,
"grad_norm": 0.30881696939468384,
"learning_rate": 0.00019421145018450145,
"loss": 2.7753509521484374,
"step": 1930
},
{
"epoch": 0.2425,
"grad_norm": 0.30990368127822876,
"learning_rate": 0.00019392683432123065,
"loss": 2.7618339538574217,
"step": 1940
},
{
"epoch": 0.24375,
"grad_norm": 0.30068239569664,
"learning_rate": 0.00019364085535721574,
"loss": 2.751456451416016,
"step": 1950
},
{
"epoch": 1.000875,
"grad_norm": 0.32766178250312805,
"learning_rate": 0.00019335351793166682,
"loss": 2.9953849792480467,
"step": 1960
},
{
"epoch": 1.002125,
"grad_norm": 0.31653207540512085,
"learning_rate": 0.00019306482670583127,
"loss": 2.7172924041748048,
"step": 1970
},
{
"epoch": 1.003375,
"grad_norm": 0.30188634991645813,
"learning_rate": 0.000192774786362918,
"loss": 2.718875503540039,
"step": 1980
},
{
"epoch": 1.004625,
"grad_norm": 0.3092830777168274,
"learning_rate": 0.00019248340160802165,
"loss": 2.6953250885009767,
"step": 1990
},
{
"epoch": 1.005875,
"grad_norm": 0.3100144863128662,
"learning_rate": 0.00019219067716804626,
"loss": 2.7128387451171876,
"step": 2000
},
{
"epoch": 1.007125,
"grad_norm": 0.32156386971473694,
"learning_rate": 0.00019189661779162834,
"loss": 2.7038270950317385,
"step": 2010
},
{
"epoch": 1.008375,
"grad_norm": 0.3106272518634796,
"learning_rate": 0.00019160122824906018,
"loss": 2.7032100677490236,
"step": 2020
},
{
"epoch": 1.009625,
"grad_norm": 0.3121194541454315,
"learning_rate": 0.00019130451333221226,
"loss": 2.6769741058349608,
"step": 2030
},
{
"epoch": 1.010875,
"grad_norm": 0.31094688177108765,
"learning_rate": 0.0001910064778544555,
"loss": 2.6934465408325194,
"step": 2040
},
{
"epoch": 1.012125,
"grad_norm": 0.3150351941585541,
"learning_rate": 0.00019070712665058325,
"loss": 2.674116325378418,
"step": 2050
},
{
"epoch": 1.013375,
"grad_norm": 0.3132378160953522,
"learning_rate": 0.00019040646457673294,
"loss": 2.667017936706543,
"step": 2060
},
{
"epoch": 1.014625,
"grad_norm": 0.30859819054603577,
"learning_rate": 0.000190104496510307,
"loss": 2.6529170989990236,
"step": 2070
},
{
"epoch": 1.015875,
"grad_norm": 0.3140536844730377,
"learning_rate": 0.00018980122734989425,
"loss": 2.649005889892578,
"step": 2080
},
{
"epoch": 1.017125,
"grad_norm": 0.3163485825061798,
"learning_rate": 0.00018949666201518978,
"loss": 2.658115005493164,
"step": 2090
},
{
"epoch": 1.018375,
"grad_norm": 0.3046296536922455,
"learning_rate": 0.00018919080544691573,
"loss": 2.637746238708496,
"step": 2100
},
{
"epoch": 1.019625,
"grad_norm": 0.30639058351516724,
"learning_rate": 0.00018888366260674078,
"loss": 2.6267181396484376,
"step": 2110
},
{
"epoch": 1.020875,
"grad_norm": 0.3216869831085205,
"learning_rate": 0.00018857523847719992,
"loss": 2.6571407318115234,
"step": 2120
},
{
"epoch": 1.022125,
"grad_norm": 0.32431310415267944,
"learning_rate": 0.0001882655380616133,
"loss": 2.6225955963134764,
"step": 2130
},
{
"epoch": 1.023375,
"grad_norm": 0.3109528720378876,
"learning_rate": 0.0001879545663840053,
"loss": 2.633950042724609,
"step": 2140
},
{
"epoch": 1.024625,
"grad_norm": 0.32065126299858093,
"learning_rate": 0.00018764232848902314,
"loss": 2.602225494384766,
"step": 2150
},
{
"epoch": 1.025875,
"grad_norm": 0.32300078868865967,
"learning_rate": 0.00018732882944185462,
"loss": 2.615239715576172,
"step": 2160
},
{
"epoch": 1.027125,
"grad_norm": 0.3188120424747467,
"learning_rate": 0.00018701407432814644,
"loss": 2.594603157043457,
"step": 2170
},
{
"epoch": 1.028375,
"grad_norm": 0.3217035233974457,
"learning_rate": 0.00018669806825392132,
"loss": 2.601702117919922,
"step": 2180
},
{
"epoch": 1.029625,
"grad_norm": 0.322839617729187,
"learning_rate": 0.00018638081634549534,
"loss": 2.597119903564453,
"step": 2190
},
{
"epoch": 1.030875,
"grad_norm": 0.33341312408447266,
"learning_rate": 0.00018606232374939488,
"loss": 2.604803466796875,
"step": 2200
},
{
"epoch": 1.032125,
"grad_norm": 0.32422640919685364,
"learning_rate": 0.00018574259563227289,
"loss": 2.622762107849121,
"step": 2210
},
{
"epoch": 1.033375,
"grad_norm": 0.3312685787677765,
"learning_rate": 0.00018542163718082523,
"loss": 2.623911666870117,
"step": 2220
},
{
"epoch": 1.034625,
"grad_norm": 0.3332018256187439,
"learning_rate": 0.0001850994536017065,
"loss": 2.5997699737548827,
"step": 2230
},
{
"epoch": 1.035875,
"grad_norm": 0.32356560230255127,
"learning_rate": 0.00018477605012144564,
"loss": 2.59320182800293,
"step": 2240
},
{
"epoch": 1.037125,
"grad_norm": 0.30938515067100525,
"learning_rate": 0.00018445143198636093,
"loss": 2.5783287048339845,
"step": 2250
},
{
"epoch": 1.038375,
"grad_norm": 0.33119791746139526,
"learning_rate": 0.0001841256044624752,
"loss": 2.6023700714111326,
"step": 2260
},
{
"epoch": 1.039625,
"grad_norm": 0.32936912775039673,
"learning_rate": 0.00018379857283543015,
"loss": 2.595666694641113,
"step": 2270
},
{
"epoch": 1.040875,
"grad_norm": 0.34784626960754395,
"learning_rate": 0.00018347034241040066,
"loss": 2.6071990966796874,
"step": 2280
},
{
"epoch": 1.042125,
"grad_norm": 0.3317442238330841,
"learning_rate": 0.00018314091851200881,
"loss": 2.5899078369140627,
"step": 2290
},
{
"epoch": 1.043375,
"grad_norm": 0.3433104157447815,
"learning_rate": 0.0001828103064842375,
"loss": 2.6167388916015626,
"step": 2300
},
{
"epoch": 1.044625,
"grad_norm": 0.3177641034126282,
"learning_rate": 0.00018247851169034358,
"loss": 2.5915859222412108,
"step": 2310
},
{
"epoch": 1.045875,
"grad_norm": 0.33989644050598145,
"learning_rate": 0.00018214553951277114,
"loss": 2.5995319366455076,
"step": 2320
},
{
"epoch": 1.047125,
"grad_norm": 0.3309226930141449,
"learning_rate": 0.00018181139535306383,
"loss": 2.5778053283691404,
"step": 2330
},
{
"epoch": 1.048375,
"grad_norm": 0.33091750741004944,
"learning_rate": 0.00018147608463177768,
"loss": 2.6125743865966795,
"step": 2340
},
{
"epoch": 1.049625,
"grad_norm": 0.32603928446769714,
"learning_rate": 0.00018113961278839268,
"loss": 2.5618928909301757,
"step": 2350
},
{
"epoch": 1.050875,
"grad_norm": 0.3253335654735565,
"learning_rate": 0.00018080198528122495,
"loss": 2.592588424682617,
"step": 2360
},
{
"epoch": 1.052125,
"grad_norm": 0.3284412622451782,
"learning_rate": 0.000180463207587338,
"loss": 2.568330764770508,
"step": 2370
},
{
"epoch": 1.053375,
"grad_norm": 0.32107362151145935,
"learning_rate": 0.00018012328520245385,
"loss": 2.5809921264648437,
"step": 2380
},
{
"epoch": 1.054625,
"grad_norm": 0.3348993957042694,
"learning_rate": 0.000179782223640864,
"loss": 2.5713642120361326,
"step": 2390
},
{
"epoch": 1.055875,
"grad_norm": 0.3235042095184326,
"learning_rate": 0.00017944002843533986,
"loss": 2.608296203613281,
"step": 2400
},
{
"epoch": 1.057125,
"grad_norm": 0.33322450518608093,
"learning_rate": 0.00017909670513704306,
"loss": 2.587118911743164,
"step": 2410
},
{
"epoch": 1.058375,
"grad_norm": 0.32530325651168823,
"learning_rate": 0.00017875225931543543,
"loss": 2.5887866973876954,
"step": 2420
},
{
"epoch": 1.059625,
"grad_norm": 0.3360804319381714,
"learning_rate": 0.00017840669655818856,
"loss": 2.598593902587891,
"step": 2430
},
{
"epoch": 1.060875,
"grad_norm": 0.3203558921813965,
"learning_rate": 0.00017806002247109317,
"loss": 2.5644474029541016,
"step": 2440
},
{
"epoch": 1.062125,
"grad_norm": 0.34525611996650696,
"learning_rate": 0.00017771224267796828,
"loss": 2.5811479568481444,
"step": 2450
},
{
"epoch": 1.063375,
"grad_norm": 0.33284473419189453,
"learning_rate": 0.00017736336282056986,
"loss": 2.5817935943603514,
"step": 2460
},
{
"epoch": 1.064625,
"grad_norm": 0.34238749742507935,
"learning_rate": 0.00017701338855849938,
"loss": 2.570195770263672,
"step": 2470
},
{
"epoch": 1.065875,
"grad_norm": 0.330721378326416,
"learning_rate": 0.0001766623255691119,
"loss": 2.5676502227783202,
"step": 2480
},
{
"epoch": 1.067125,
"grad_norm": 0.33618465065956116,
"learning_rate": 0.00017631017954742415,
"loss": 2.581513595581055,
"step": 2490
},
{
"epoch": 1.068375,
"grad_norm": 0.3335385322570801,
"learning_rate": 0.00017595695620602192,
"loss": 2.6056888580322264,
"step": 2500
},
{
"epoch": 1.069625,
"grad_norm": 0.3303595781326294,
"learning_rate": 0.00017560266127496753,
"loss": 2.5539363861083983,
"step": 2510
},
{
"epoch": 1.070875,
"grad_norm": 0.32198089361190796,
"learning_rate": 0.00017524730050170697,
"loss": 2.569991683959961,
"step": 2520
},
{
"epoch": 1.072125,
"grad_norm": 0.3235105872154236,
"learning_rate": 0.0001748908796509764,
"loss": 2.5943014144897463,
"step": 2530
},
{
"epoch": 1.073375,
"grad_norm": 0.3448514938354492,
"learning_rate": 0.00017453340450470885,
"loss": 2.5967823028564454,
"step": 2540
},
{
"epoch": 1.074625,
"grad_norm": 0.32868677377700806,
"learning_rate": 0.00017417488086194028,
"loss": 2.5600149154663088,
"step": 2550
},
{
"epoch": 1.075875,
"grad_norm": 0.3214341104030609,
"learning_rate": 0.00017381531453871567,
"loss": 2.5800102233886717,
"step": 2560
},
{
"epoch": 1.077125,
"grad_norm": 0.33103859424591064,
"learning_rate": 0.00017345471136799454,
"loss": 2.568808364868164,
"step": 2570
},
{
"epoch": 1.078375,
"grad_norm": 0.31372782588005066,
"learning_rate": 0.00017309307719955632,
"loss": 2.554553413391113,
"step": 2580
},
{
"epoch": 1.079625,
"grad_norm": 0.3474419116973877,
"learning_rate": 0.00017273041789990558,
"loss": 2.540375900268555,
"step": 2590
},
{
"epoch": 1.080875,
"grad_norm": 0.3302421569824219,
"learning_rate": 0.0001723667393521767,
"loss": 2.5536571502685548,
"step": 2600
},
{
"epoch": 1.082125,
"grad_norm": 0.3372521996498108,
"learning_rate": 0.00017200204745603854,
"loss": 2.5786903381347654,
"step": 2610
},
{
"epoch": 1.083375,
"grad_norm": 0.34310850501060486,
"learning_rate": 0.00017163634812759882,
"loss": 2.56533203125,
"step": 2620
},
{
"epoch": 1.084625,
"grad_norm": 0.3463296890258789,
"learning_rate": 0.00017126964729930784,
"loss": 2.5742265701293947,
"step": 2630
},
{
"epoch": 1.085875,
"grad_norm": 0.3372081220149994,
"learning_rate": 0.00017090195091986254,
"loss": 2.5609130859375,
"step": 2640
},
{
"epoch": 1.087125,
"grad_norm": 0.33471760153770447,
"learning_rate": 0.00017053326495410998,
"loss": 2.570426177978516,
"step": 2650
},
{
"epoch": 1.088375,
"grad_norm": 0.3420524299144745,
"learning_rate": 0.0001701635953829503,
"loss": 2.5492122650146483,
"step": 2660
},
{
"epoch": 1.089625,
"grad_norm": 0.33050400018692017,
"learning_rate": 0.0001697929482032401,
"loss": 2.5594730377197266,
"step": 2670
},
{
"epoch": 1.090875,
"grad_norm": 0.33682385087013245,
"learning_rate": 0.00016942132942769476,
"loss": 2.560088348388672,
"step": 2680
},
{
"epoch": 1.092125,
"grad_norm": 0.34267619252204895,
"learning_rate": 0.00016904874508479127,
"loss": 2.5474054336547853,
"step": 2690
},
{
"epoch": 1.093375,
"grad_norm": 0.33607542514801025,
"learning_rate": 0.00016867520121867006,
"loss": 2.5770172119140624,
"step": 2700
},
{
"epoch": 1.094625,
"grad_norm": 0.3332061171531677,
"learning_rate": 0.0001683007038890373,
"loss": 2.5588443756103514,
"step": 2710
},
{
"epoch": 1.095875,
"grad_norm": 0.34043437242507935,
"learning_rate": 0.00016792525917106642,
"loss": 2.5765233993530274,
"step": 2720
},
{
"epoch": 1.097125,
"grad_norm": 0.3437064290046692,
"learning_rate": 0.00016754887315529948,
"loss": 2.598227691650391,
"step": 2730
},
{
"epoch": 1.098375,
"grad_norm": 0.3502216935157776,
"learning_rate": 0.0001671715519475486,
"loss": 2.5620880126953125,
"step": 2740
},
{
"epoch": 1.099625,
"grad_norm": 0.32694822549819946,
"learning_rate": 0.00016679330166879665,
"loss": 2.5393630981445314,
"step": 2750
},
{
"epoch": 1.100875,
"grad_norm": 0.3365384042263031,
"learning_rate": 0.00016641412845509818,
"loss": 2.5454193115234376,
"step": 2760
},
{
"epoch": 1.102125,
"grad_norm": 0.3421364426612854,
"learning_rate": 0.00016603403845747984,
"loss": 2.5687324523925783,
"step": 2770
},
{
"epoch": 1.103375,
"grad_norm": 0.32685622572898865,
"learning_rate": 0.0001656530378418403,
"loss": 2.564802551269531,
"step": 2780
},
{
"epoch": 1.104625,
"grad_norm": 0.32674023509025574,
"learning_rate": 0.0001652711327888507,
"loss": 2.5603107452392577,
"step": 2790
},
{
"epoch": 1.105875,
"grad_norm": 0.3370579481124878,
"learning_rate": 0.00016488832949385402,
"loss": 2.537816619873047,
"step": 2800
}
],
"logging_steps": 10,
"max_steps": 8000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.7508190343633306e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}