{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9998985080686086,
"eval_steps": 500,
"global_step": 2463,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0040596772556581754,
"grad_norm": 132.0478057861328,
"learning_rate": 1.3513513513513515e-10,
"loss": 15.9449,
"step": 10
},
{
"epoch": 0.008119354511316351,
"grad_norm": 185.23609924316406,
"learning_rate": 2.702702702702703e-10,
"loss": 17.0759,
"step": 20
},
{
"epoch": 0.012179031766974525,
"grad_norm": 138.67050170898438,
"learning_rate": 4.0540540540540546e-10,
"loss": 16.5406,
"step": 30
},
{
"epoch": 0.016238709022632702,
"grad_norm": 154.48605346679688,
"learning_rate": 5.405405405405406e-10,
"loss": 16.9727,
"step": 40
},
{
"epoch": 0.020298386278290875,
"grad_norm": 157.55892944335938,
"learning_rate": 6.756756756756757e-10,
"loss": 16.3976,
"step": 50
},
{
"epoch": 0.02435806353394905,
"grad_norm": 195.1601104736328,
"learning_rate": 8.108108108108109e-10,
"loss": 17.7916,
"step": 60
},
{
"epoch": 0.028417740789607227,
"grad_norm": 185.73776245117188,
"learning_rate": 9.45945945945946e-10,
"loss": 16.9119,
"step": 70
},
{
"epoch": 0.032477418045265403,
"grad_norm": 200.99549865722656,
"learning_rate": 9.9998443648451e-10,
"loss": 17.2071,
"step": 80
},
{
"epoch": 0.036537095300923576,
"grad_norm": 204.63539123535156,
"learning_rate": 9.99889329620792e-10,
"loss": 16.493,
"step": 90
},
{
"epoch": 0.04059677255658175,
"grad_norm": 176.94541931152344,
"learning_rate": 9.997077787173976e-10,
"loss": 17.9207,
"step": 100
},
{
"epoch": 0.04465644981223993,
"grad_norm": 208.50137329101562,
"learning_rate": 9.99439815169263e-10,
"loss": 16.9535,
"step": 110
},
{
"epoch": 0.0487161270678981,
"grad_norm": 169.84095764160156,
"learning_rate": 9.990854853143476e-10,
"loss": 16.6511,
"step": 120
},
{
"epoch": 0.052775804323556275,
"grad_norm": 176.42965698242188,
"learning_rate": 9.98644850425622e-10,
"loss": 17.7791,
"step": 130
},
{
"epoch": 0.056835481579214454,
"grad_norm": 171.8761749267578,
"learning_rate": 9.981179867004708e-10,
"loss": 17.931,
"step": 140
},
{
"epoch": 0.06089515883487263,
"grad_norm": 222.7410888671875,
"learning_rate": 9.97504985247518e-10,
"loss": 16.1024,
"step": 150
},
{
"epoch": 0.06495483609053081,
"grad_norm": 173.39248657226562,
"learning_rate": 9.968059520708706e-10,
"loss": 17.2411,
"step": 160
},
{
"epoch": 0.06901451334618898,
"grad_norm": 202.84156799316406,
"learning_rate": 9.960210080517876e-10,
"loss": 17.6544,
"step": 170
},
{
"epoch": 0.07307419060184715,
"grad_norm": 195.48196411132812,
"learning_rate": 9.951502889277773e-10,
"loss": 17.2764,
"step": 180
},
{
"epoch": 0.07713386785750533,
"grad_norm": 204.30767822265625,
"learning_rate": 9.941939452691238e-10,
"loss": 17.3761,
"step": 190
},
{
"epoch": 0.0811935451131635,
"grad_norm": 233.84881591796875,
"learning_rate": 9.931521424528503e-10,
"loss": 17.5323,
"step": 200
},
{
"epoch": 0.08525322236882169,
"grad_norm": 171.0516357421875,
"learning_rate": 9.920250606341204e-10,
"loss": 17.3739,
"step": 210
},
{
"epoch": 0.08931289962447986,
"grad_norm": 225.2151641845703,
"learning_rate": 9.908128947150849e-10,
"loss": 17.3732,
"step": 220
},
{
"epoch": 0.09337257688013803,
"grad_norm": 144.97401428222656,
"learning_rate": 9.895158543111775e-10,
"loss": 16.4779,
"step": 230
},
{
"epoch": 0.0974322541357962,
"grad_norm": 194.57334899902344,
"learning_rate": 9.881341637148678e-10,
"loss": 17.972,
"step": 240
},
{
"epoch": 0.10149193139145438,
"grad_norm": 182.3833770751953,
"learning_rate": 9.866680618568744e-10,
"loss": 17.1289,
"step": 250
},
{
"epoch": 0.10555160864711255,
"grad_norm": 139.1823272705078,
"learning_rate": 9.851178022648477e-10,
"loss": 16.7695,
"step": 260
},
{
"epoch": 0.10961128590277074,
"grad_norm": 147.51815795898438,
"learning_rate": 9.834836530195282e-10,
"loss": 16.6021,
"step": 270
},
{
"epoch": 0.11367096315842891,
"grad_norm": 141.7736358642578,
"learning_rate": 9.817658967083883e-10,
"loss": 17.0966,
"step": 280
},
{
"epoch": 0.11773064041408708,
"grad_norm": 188.4720001220703,
"learning_rate": 9.799648303767659e-10,
"loss": 16.9828,
"step": 290
},
{
"epoch": 0.12179031766974525,
"grad_norm": 148.27267456054688,
"learning_rate": 9.780807654764966e-10,
"loss": 17.4211,
"step": 300
},
{
"epoch": 0.12584999492540344,
"grad_norm": 149.38882446289062,
"learning_rate": 9.761140278120562e-10,
"loss": 16.9751,
"step": 310
},
{
"epoch": 0.12990967218106161,
"grad_norm": 159.62318420410156,
"learning_rate": 9.740649574842206e-10,
"loss": 16.3416,
"step": 320
},
{
"epoch": 0.1339693494367198,
"grad_norm": 158.93472290039062,
"learning_rate": 9.719339088312521e-10,
"loss": 16.6636,
"step": 330
},
{
"epoch": 0.13802902669237796,
"grad_norm": 197.2371826171875,
"learning_rate": 9.697212503676272e-10,
"loss": 18.0939,
"step": 340
},
{
"epoch": 0.14208870394803613,
"grad_norm": 149.5404510498047,
"learning_rate": 9.674273647203087e-10,
"loss": 16.6984,
"step": 350
},
{
"epoch": 0.1461483812036943,
"grad_norm": 159.680908203125,
"learning_rate": 9.650526485625804e-10,
"loss": 17.404,
"step": 360
},
{
"epoch": 0.15020805845935248,
"grad_norm": 171.5859832763672,
"learning_rate": 9.625975125454515e-10,
"loss": 16.8117,
"step": 370
},
{
"epoch": 0.15426773571501065,
"grad_norm": 199.5972137451172,
"learning_rate": 9.600623812266447e-10,
"loss": 17.56,
"step": 380
},
{
"epoch": 0.15832741297066882,
"grad_norm": 147.773681640625,
"learning_rate": 9.57447692997178e-10,
"loss": 17.4291,
"step": 390
},
{
"epoch": 0.162387090226327,
"grad_norm": 218.25433349609375,
"learning_rate": 9.54753900005557e-10,
"loss": 17.5885,
"step": 400
},
{
"epoch": 0.16644676748198517,
"grad_norm": 169.73826599121094,
"learning_rate": 9.519814680795842e-10,
"loss": 16.6519,
"step": 410
},
{
"epoch": 0.17050644473764337,
"grad_norm": 166.3510284423828,
"learning_rate": 9.491308766458076e-10,
"loss": 17.2467,
"step": 420
},
{
"epoch": 0.17456612199330154,
"grad_norm": 182.20436096191406,
"learning_rate": 9.462026186466134e-10,
"loss": 17.4754,
"step": 430
},
{
"epoch": 0.17862579924895972,
"grad_norm": 162.10064697265625,
"learning_rate": 9.431972004549834e-10,
"loss": 16.3912,
"step": 440
},
{
"epoch": 0.1826854765046179,
"grad_norm": 211.85739135742188,
"learning_rate": 9.40115141786931e-10,
"loss": 18.0005,
"step": 450
},
{
"epoch": 0.18674515376027606,
"grad_norm": 156.6442413330078,
"learning_rate": 9.369569756116282e-10,
"loss": 16.4153,
"step": 460
},
{
"epoch": 0.19080483101593423,
"grad_norm": 160.61705017089844,
"learning_rate": 9.337232480592392e-10,
"loss": 17.6727,
"step": 470
},
{
"epoch": 0.1948645082715924,
"grad_norm": 152.6673583984375,
"learning_rate": 9.304145183264834e-10,
"loss": 17.8167,
"step": 480
},
{
"epoch": 0.19892418552725058,
"grad_norm": 200.42538452148438,
"learning_rate": 9.270313585799328e-10,
"loss": 17.4904,
"step": 490
},
{
"epoch": 0.20298386278290875,
"grad_norm": 232.02088928222656,
"learning_rate": 9.235743538570709e-10,
"loss": 16.3814,
"step": 500
},
{
"epoch": 0.20704354003856693,
"grad_norm": 159.0342254638672,
"learning_rate": 9.200441019651237e-10,
"loss": 16.7111,
"step": 510
},
{
"epoch": 0.2111032172942251,
"grad_norm": 189.23023986816406,
"learning_rate": 9.164412133776831e-10,
"loss": 17.5323,
"step": 520
},
{
"epoch": 0.21516289454988327,
"grad_norm": 126.5080337524414,
"learning_rate": 9.127663111291399e-10,
"loss": 17.2915,
"step": 530
},
{
"epoch": 0.21922257180554147,
"grad_norm": 206.55093383789062,
"learning_rate": 9.09020030706945e-10,
"loss": 17.1491,
"step": 540
},
{
"epoch": 0.22328224906119964,
"grad_norm": 219.1647491455078,
"learning_rate": 9.052030199417168e-10,
"loss": 17.3283,
"step": 550
},
{
"epoch": 0.22734192631685782,
"grad_norm": 178.6308135986328,
"learning_rate": 9.013159388952136e-10,
"loss": 16.8583,
"step": 560
},
{
"epoch": 0.231401603572516,
"grad_norm": 173.4936065673828,
"learning_rate": 8.973594597461927e-10,
"loss": 17.5231,
"step": 570
},
{
"epoch": 0.23546128082817416,
"grad_norm": 162.73269653320312,
"learning_rate": 8.933342666741717e-10,
"loss": 17.1647,
"step": 580
},
{
"epoch": 0.23952095808383234,
"grad_norm": 200.88888549804688,
"learning_rate": 8.892410557411171e-10,
"loss": 17.3196,
"step": 590
},
{
"epoch": 0.2435806353394905,
"grad_norm": 187.9020233154297,
"learning_rate": 8.850805347710753e-10,
"loss": 17.6811,
"step": 600
},
{
"epoch": 0.24764031259514868,
"grad_norm": 201.80628967285156,
"learning_rate": 8.80853423227773e-10,
"loss": 18.0601,
"step": 610
},
{
"epoch": 0.2516999898508069,
"grad_norm": 130.6888427734375,
"learning_rate": 8.765604520902013e-10,
"loss": 15.8318,
"step": 620
},
{
"epoch": 0.255759667106465,
"grad_norm": 174.96263122558594,
"learning_rate": 8.722023637262114e-10,
"loss": 17.533,
"step": 630
},
{
"epoch": 0.25981934436212323,
"grad_norm": 191.3065643310547,
"learning_rate": 8.677799117641387e-10,
"loss": 17.1311,
"step": 640
},
{
"epoch": 0.2638790216177814,
"grad_norm": 202.28585815429688,
"learning_rate": 8.632938609624813e-10,
"loss": 17.2724,
"step": 650
},
{
"epoch": 0.2679386988734396,
"grad_norm": 222.07972717285156,
"learning_rate": 8.587449870776526e-10,
"loss": 17.2216,
"step": 660
},
{
"epoch": 0.2719983761290977,
"grad_norm": 214.5223846435547,
"learning_rate": 8.541340767298328e-10,
"loss": 17.3321,
"step": 670
},
{
"epoch": 0.2760580533847559,
"grad_norm": 168.41421508789062,
"learning_rate": 8.494619272669418e-10,
"loss": 17.529,
"step": 680
},
{
"epoch": 0.28011773064041406,
"grad_norm": 182.80117797851562,
"learning_rate": 8.447293466267558e-10,
"loss": 18.1657,
"step": 690
},
{
"epoch": 0.28417740789607226,
"grad_norm": 190.1322784423828,
"learning_rate": 8.399371531971954e-10,
"loss": 18.3519,
"step": 700
},
{
"epoch": 0.2882370851517304,
"grad_norm": 195.91421508789062,
"learning_rate": 8.350861756748022e-10,
"loss": 17.8645,
"step": 710
},
{
"epoch": 0.2922967624073886,
"grad_norm": 133.7427215576172,
"learning_rate": 8.301772529214376e-10,
"loss": 17.2449,
"step": 720
},
{
"epoch": 0.2963564396630468,
"grad_norm": 197.28350830078125,
"learning_rate": 8.252112338192204e-10,
"loss": 17.3724,
"step": 730
},
{
"epoch": 0.30041611691870496,
"grad_norm": 185.95706176757812,
"learning_rate": 8.201889771237327e-10,
"loss": 16.9303,
"step": 740
},
{
"epoch": 0.30447579417436316,
"grad_norm": 167.47555541992188,
"learning_rate": 8.151113513155189e-10,
"loss": 17.2537,
"step": 750
},
{
"epoch": 0.3085354714300213,
"grad_norm": 232.46424865722656,
"learning_rate": 8.099792344499018e-10,
"loss": 17.4633,
"step": 760
},
{
"epoch": 0.3125951486856795,
"grad_norm": 188.18106079101562,
"learning_rate": 8.047935140051446e-10,
"loss": 17.2019,
"step": 770
},
{
"epoch": 0.31665482594133765,
"grad_norm": 177.53736877441406,
"learning_rate": 7.995550867289819e-10,
"loss": 16.7029,
"step": 780
},
{
"epoch": 0.32071450319699585,
"grad_norm": 184.06761169433594,
"learning_rate": 7.942648584835484e-10,
"loss": 18.0381,
"step": 790
},
{
"epoch": 0.324774180452654,
"grad_norm": 173.73468017578125,
"learning_rate": 7.889237440887321e-10,
"loss": 18.0302,
"step": 800
},
{
"epoch": 0.3288338577083122,
"grad_norm": 217.86709594726562,
"learning_rate": 7.835326671639764e-10,
"loss": 18.0424,
"step": 810
},
{
"epoch": 0.33289353496397034,
"grad_norm": 202.07118225097656,
"learning_rate": 7.780925599685638e-10,
"loss": 16.8956,
"step": 820
},
{
"epoch": 0.33695321221962854,
"grad_norm": 191.79275512695312,
"learning_rate": 7.726043632404022e-10,
"loss": 17.3942,
"step": 830
},
{
"epoch": 0.34101288947528674,
"grad_norm": 161.33790588378906,
"learning_rate": 7.670690260333475e-10,
"loss": 17.1583,
"step": 840
},
{
"epoch": 0.3450725667309449,
"grad_norm": 181.90426635742188,
"learning_rate": 7.614875055530866e-10,
"loss": 17.1477,
"step": 850
},
{
"epoch": 0.3491322439866031,
"grad_norm": 213.49960327148438,
"learning_rate": 7.558607669916116e-10,
"loss": 17.6481,
"step": 860
},
{
"epoch": 0.35319192124226123,
"grad_norm": 202.2276153564453,
"learning_rate": 7.501897833603124e-10,
"loss": 16.7866,
"step": 870
},
{
"epoch": 0.35725159849791943,
"grad_norm": 177.2437286376953,
"learning_rate": 7.444755353217177e-10,
"loss": 17.1007,
"step": 880
},
{
"epoch": 0.3613112757535776,
"grad_norm": 161.7916717529297,
"learning_rate": 7.387190110199122e-10,
"loss": 16.8443,
"step": 890
},
{
"epoch": 0.3653709530092358,
"grad_norm": 160.1624298095703,
"learning_rate": 7.32921205909661e-10,
"loss": 17.1523,
"step": 900
},
{
"epoch": 0.3694306302648939,
"grad_norm": 200.31753540039062,
"learning_rate": 7.270831225842692e-10,
"loss": 17.6586,
"step": 910
},
{
"epoch": 0.3734903075205521,
"grad_norm": 158.079833984375,
"learning_rate": 7.212057706022059e-10,
"loss": 17.1793,
"step": 920
},
{
"epoch": 0.37754998477621027,
"grad_norm": 224.93112182617188,
"learning_rate": 7.152901663125267e-10,
"loss": 18.1676,
"step": 930
},
{
"epoch": 0.38160966203186847,
"grad_norm": 199.39297485351562,
"learning_rate": 7.09337332679119e-10,
"loss": 15.8113,
"step": 940
},
{
"epoch": 0.38566933928752667,
"grad_norm": 202.5852508544922,
"learning_rate": 7.033482991038051e-10,
"loss": 17.3973,
"step": 950
},
{
"epoch": 0.3897290165431848,
"grad_norm": 206.29861450195312,
"learning_rate": 6.97324101248331e-10,
"loss": 16.953,
"step": 960
},
{
"epoch": 0.393788693798843,
"grad_norm": 140.68646240234375,
"learning_rate": 6.91265780855274e-10,
"loss": 17.5197,
"step": 970
},
{
"epoch": 0.39784837105450116,
"grad_norm": 191.14852905273438,
"learning_rate": 6.851743855678965e-10,
"loss": 17.6989,
"step": 980
},
{
"epoch": 0.40190804831015936,
"grad_norm": 152.35377502441406,
"learning_rate": 6.79050968748983e-10,
"loss": 17.5127,
"step": 990
},
{
"epoch": 0.4059677255658175,
"grad_norm": 181.50877380371094,
"learning_rate": 6.728965892986838e-10,
"loss": 16.8963,
"step": 1000
},
{
"epoch": 0.4100274028214757,
"grad_norm": 192.32125854492188,
"learning_rate": 6.667123114714048e-10,
"loss": 17.2991,
"step": 1010
},
{
"epoch": 0.41408708007713385,
"grad_norm": 202.2693634033203,
"learning_rate": 6.604992046917688e-10,
"loss": 16.8996,
"step": 1020
},
{
"epoch": 0.41814675733279205,
"grad_norm": 151.45115661621094,
"learning_rate": 6.542583433696846e-10,
"loss": 16.8886,
"step": 1030
},
{
"epoch": 0.4222064345884502,
"grad_norm": 157.8872528076172,
"learning_rate": 6.479908067145527e-10,
"loss": 17.0116,
"step": 1040
},
{
"epoch": 0.4262661118441084,
"grad_norm": 228.60235595703125,
"learning_rate": 6.416976785486416e-10,
"loss": 17.6079,
"step": 1050
},
{
"epoch": 0.43032578909976654,
"grad_norm": 219.45249938964844,
"learning_rate": 6.353800471196667e-10,
"loss": 16.9453,
"step": 1060
},
{
"epoch": 0.43438546635542474,
"grad_norm": 164.8721923828125,
"learning_rate": 6.290390049126031e-10,
"loss": 17.2325,
"step": 1070
},
{
"epoch": 0.43844514361108294,
"grad_norm": 184.8201904296875,
"learning_rate": 6.226756484607668e-10,
"loss": 17.1532,
"step": 1080
},
{
"epoch": 0.4425048208667411,
"grad_norm": 187.04025268554688,
"learning_rate": 6.162910781561946e-10,
"loss": 16.4238,
"step": 1090
},
{
"epoch": 0.4465644981223993,
"grad_norm": 200.2959747314453,
"learning_rate": 6.098863980593574e-10,
"loss": 18.0924,
"step": 1100
},
{
"epoch": 0.45062417537805743,
"grad_norm": 214.22193908691406,
"learning_rate": 6.034627157082394e-10,
"loss": 17.5339,
"step": 1110
},
{
"epoch": 0.45468385263371563,
"grad_norm": 219.8036651611328,
"learning_rate": 5.970211419268152e-10,
"loss": 17.7163,
"step": 1120
},
{
"epoch": 0.4587435298893738,
"grad_norm": 177.9528045654297,
"learning_rate": 5.905627906329592e-10,
"loss": 17.277,
"step": 1130
},
{
"epoch": 0.462803207145032,
"grad_norm": 181.62625122070312,
"learning_rate": 5.840887786458205e-10,
"loss": 17.0171,
"step": 1140
},
{
"epoch": 0.4668628844006901,
"grad_norm": 212.0501251220703,
"learning_rate": 5.776002254926935e-10,
"loss": 17.2654,
"step": 1150
},
{
"epoch": 0.4709225616563483,
"grad_norm": 185.97579956054688,
"learning_rate": 5.710982532154247e-10,
"loss": 17.6895,
"step": 1160
},
{
"epoch": 0.47498223891200647,
"grad_norm": 232.2166748046875,
"learning_rate": 5.645839861763805e-10,
"loss": 18.0333,
"step": 1170
},
{
"epoch": 0.47904191616766467,
"grad_norm": 176.52072143554688,
"learning_rate": 5.580585508640152e-10,
"loss": 16.8448,
"step": 1180
},
{
"epoch": 0.4831015934233229,
"grad_norm": 189.46929931640625,
"learning_rate": 5.515230756980719e-10,
"loss": 17.2395,
"step": 1190
},
{
"epoch": 0.487161270678981,
"grad_norm": 206.33079528808594,
"learning_rate": 5.449786908344499e-10,
"loss": 16.9241,
"step": 1200
},
{
"epoch": 0.4912209479346392,
"grad_norm": 186.9293670654297,
"learning_rate": 5.384265279697689e-10,
"loss": 16.7443,
"step": 1210
},
{
"epoch": 0.49528062519029736,
"grad_norm": 170.4814453125,
"learning_rate": 5.318677201456708e-10,
"loss": 16.6439,
"step": 1220
},
{
"epoch": 0.49934030244595556,
"grad_norm": 181.9535675048828,
"learning_rate": 5.253034015528856e-10,
"loss": 16.3063,
"step": 1230
},
{
"epoch": 0.5033999797016138,
"grad_norm": 181.1636505126953,
"learning_rate": 5.187347073351006e-10,
"loss": 17.3231,
"step": 1240
},
{
"epoch": 0.5074596569572719,
"grad_norm": 186.47972106933594,
"learning_rate": 5.121627733926641e-10,
"loss": 17.0968,
"step": 1250
},
{
"epoch": 0.51151933421293,
"grad_norm": 194.881591796875,
"learning_rate": 5.055887361861582e-10,
"loss": 18.201,
"step": 1260
},
{
"epoch": 0.5155790114685882,
"grad_norm": 199.1874237060547,
"learning_rate": 4.990137325398745e-10,
"loss": 16.7817,
"step": 1270
},
{
"epoch": 0.5196386887242465,
"grad_norm": 203.87411499023438,
"learning_rate": 4.924388994452276e-10,
"loss": 17.371,
"step": 1280
},
{
"epoch": 0.5236983659799046,
"grad_norm": 177.25927734375,
"learning_rate": 4.858653738641395e-10,
"loss": 16.6596,
"step": 1290
},
{
"epoch": 0.5277580432355627,
"grad_norm": 161.23329162597656,
"learning_rate": 4.792942925324285e-10,
"loss": 17.0887,
"step": 1300
},
{
"epoch": 0.531817720491221,
"grad_norm": 188.26792907714844,
"learning_rate": 4.727267917632377e-10,
"loss": 17.4645,
"step": 1310
},
{
"epoch": 0.5358773977468791,
"grad_norm": 204.50733947753906,
"learning_rate": 4.661640072505365e-10,
"loss": 17.5325,
"step": 1320
},
{
"epoch": 0.5399370750025373,
"grad_norm": 180.15682983398438,
"learning_rate": 4.5960707387272904e-10,
"loss": 17.7173,
"step": 1330
},
{
"epoch": 0.5439967522581954,
"grad_norm": 195.1600341796875,
"learning_rate": 4.5305712549640504e-10,
"loss": 16.8578,
"step": 1340
},
{
"epoch": 0.5480564295138537,
"grad_norm": 201.564208984375,
"learning_rate": 4.4651529478026227e-10,
"loss": 17.7686,
"step": 1350
},
{
"epoch": 0.5521161067695118,
"grad_norm": 207.5132293701172,
"learning_rate": 4.3998271297924156e-10,
"loss": 16.9821,
"step": 1360
},
{
"epoch": 0.55617578402517,
"grad_norm": 243.6310272216797,
"learning_rate": 4.3346050974890247e-10,
"loss": 17.9338,
"step": 1370
},
{
"epoch": 0.5602354612808281,
"grad_norm": 169.40707397460938,
"learning_rate": 4.269498129500762e-10,
"loss": 16.6915,
"step": 1380
},
{
"epoch": 0.5642951385364864,
"grad_norm": 209.2589569091797,
"learning_rate": 4.2045174845382885e-10,
"loss": 17.3758,
"step": 1390
},
{
"epoch": 0.5683548157921445,
"grad_norm": 171.83935546875,
"learning_rate": 4.139674399467684e-10,
"loss": 16.4755,
"step": 1400
},
{
"epoch": 0.5724144930478027,
"grad_norm": 206.6162109375,
"learning_rate": 4.074980087367294e-10,
"loss": 17.9797,
"step": 1410
},
{
"epoch": 0.5764741703034608,
"grad_norm": 173.0574951171875,
"learning_rate": 4.010445735588702e-10,
"loss": 16.503,
"step": 1420
},
{
"epoch": 0.5805338475591191,
"grad_norm": 206.66969299316406,
"learning_rate": 3.946082503822132e-10,
"loss": 17.5007,
"step": 1430
},
{
"epoch": 0.5845935248147772,
"grad_norm": 225.87709045410156,
"learning_rate": 3.881901522166649e-10,
"loss": 17.5912,
"step": 1440
},
{
"epoch": 0.5886532020704354,
"grad_norm": 180.4112091064453,
"learning_rate": 3.817913889205473e-10,
"loss": 17.6061,
"step": 1450
},
{
"epoch": 0.5927128793260936,
"grad_norm": 128.54103088378906,
"learning_rate": 3.7541306700867386e-10,
"loss": 16.0483,
"step": 1460
},
{
"epoch": 0.5967725565817518,
"grad_norm": 181.9798126220703,
"learning_rate": 3.6905628946100346e-10,
"loss": 16.802,
"step": 1470
},
{
"epoch": 0.6008322338374099,
"grad_norm": 149.59954833984375,
"learning_rate": 3.6272215553190727e-10,
"loss": 16.2398,
"step": 1480
},
{
"epoch": 0.6048919110930681,
"grad_norm": 170.98695373535156,
"learning_rate": 3.564117605600774e-10,
"loss": 16.2826,
"step": 1490
},
{
"epoch": 0.6089515883487263,
"grad_norm": 170.1161651611328,
"learning_rate": 3.5012619577911544e-10,
"loss": 17.1219,
"step": 1500
},
{
"epoch": 0.6130112656043845,
"grad_norm": 197.9274139404297,
"learning_rate": 3.438665481288278e-10,
"loss": 16.7303,
"step": 1510
},
{
"epoch": 0.6170709428600426,
"grad_norm": 187.0927276611328,
"learning_rate": 3.376339000672664e-10,
"loss": 17.0052,
"step": 1520
},
{
"epoch": 0.6211306201157007,
"grad_norm": 161.8428497314453,
"learning_rate": 3.3142932938354233e-10,
"loss": 16.2225,
"step": 1530
},
{
"epoch": 0.625190297371359,
"grad_norm": 198.08689880371094,
"learning_rate": 3.252539090114484e-10,
"loss": 17.4928,
"step": 1540
},
{
"epoch": 0.6292499746270172,
"grad_norm": 165.53260803222656,
"learning_rate": 3.1910870684392023e-10,
"loss": 17.0441,
"step": 1550
},
{
"epoch": 0.6333096518826753,
"grad_norm": 153.26893615722656,
"learning_rate": 3.1299478554836934e-10,
"loss": 16.6345,
"step": 1560
},
{
"epoch": 0.6373693291383336,
"grad_norm": 166.07655334472656,
"learning_rate": 3.069132023829202e-10,
"loss": 16.7557,
"step": 1570
},
{
"epoch": 0.6414290063939917,
"grad_norm": 206.7568817138672,
"learning_rate": 3.0086500901358233e-10,
"loss": 17.2537,
"step": 1580
},
{
"epoch": 0.6454886836496498,
"grad_norm": 187.02734375,
"learning_rate": 2.94851251332389e-10,
"loss": 16.7615,
"step": 1590
},
{
"epoch": 0.649548360905308,
"grad_norm": 183.49896240234375,
"learning_rate": 2.888729692765365e-10,
"loss": 17.6427,
"step": 1600
},
{
"epoch": 0.6536080381609662,
"grad_norm": 201.25961303710938,
"learning_rate": 2.8293119664854974e-10,
"loss": 16.8277,
"step": 1610
},
{
"epoch": 0.6576677154166244,
"grad_norm": 156.2751922607422,
"learning_rate": 2.770269609375114e-10,
"loss": 17.5363,
"step": 1620
},
{
"epoch": 0.6617273926722825,
"grad_norm": 205.1450958251953,
"learning_rate": 2.71161283141382e-10,
"loss": 18.4642,
"step": 1630
},
{
"epoch": 0.6657870699279407,
"grad_norm": 157.36001586914062,
"learning_rate": 2.653351775904427e-10,
"loss": 17.0324,
"step": 1640
},
{
"epoch": 0.6698467471835989,
"grad_norm": 197.63540649414062,
"learning_rate": 2.5954965177189e-10,
"loss": 17.0267,
"step": 1650
},
{
"epoch": 0.6739064244392571,
"grad_norm": 163.759033203125,
"learning_rate": 2.5380570615561564e-10,
"loss": 17.2452,
"step": 1660
},
{
"epoch": 0.6779661016949152,
"grad_norm": 161.09716796875,
"learning_rate": 2.481043340211986e-10,
"loss": 17.429,
"step": 1670
},
{
"epoch": 0.6820257789505735,
"grad_norm": 208.68508911132812,
"learning_rate": 2.4244652128614036e-10,
"loss": 17.7347,
"step": 1680
},
{
"epoch": 0.6860854562062316,
"grad_norm": 154.68821716308594,
"learning_rate": 2.3683324633537435e-10,
"loss": 16.7167,
"step": 1690
},
{
"epoch": 0.6901451334618898,
"grad_norm": 186.65939331054688,
"learning_rate": 2.3126547985207759e-10,
"loss": 17.0754,
"step": 1700
},
{
"epoch": 0.6942048107175479,
"grad_norm": 155.3890838623047,
"learning_rate": 2.2574418464981368e-10,
"loss": 17.0158,
"step": 1710
},
{
"epoch": 0.6982644879732062,
"grad_norm": 175.33834838867188,
"learning_rate": 2.2027031550603654e-10,
"loss": 17.5807,
"step": 1720
},
{
"epoch": 0.7023241652288643,
"grad_norm": 200.68499755859375,
"learning_rate": 2.148448189969854e-10,
"loss": 15.5709,
"step": 1730
},
{
"epoch": 0.7063838424845225,
"grad_norm": 186.82205200195312,
"learning_rate": 2.094686333339953e-10,
"loss": 16.648,
"step": 1740
},
{
"epoch": 0.7104435197401806,
"grad_norm": 187.7284698486328,
"learning_rate": 2.0414268820125654e-10,
"loss": 17.0848,
"step": 1750
},
{
"epoch": 0.7145031969958389,
"grad_norm": 170.2503662109375,
"learning_rate": 1.9886790459504857e-10,
"loss": 16.8571,
"step": 1760
},
{
"epoch": 0.718562874251497,
"grad_norm": 176.3491668701172,
"learning_rate": 1.9364519466447346e-10,
"loss": 16.7827,
"step": 1770
},
{
"epoch": 0.7226225515071552,
"grad_norm": 167.1256866455078,
"learning_rate": 1.8847546155372252e-10,
"loss": 16.8153,
"step": 1780
},
{
"epoch": 0.7266822287628134,
"grad_norm": 187.24716186523438,
"learning_rate": 1.8335959924589935e-10,
"loss": 17.8325,
"step": 1790
},
{
"epoch": 0.7307419060184716,
"grad_norm": 216.55247497558594,
"learning_rate": 1.7829849240842516e-10,
"loss": 17.5121,
"step": 1800
},
{
"epoch": 0.7348015832741297,
"grad_norm": 200.8616180419922,
"learning_rate": 1.732930162400579e-10,
"loss": 16.8064,
"step": 1810
},
{
"epoch": 0.7388612605297878,
"grad_norm": 183.3948516845703,
"learning_rate": 1.6834403631954642e-10,
"loss": 17.0833,
"step": 1820
},
{
"epoch": 0.7429209377854461,
"grad_norm": 166.15834045410156,
"learning_rate": 1.6345240845594933e-10,
"loss": 17.7809,
"step": 1830
},
{
"epoch": 0.7469806150411042,
"grad_norm": 165.8581085205078,
"learning_rate": 1.586189785406429e-10,
"loss": 17.0209,
"step": 1840
},
{
"epoch": 0.7510402922967624,
"grad_norm": 212.00918579101562,
"learning_rate": 1.5384458240104482e-10,
"loss": 17.0343,
"step": 1850
},
{
"epoch": 0.7550999695524205,
"grad_norm": 185.42967224121094,
"learning_rate": 1.4913004565607665e-10,
"loss": 16.6158,
"step": 1860
},
{
"epoch": 0.7591596468080788,
"grad_norm": 195.2454071044922,
"learning_rate": 1.4447618357339333e-10,
"loss": 16.4979,
"step": 1870
},
{
"epoch": 0.7632193240637369,
"grad_norm": 214.2625274658203,
"learning_rate": 1.398838009284016e-10,
"loss": 16.691,
"step": 1880
},
{
"epoch": 0.7672790013193951,
"grad_norm": 181.38255310058594,
"learning_rate": 1.3535369186509296e-10,
"loss": 16.9062,
"step": 1890
},
{
"epoch": 0.7713386785750533,
"grad_norm": 145.416748046875,
"learning_rate": 1.308866397587153e-10,
"loss": 17.7773,
"step": 1900
},
{
"epoch": 0.7753983558307115,
"grad_norm": 216.9072723388672,
"learning_rate": 1.264834170803072e-10,
"loss": 16.9568,
"step": 1910
},
{
"epoch": 0.7794580330863696,
"grad_norm": 200.9578094482422,
"learning_rate": 1.2214478526311674e-10,
"loss": 17.5622,
"step": 1920
},
{
"epoch": 0.7835177103420278,
"grad_norm": 162.93185424804688,
"learning_rate": 1.1787149457092962e-10,
"loss": 16.9736,
"step": 1930
},
{
"epoch": 0.787577387597686,
"grad_norm": 189.59182739257812,
"learning_rate": 1.1366428396832929e-10,
"loss": 15.8744,
"step": 1940
},
{
"epoch": 0.7916370648533442,
"grad_norm": 188.7930450439453,
"learning_rate": 1.0952388099290983e-10,
"loss": 17.6766,
"step": 1950
},
{
"epoch": 0.7956967421090023,
"grad_norm": 134.07015991210938,
"learning_rate": 1.0545100162946586e-10,
"loss": 16.6428,
"step": 1960
},
{
"epoch": 0.7997564193646605,
"grad_norm": 142.8442840576172,
"learning_rate": 1.0144635018618054e-10,
"loss": 17.4065,
"step": 1970
},
{
"epoch": 0.8038160966203187,
"grad_norm": 182.24485778808594,
"learning_rate": 9.751061917283073e-11,
"loss": 17.2971,
"step": 1980
},
{
"epoch": 0.8078757738759769,
"grad_norm": 194.0862274169922,
"learning_rate": 9.364448918103474e-11,
"loss": 17.2544,
"step": 1990
},
{
"epoch": 0.811935451131635,
"grad_norm": 191.11993408203125,
"learning_rate": 8.984862876656026e-11,
"loss": 17.1763,
"step": 2000
},
{
"epoch": 0.8159951283872932,
"grad_norm": 188.06570434570312,
"learning_rate": 8.612369433371265e-11,
"loss": 16.6179,
"step": 2010
},
{
"epoch": 0.8200548056429514,
"grad_norm": 167.46762084960938,
"learning_rate": 8.247033002182614e-11,
"loss": 16.6814,
"step": 2020
},
{
"epoch": 0.8241144828986096,
"grad_norm": 159.38058471679688,
"learning_rate": 7.888916759387471e-11,
"loss": 16.5084,
"step": 2030
},
{
"epoch": 0.8281741601542677,
"grad_norm": 172.08058166503906,
"learning_rate": 7.538082632722371e-11,
"loss": 17.3695,
"step": 2040
},
{
"epoch": 0.832233837409926,
"grad_norm": 222.47935485839844,
"learning_rate": 7.194591290654024e-11,
"loss": 16.9923,
"step": 2050
},
{
"epoch": 0.8362935146655841,
"grad_norm": 189.6218719482422,
"learning_rate": 6.858502131888211e-11,
"loss": 17.5893,
"step": 2060
},
{
"epoch": 0.8403531919212422,
"grad_norm": 244.06753540039062,
"learning_rate": 6.52987327509812e-11,
"loss": 17.5454,
"step": 2070
},
{
"epoch": 0.8444128691769004,
"grad_norm": 137.7332305908203,
"learning_rate": 6.208761548874082e-11,
"loss": 17.2953,
"step": 2080
},
{
"epoch": 0.8484725464325586,
"grad_norm": 201.9289093017578,
"learning_rate": 5.895222481896489e-11,
"loss": 17.7196,
"step": 2090
},
{
"epoch": 0.8525322236882168,
"grad_norm": 172.56558227539062,
"learning_rate": 5.5893102933333277e-11,
"loss": 17.0008,
"step": 2100
},
{
"epoch": 0.8565919009438749,
"grad_norm": 173.23507690429688,
"learning_rate": 5.291077883464307e-11,
"loss": 16.3006,
"step": 2110
},
{
"epoch": 0.8606515781995331,
"grad_norm": 200.89015197753906,
"learning_rate": 5.0005768245330264e-11,
"loss": 17.5656,
"step": 2120
},
{
"epoch": 0.8647112554551913,
"grad_norm": 191.20590209960938,
"learning_rate": 4.717857351828731e-11,
"loss": 17.3456,
"step": 2130
},
{
"epoch": 0.8687709327108495,
"grad_norm": 166.0770263671875,
"learning_rate": 4.4429683549993106e-11,
"loss": 17.2893,
"step": 2140
},
{
"epoch": 0.8728306099665076,
"grad_norm": 172.1090850830078,
"learning_rate": 4.175957369597039e-11,
"loss": 16.464,
"step": 2150
},
{
"epoch": 0.8768902872221659,
"grad_norm": 170.6510009765625,
"learning_rate": 3.9168705688583555e-11,
"loss": 17.662,
"step": 2160
},
{
"epoch": 0.880949964477824,
"grad_norm": 201.04290771484375,
"learning_rate": 3.665752755719332e-11,
"loss": 17.4915,
"step": 2170
},
{
"epoch": 0.8850096417334822,
"grad_norm": 194.7832794189453,
"learning_rate": 3.422647355068076e-11,
"loss": 18.2301,
"step": 2180
},
{
"epoch": 0.8890693189891403,
"grad_norm": 181.28720092773438,
"learning_rate": 3.187596406235421e-11,
"loss": 17.7734,
"step": 2190
},
{
"epoch": 0.8931289962447986,
"grad_norm": 152.69996643066406,
"learning_rate": 2.9606405557251637e-11,
"loss": 16.8411,
"step": 2200
},
{
"epoch": 0.8971886735004567,
"grad_norm": 212.84933471679688,
"learning_rate": 2.7418190501853014e-11,
"loss": 17.3207,
"step": 2210
},
{
"epoch": 0.9012483507561149,
"grad_norm": 144.8594207763672,
"learning_rate": 2.5311697296211634e-11,
"loss": 16.9442,
"step": 2220
},
{
"epoch": 0.905308028011773,
"grad_norm": 232.24757385253906,
"learning_rate": 2.328729020851961e-11,
"loss": 18.1509,
"step": 2230
},
{
"epoch": 0.9093677052674313,
"grad_norm": 180.50587463378906,
"learning_rate": 2.134531931211542e-11,
"loss": 16.2897,
"step": 2240
},
{
"epoch": 0.9134273825230894,
"grad_norm": 176.4561004638672,
"learning_rate": 1.9486120424947908e-11,
"loss": 17.3459,
"step": 2250
},
{
"epoch": 0.9174870597787476,
"grad_norm": 177.8277130126953,
"learning_rate": 1.771001505150366e-11,
"loss": 16.3936,
"step": 2260
},
{
"epoch": 0.9215467370344058,
"grad_norm": 189.9925994873047,
"learning_rate": 1.6017310327211155e-11,
"loss": 17.3137,
"step": 2270
},
{
"epoch": 0.925606414290064,
"grad_norm": 212.03208923339844,
"learning_rate": 1.4408298965328472e-11,
"loss": 17.907,
"step": 2280
},
{
"epoch": 0.9296660915457221,
"grad_norm": 157.50892639160156,
"learning_rate": 1.2883259206325493e-11,
"loss": 16.9568,
"step": 2290
},
{
"epoch": 0.9337257688013803,
"grad_norm": 184.62356567382812,
"learning_rate": 1.1442454769769017e-11,
"loss": 18.1454,
"step": 2300
},
{
"epoch": 0.9377854460570385,
"grad_norm": 206.690185546875,
"learning_rate": 1.0086134808718562e-11,
"loss": 18.086,
"step": 2310
},
{
"epoch": 0.9418451233126967,
"grad_norm": 171.5003662109375,
"learning_rate": 8.814533866641106e-12,
"loss": 18.0924,
"step": 2320
},
{
"epoch": 0.9459048005683548,
"grad_norm": 154.81707763671875,
"learning_rate": 7.627871836852652e-12,
"loss": 16.5896,
"step": 2330
},
{
"epoch": 0.9499644778240129,
"grad_norm": 193.352783203125,
"learning_rate": 6.52635392449269e-12,
"loss": 18.0086,
"step": 2340
},
{
"epoch": 0.9540241550796712,
"grad_norm": 182.68508911132812,
"learning_rate": 5.510170611038701e-12,
"loss": 17.6251,
"step": 2350
},
{
"epoch": 0.9580838323353293,
"grad_norm": 220.1875762939453,
"learning_rate": 4.579497621367057e-12,
"loss": 18.5577,
"step": 2360
},
{
"epoch": 0.9621435095909875,
"grad_norm": 193.38424682617188,
"learning_rate": 3.734495893365664e-12,
"loss": 18.0829,
"step": 2370
},
{
"epoch": 0.9662031868466457,
"grad_norm": 162.79934692382812,
"learning_rate": 2.9753115501032213e-12,
"loss": 17.6267,
"step": 2380
},
{
"epoch": 0.9702628641023039,
"grad_norm": 185.84449768066406,
"learning_rate": 2.3020758745610493e-12,
"loss": 17.418,
"step": 2390
},
{
"epoch": 0.974322541357962,
"grad_norm": 176.00831604003906,
"learning_rate": 1.7149052869305794e-12,
"loss": 17.5759,
"step": 2400
},
{
"epoch": 0.9783822186136202,
"grad_norm": 163.9122772216797,
"learning_rate": 1.2139013244812924e-12,
"loss": 17.8926,
"step": 2410
},
{
"epoch": 0.9824418958692784,
"grad_norm": 172.8951416015625,
"learning_rate": 7.991506240022095e-13,
"loss": 17.5553,
"step": 2420
},
{
"epoch": 0.9865015731249366,
"grad_norm": 186.85015869140625,
"learning_rate": 4.70724906820208e-13,
"loss": 18.0582,
"step": 2430
},
{
"epoch": 0.9905612503805947,
"grad_norm": 203.409423828125,
"learning_rate": 2.286809663974987e-13,
"loss": 18.464,
"step": 2440
},
{
"epoch": 0.9946209276362529,
"grad_norm": 192.5825653076172,
"learning_rate": 7.306065851042654e-14,
"loss": 17.8112,
"step": 2450
},
{
"epoch": 0.9986806048919111,
"grad_norm": 174.4022674560547,
"learning_rate": 3.890894011593371e-15,
"loss": 17.7037,
"step": 2460
},
{
"epoch": 0.9998985080686086,
"step": 2463,
"total_flos": 0.0,
"train_loss": 17.218061584212457,
"train_runtime": 5138.9594,
"train_samples_per_second": 11.504,
"train_steps_per_second": 0.479
}
],
"logging_steps": 10,
"max_steps": 2463,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}