{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.222658667991288,
"eval_steps": 500,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003912727828512476,
"grad_norm": 8.57561206817627,
"learning_rate": 1.55e-06,
"loss": 5.0338,
"step": 32
},
{
"epoch": 0.007825455657024952,
"grad_norm": 5.36021089553833,
"learning_rate": 3.1500000000000003e-06,
"loss": 4.456,
"step": 64
},
{
"epoch": 0.011738183485537427,
"grad_norm": 3.3196067810058594,
"learning_rate": 4.75e-06,
"loss": 3.9216,
"step": 96
},
{
"epoch": 0.015650911314049904,
"grad_norm": 2.2839956283569336,
"learning_rate": 6.35e-06,
"loss": 3.6983,
"step": 128
},
{
"epoch": 0.01956363914256238,
"grad_norm": 1.4226499795913696,
"learning_rate": 7.95e-06,
"loss": 3.5863,
"step": 160
},
{
"epoch": 0.023476366971074854,
"grad_norm": 0.9770936369895935,
"learning_rate": 9.55e-06,
"loss": 3.5076,
"step": 192
},
{
"epoch": 0.02738909479958733,
"grad_norm": 0.6855128407478333,
"learning_rate": 1.115e-05,
"loss": 3.4515,
"step": 224
},
{
"epoch": 0.03130182262809981,
"grad_norm": 0.5743525624275208,
"learning_rate": 1.2750000000000002e-05,
"loss": 3.4145,
"step": 256
},
{
"epoch": 0.03521455045661228,
"grad_norm": 0.4765739440917969,
"learning_rate": 1.435e-05,
"loss": 3.3892,
"step": 288
},
{
"epoch": 0.03912727828512476,
"grad_norm": 0.40247443318367004,
"learning_rate": 1.595e-05,
"loss": 3.3664,
"step": 320
},
{
"epoch": 0.04304000611363723,
"grad_norm": 0.3582874834537506,
"learning_rate": 1.755e-05,
"loss": 3.3487,
"step": 352
},
{
"epoch": 0.04695273394214971,
"grad_norm": 0.31657862663269043,
"learning_rate": 1.915e-05,
"loss": 3.3349,
"step": 384
},
{
"epoch": 0.05086546177066218,
"grad_norm": 0.28206518292427063,
"learning_rate": 2.075e-05,
"loss": 3.3197,
"step": 416
},
{
"epoch": 0.05477818959917466,
"grad_norm": 0.2577824890613556,
"learning_rate": 2.235e-05,
"loss": 3.3058,
"step": 448
},
{
"epoch": 0.05869091742768714,
"grad_norm": 0.23786848783493042,
"learning_rate": 2.395e-05,
"loss": 3.2955,
"step": 480
},
{
"epoch": 0.06260364525619962,
"grad_norm": 0.2239329218864441,
"learning_rate": 2.555e-05,
"loss": 3.2846,
"step": 512
},
{
"epoch": 0.06651637308471209,
"grad_norm": 0.22519271075725555,
"learning_rate": 2.7150000000000003e-05,
"loss": 3.2731,
"step": 544
},
{
"epoch": 0.07042910091322456,
"grad_norm": 0.2189016044139862,
"learning_rate": 2.8749999999999997e-05,
"loss": 3.2663,
"step": 576
},
{
"epoch": 0.07434182874173703,
"grad_norm": 0.20760661363601685,
"learning_rate": 3.035e-05,
"loss": 3.2581,
"step": 608
},
{
"epoch": 0.07825455657024952,
"grad_norm": 0.205606147646904,
"learning_rate": 3.1950000000000004e-05,
"loss": 3.2451,
"step": 640
},
{
"epoch": 0.08216728439876199,
"grad_norm": 0.22558899223804474,
"learning_rate": 3.355e-05,
"loss": 3.2412,
"step": 672
},
{
"epoch": 0.08608001222727446,
"grad_norm": 0.22584667801856995,
"learning_rate": 3.515e-05,
"loss": 3.2358,
"step": 704
},
{
"epoch": 0.08999274005578695,
"grad_norm": 0.22091105580329895,
"learning_rate": 3.675e-05,
"loss": 3.2302,
"step": 736
},
{
"epoch": 0.09390546788429942,
"grad_norm": 0.22428959608078003,
"learning_rate": 3.8350000000000004e-05,
"loss": 3.2228,
"step": 768
},
{
"epoch": 0.09781819571281189,
"grad_norm": 0.22730223834514618,
"learning_rate": 3.995e-05,
"loss": 3.2207,
"step": 800
},
{
"epoch": 0.10173092354132436,
"grad_norm": 0.28039082884788513,
"learning_rate": 4.155e-05,
"loss": 3.2171,
"step": 832
},
{
"epoch": 0.10564365136983685,
"grad_norm": 0.32776346802711487,
"learning_rate": 4.315e-05,
"loss": 3.2104,
"step": 864
},
{
"epoch": 0.10955637919834932,
"grad_norm": 0.2800813615322113,
"learning_rate": 4.4750000000000004e-05,
"loss": 3.2053,
"step": 896
},
{
"epoch": 0.11346910702686179,
"grad_norm": 0.24571874737739563,
"learning_rate": 4.635e-05,
"loss": 3.2046,
"step": 928
},
{
"epoch": 0.11738183485537428,
"grad_norm": 0.5581298470497131,
"learning_rate": 4.795e-05,
"loss": 3.2004,
"step": 960
},
{
"epoch": 0.12129456268388675,
"grad_norm": 0.47118815779685974,
"learning_rate": 4.9550000000000005e-05,
"loss": 3.1967,
"step": 992
},
{
"epoch": 0.12520729051239923,
"grad_norm": 0.23707512021064758,
"learning_rate": 4.9872222222222225e-05,
"loss": 3.1945,
"step": 1024
},
{
"epoch": 0.1291200183409117,
"grad_norm": 0.41069141030311584,
"learning_rate": 4.969444444444445e-05,
"loss": 3.1928,
"step": 1056
},
{
"epoch": 0.13303274616942418,
"grad_norm": 0.376223623752594,
"learning_rate": 4.9516666666666666e-05,
"loss": 3.1871,
"step": 1088
},
{
"epoch": 0.13694547399793663,
"grad_norm": 0.22380244731903076,
"learning_rate": 4.933888888888889e-05,
"loss": 3.1862,
"step": 1120
},
{
"epoch": 0.14085820182644912,
"grad_norm": 0.2950900197029114,
"learning_rate": 4.9161111111111115e-05,
"loss": 3.1828,
"step": 1152
},
{
"epoch": 0.1447709296549616,
"grad_norm": 0.25872257351875305,
"learning_rate": 4.8983333333333336e-05,
"loss": 3.1828,
"step": 1184
},
{
"epoch": 0.14868365748347406,
"grad_norm": 0.3597142994403839,
"learning_rate": 4.880555555555556e-05,
"loss": 3.1845,
"step": 1216
},
{
"epoch": 0.15259638531198655,
"grad_norm": 0.30377593636512756,
"learning_rate": 4.862777777777778e-05,
"loss": 3.1806,
"step": 1248
},
{
"epoch": 0.15650911314049903,
"grad_norm": 0.3617115318775177,
"learning_rate": 4.845e-05,
"loss": 3.178,
"step": 1280
},
{
"epoch": 0.1604218409690115,
"grad_norm": 0.31589606404304504,
"learning_rate": 4.8272222222222226e-05,
"loss": 3.1787,
"step": 1312
},
{
"epoch": 0.16433456879752398,
"grad_norm": 0.30715763568878174,
"learning_rate": 4.809444444444445e-05,
"loss": 3.1754,
"step": 1344
},
{
"epoch": 0.16824729662603646,
"grad_norm": 0.2574257254600525,
"learning_rate": 4.791666666666667e-05,
"loss": 3.1732,
"step": 1376
},
{
"epoch": 0.17216002445454892,
"grad_norm": 0.3290633261203766,
"learning_rate": 4.773888888888889e-05,
"loss": 3.1723,
"step": 1408
},
{
"epoch": 0.1760727522830614,
"grad_norm": 0.24164608120918274,
"learning_rate": 4.756111111111111e-05,
"loss": 3.1693,
"step": 1440
},
{
"epoch": 0.1799854801115739,
"grad_norm": 0.30125918984413147,
"learning_rate": 4.738333333333334e-05,
"loss": 3.1685,
"step": 1472
},
{
"epoch": 0.18389820794008635,
"grad_norm": 0.3488104045391083,
"learning_rate": 4.720555555555556e-05,
"loss": 3.1678,
"step": 1504
},
{
"epoch": 0.18781093576859884,
"grad_norm": 0.2793637812137604,
"learning_rate": 4.702777777777778e-05,
"loss": 3.1668,
"step": 1536
},
{
"epoch": 0.1917236635971113,
"grad_norm": 0.2682870030403137,
"learning_rate": 4.685000000000001e-05,
"loss": 3.1642,
"step": 1568
},
{
"epoch": 0.19563639142562378,
"grad_norm": 0.36307454109191895,
"learning_rate": 4.667222222222222e-05,
"loss": 3.1654,
"step": 1600
},
{
"epoch": 0.19954911925413626,
"grad_norm": 0.23930683732032776,
"learning_rate": 4.649444444444445e-05,
"loss": 3.1641,
"step": 1632
},
{
"epoch": 0.20346184708264872,
"grad_norm": 0.3049800992012024,
"learning_rate": 4.631666666666667e-05,
"loss": 3.1654,
"step": 1664
},
{
"epoch": 0.2073745749111612,
"grad_norm": 0.27725374698638916,
"learning_rate": 4.613888888888889e-05,
"loss": 3.1642,
"step": 1696
},
{
"epoch": 0.2112873027396737,
"grad_norm": 0.2733665108680725,
"learning_rate": 4.596111111111112e-05,
"loss": 3.1584,
"step": 1728
},
{
"epoch": 0.21520003056818615,
"grad_norm": 0.34570956230163574,
"learning_rate": 4.578333333333333e-05,
"loss": 3.162,
"step": 1760
},
{
"epoch": 0.21911275839669864,
"grad_norm": 0.2521582543849945,
"learning_rate": 4.560555555555556e-05,
"loss": 3.1603,
"step": 1792
},
{
"epoch": 0.22302548622521112,
"grad_norm": 0.29344356060028076,
"learning_rate": 4.542777777777778e-05,
"loss": 3.1587,
"step": 1824
},
{
"epoch": 0.22693821405372358,
"grad_norm": 0.426881343126297,
"learning_rate": 4.525e-05,
"loss": 3.1561,
"step": 1856
},
{
"epoch": 0.23085094188223607,
"grad_norm": 0.27699196338653564,
"learning_rate": 4.507222222222223e-05,
"loss": 3.1581,
"step": 1888
},
{
"epoch": 0.23476366971074855,
"grad_norm": 0.32313504815101624,
"learning_rate": 4.4894444444444444e-05,
"loss": 3.1578,
"step": 1920
},
{
"epoch": 0.238676397539261,
"grad_norm": 0.26697778701782227,
"learning_rate": 4.4716666666666665e-05,
"loss": 3.157,
"step": 1952
},
{
"epoch": 0.2425891253677735,
"grad_norm": 0.2206508368253708,
"learning_rate": 4.453888888888889e-05,
"loss": 3.1551,
"step": 1984
},
{
"epoch": 0.24650185319628595,
"grad_norm": 0.252888947725296,
"learning_rate": 4.4361111111111113e-05,
"loss": 3.1563,
"step": 2016
},
{
"epoch": 0.25041458102479847,
"grad_norm": 0.28254494071006775,
"learning_rate": 4.4183333333333334e-05,
"loss": 3.156,
"step": 2048
},
{
"epoch": 0.2543273088533109,
"grad_norm": 0.28460440039634705,
"learning_rate": 4.4005555555555555e-05,
"loss": 3.156,
"step": 2080
},
{
"epoch": 0.2582400366818234,
"grad_norm": 0.290326863527298,
"learning_rate": 4.3827777777777776e-05,
"loss": 3.1518,
"step": 2112
},
{
"epoch": 0.26215276451033587,
"grad_norm": 0.2769670784473419,
"learning_rate": 4.3650000000000004e-05,
"loss": 3.1515,
"step": 2144
},
{
"epoch": 0.26606549233884835,
"grad_norm": 0.21678052842617035,
"learning_rate": 4.3472222222222225e-05,
"loss": 3.1518,
"step": 2176
},
{
"epoch": 0.26997822016736084,
"grad_norm": 0.3134085536003113,
"learning_rate": 4.3294444444444446e-05,
"loss": 3.1501,
"step": 2208
},
{
"epoch": 0.27389094799587327,
"grad_norm": 0.35099807381629944,
"learning_rate": 4.311666666666667e-05,
"loss": 3.1523,
"step": 2240
},
{
"epoch": 0.27780367582438575,
"grad_norm": 0.27320197224617004,
"learning_rate": 4.293888888888889e-05,
"loss": 3.1507,
"step": 2272
},
{
"epoch": 0.28171640365289824,
"grad_norm": 0.28096139430999756,
"learning_rate": 4.2761111111111115e-05,
"loss": 3.1474,
"step": 2304
},
{
"epoch": 0.2856291314814107,
"grad_norm": 0.30300965905189514,
"learning_rate": 4.2583333333333336e-05,
"loss": 3.15,
"step": 2336
},
{
"epoch": 0.2895418593099232,
"grad_norm": 0.2996535003185272,
"learning_rate": 4.240555555555556e-05,
"loss": 3.1528,
"step": 2368
},
{
"epoch": 0.2934545871384357,
"grad_norm": 0.2503749132156372,
"learning_rate": 4.222777777777778e-05,
"loss": 3.1522,
"step": 2400
},
{
"epoch": 0.2973673149669481,
"grad_norm": 0.2272900640964508,
"learning_rate": 4.205e-05,
"loss": 3.1472,
"step": 2432
},
{
"epoch": 0.3012800427954606,
"grad_norm": 0.2367839366197586,
"learning_rate": 4.1872222222222227e-05,
"loss": 3.1479,
"step": 2464
},
{
"epoch": 0.3051927706239731,
"grad_norm": 0.3656509220600128,
"learning_rate": 4.169444444444445e-05,
"loss": 3.1506,
"step": 2496
},
{
"epoch": 0.3091054984524856,
"grad_norm": 0.25474536418914795,
"learning_rate": 4.151666666666667e-05,
"loss": 3.1506,
"step": 2528
},
{
"epoch": 0.31301822628099807,
"grad_norm": 0.21729741990566254,
"learning_rate": 4.133888888888889e-05,
"loss": 3.1466,
"step": 2560
},
{
"epoch": 0.31693095410951055,
"grad_norm": 0.26999133825302124,
"learning_rate": 4.116111111111111e-05,
"loss": 3.1468,
"step": 2592
},
{
"epoch": 0.320843681938023,
"grad_norm": 0.2668827176094055,
"learning_rate": 4.098333333333334e-05,
"loss": 3.144,
"step": 2624
},
{
"epoch": 0.32475640976653547,
"grad_norm": 0.24051733314990997,
"learning_rate": 4.080555555555556e-05,
"loss": 3.1465,
"step": 2656
},
{
"epoch": 0.32866913759504796,
"grad_norm": 0.24717700481414795,
"learning_rate": 4.062777777777778e-05,
"loss": 3.1465,
"step": 2688
},
{
"epoch": 0.33258186542356044,
"grad_norm": 0.23907746374607086,
"learning_rate": 4.045000000000001e-05,
"loss": 3.1453,
"step": 2720
},
{
"epoch": 0.3364945932520729,
"grad_norm": 0.24447326362133026,
"learning_rate": 4.027222222222222e-05,
"loss": 3.1406,
"step": 2752
},
{
"epoch": 0.34040732108058536,
"grad_norm": 0.25871723890304565,
"learning_rate": 4.009444444444444e-05,
"loss": 3.1435,
"step": 2784
},
{
"epoch": 0.34432004890909784,
"grad_norm": 0.3173305094242096,
"learning_rate": 3.991666666666667e-05,
"loss": 3.1439,
"step": 2816
},
{
"epoch": 0.34823277673761033,
"grad_norm": 0.2715188264846802,
"learning_rate": 3.973888888888889e-05,
"loss": 3.1433,
"step": 2848
},
{
"epoch": 0.3521455045661228,
"grad_norm": 0.2764374315738678,
"learning_rate": 3.956111111111112e-05,
"loss": 3.1455,
"step": 2880
},
{
"epoch": 0.3560582323946353,
"grad_norm": 0.3014623522758484,
"learning_rate": 3.938333333333333e-05,
"loss": 3.1399,
"step": 2912
},
{
"epoch": 0.3599709602231478,
"grad_norm": 0.22385312616825104,
"learning_rate": 3.9205555555555554e-05,
"loss": 3.1426,
"step": 2944
},
{
"epoch": 0.3638836880516602,
"grad_norm": 0.22400549054145813,
"learning_rate": 3.902777777777778e-05,
"loss": 3.1393,
"step": 2976
},
{
"epoch": 0.3677964158801727,
"grad_norm": 0.266812801361084,
"learning_rate": 3.885e-05,
"loss": 3.1426,
"step": 3008
},
{
"epoch": 0.3717091437086852,
"grad_norm": 0.2830856442451477,
"learning_rate": 3.867222222222222e-05,
"loss": 3.14,
"step": 3040
},
{
"epoch": 0.37562187153719767,
"grad_norm": 0.2724515199661255,
"learning_rate": 3.8494444444444444e-05,
"loss": 3.1419,
"step": 3072
},
{
"epoch": 0.37953459936571016,
"grad_norm": 0.22998973727226257,
"learning_rate": 3.8316666666666665e-05,
"loss": 3.139,
"step": 3104
},
{
"epoch": 0.3834473271942226,
"grad_norm": 0.23931734263896942,
"learning_rate": 3.813888888888889e-05,
"loss": 3.1408,
"step": 3136
},
{
"epoch": 0.3873600550227351,
"grad_norm": 0.26907482743263245,
"learning_rate": 3.7961111111111114e-05,
"loss": 3.1374,
"step": 3168
},
{
"epoch": 0.39127278285124756,
"grad_norm": 0.24700401723384857,
"learning_rate": 3.7783333333333335e-05,
"loss": 3.137,
"step": 3200
},
{
"epoch": 0.39518551067976004,
"grad_norm": 0.2963546812534332,
"learning_rate": 3.7605555555555556e-05,
"loss": 3.1401,
"step": 3232
},
{
"epoch": 0.39909823850827253,
"grad_norm": 0.2659439444541931,
"learning_rate": 3.7427777777777777e-05,
"loss": 3.1387,
"step": 3264
},
{
"epoch": 0.403010966336785,
"grad_norm": 0.26796412467956543,
"learning_rate": 3.7250000000000004e-05,
"loss": 3.1403,
"step": 3296
},
{
"epoch": 0.40692369416529744,
"grad_norm": 0.29361388087272644,
"learning_rate": 3.7072222222222225e-05,
"loss": 3.1389,
"step": 3328
},
{
"epoch": 0.41083642199380993,
"grad_norm": 0.24953944981098175,
"learning_rate": 3.6894444444444446e-05,
"loss": 3.1402,
"step": 3360
},
{
"epoch": 0.4147491498223224,
"grad_norm": 0.23955155909061432,
"learning_rate": 3.671666666666667e-05,
"loss": 3.1377,
"step": 3392
},
{
"epoch": 0.4186618776508349,
"grad_norm": 0.22984126210212708,
"learning_rate": 3.653888888888889e-05,
"loss": 3.1375,
"step": 3424
},
{
"epoch": 0.4225746054793474,
"grad_norm": 0.2523467540740967,
"learning_rate": 3.6361111111111116e-05,
"loss": 3.1364,
"step": 3456
},
{
"epoch": 0.4264873333078598,
"grad_norm": 0.23315957188606262,
"learning_rate": 3.6183333333333336e-05,
"loss": 3.1389,
"step": 3488
},
{
"epoch": 0.4304000611363723,
"grad_norm": 0.22483432292938232,
"learning_rate": 3.600555555555556e-05,
"loss": 3.1357,
"step": 3520
},
{
"epoch": 0.4343127889648848,
"grad_norm": 0.23685774207115173,
"learning_rate": 3.582777777777778e-05,
"loss": 3.136,
"step": 3552
},
{
"epoch": 0.4382255167933973,
"grad_norm": 0.24475786089897156,
"learning_rate": 3.565e-05,
"loss": 3.1364,
"step": 3584
},
{
"epoch": 0.44213824462190976,
"grad_norm": 0.21655669808387756,
"learning_rate": 3.547222222222222e-05,
"loss": 3.1363,
"step": 3616
},
{
"epoch": 0.44605097245042225,
"grad_norm": 0.24810287356376648,
"learning_rate": 3.529444444444445e-05,
"loss": 3.1364,
"step": 3648
},
{
"epoch": 0.4499637002789347,
"grad_norm": 0.23016402125358582,
"learning_rate": 3.511666666666667e-05,
"loss": 3.1345,
"step": 3680
},
{
"epoch": 0.45387642810744716,
"grad_norm": 0.24041368067264557,
"learning_rate": 3.4938888888888896e-05,
"loss": 3.1389,
"step": 3712
},
{
"epoch": 0.45778915593595965,
"grad_norm": 0.237365260720253,
"learning_rate": 3.476111111111111e-05,
"loss": 3.1335,
"step": 3744
},
{
"epoch": 0.46170188376447213,
"grad_norm": 0.21840572357177734,
"learning_rate": 3.458333333333333e-05,
"loss": 3.1365,
"step": 3776
},
{
"epoch": 0.4656146115929846,
"grad_norm": 0.22491848468780518,
"learning_rate": 3.440555555555556e-05,
"loss": 3.1365,
"step": 3808
},
{
"epoch": 0.4695273394214971,
"grad_norm": 0.2349662482738495,
"learning_rate": 3.422777777777778e-05,
"loss": 3.1364,
"step": 3840
},
{
"epoch": 0.47344006725000953,
"grad_norm": 0.3244574964046478,
"learning_rate": 3.405e-05,
"loss": 3.1333,
"step": 3872
},
{
"epoch": 0.477352795078522,
"grad_norm": 0.20271480083465576,
"learning_rate": 3.387222222222222e-05,
"loss": 3.1337,
"step": 3904
},
{
"epoch": 0.4812655229070345,
"grad_norm": 0.22787164151668549,
"learning_rate": 3.369444444444444e-05,
"loss": 3.1359,
"step": 3936
},
{
"epoch": 0.485178250735547,
"grad_norm": 0.2814686894416809,
"learning_rate": 3.351666666666667e-05,
"loss": 3.1344,
"step": 3968
},
{
"epoch": 0.4890909785640595,
"grad_norm": 0.20366469025611877,
"learning_rate": 3.333888888888889e-05,
"loss": 3.1342,
"step": 4000
},
{
"epoch": 0.4930037063925719,
"grad_norm": 0.2670027017593384,
"learning_rate": 3.316111111111111e-05,
"loss": 3.1319,
"step": 4032
},
{
"epoch": 0.4969164342210844,
"grad_norm": 0.2204466164112091,
"learning_rate": 3.298333333333333e-05,
"loss": 3.1328,
"step": 4064
},
{
"epoch": 0.5008291620495969,
"grad_norm": 0.2765197157859802,
"learning_rate": 3.2805555555555554e-05,
"loss": 3.132,
"step": 4096
},
{
"epoch": 0.5047418898781093,
"grad_norm": 0.2624960243701935,
"learning_rate": 3.262777777777778e-05,
"loss": 3.1348,
"step": 4128
},
{
"epoch": 0.5086546177066218,
"grad_norm": 0.2254333347082138,
"learning_rate": 3.245e-05,
"loss": 3.1327,
"step": 4160
},
{
"epoch": 0.5125673455351343,
"grad_norm": 0.25047773122787476,
"learning_rate": 3.2272222222222224e-05,
"loss": 3.1318,
"step": 4192
},
{
"epoch": 0.5164800733636468,
"grad_norm": 0.23816271126270294,
"learning_rate": 3.2094444444444445e-05,
"loss": 3.1331,
"step": 4224
},
{
"epoch": 0.5203928011921592,
"grad_norm": 0.22233732044696808,
"learning_rate": 3.1916666666666665e-05,
"loss": 3.1315,
"step": 4256
},
{
"epoch": 0.5243055290206717,
"grad_norm": 0.25133851170539856,
"learning_rate": 3.173888888888889e-05,
"loss": 3.1333,
"step": 4288
},
{
"epoch": 0.5282182568491842,
"grad_norm": 0.21504898369312286,
"learning_rate": 3.1561111111111114e-05,
"loss": 3.1332,
"step": 4320
},
{
"epoch": 0.5321309846776967,
"grad_norm": 0.2872157394886017,
"learning_rate": 3.1383333333333335e-05,
"loss": 3.1303,
"step": 4352
},
{
"epoch": 0.5360437125062092,
"grad_norm": 0.244154691696167,
"learning_rate": 3.1205555555555556e-05,
"loss": 3.1323,
"step": 4384
},
{
"epoch": 0.5399564403347217,
"grad_norm": 0.24791453778743744,
"learning_rate": 3.102777777777778e-05,
"loss": 3.1312,
"step": 4416
},
{
"epoch": 0.5438691681632342,
"grad_norm": 0.2378605306148529,
"learning_rate": 3.0850000000000004e-05,
"loss": 3.1309,
"step": 4448
},
{
"epoch": 0.5477818959917465,
"grad_norm": 0.21514585614204407,
"learning_rate": 3.0672222222222225e-05,
"loss": 3.1244,
"step": 4480
},
{
"epoch": 0.551694623820259,
"grad_norm": 0.22684329748153687,
"learning_rate": 3.0494444444444446e-05,
"loss": 3.1297,
"step": 4512
},
{
"epoch": 0.5556073516487715,
"grad_norm": 0.21271203458309174,
"learning_rate": 3.0316666666666664e-05,
"loss": 3.1286,
"step": 4544
},
{
"epoch": 0.559520079477284,
"grad_norm": 0.22873900830745697,
"learning_rate": 3.0138888888888888e-05,
"loss": 3.1262,
"step": 4576
},
{
"epoch": 0.5634328073057965,
"grad_norm": 0.24229228496551514,
"learning_rate": 2.9961111111111112e-05,
"loss": 3.1312,
"step": 4608
},
{
"epoch": 0.567345535134309,
"grad_norm": 0.2754037380218506,
"learning_rate": 2.9783333333333337e-05,
"loss": 3.1296,
"step": 4640
},
{
"epoch": 0.5712582629628215,
"grad_norm": 0.20053815841674805,
"learning_rate": 2.9605555555555558e-05,
"loss": 3.128,
"step": 4672
},
{
"epoch": 0.5751709907913339,
"grad_norm": 0.24577876925468445,
"learning_rate": 2.9427777777777782e-05,
"loss": 3.1302,
"step": 4704
},
{
"epoch": 0.5790837186198464,
"grad_norm": 0.2547786235809326,
"learning_rate": 2.925e-05,
"loss": 3.1263,
"step": 4736
},
{
"epoch": 0.5829964464483589,
"grad_norm": 0.18451441824436188,
"learning_rate": 2.9072222222222224e-05,
"loss": 3.1282,
"step": 4768
},
{
"epoch": 0.5869091742768714,
"grad_norm": 0.21002881228923798,
"learning_rate": 2.8894444444444445e-05,
"loss": 3.1271,
"step": 4800
},
{
"epoch": 0.5908219021053838,
"grad_norm": 0.21180187165737152,
"learning_rate": 2.871666666666667e-05,
"loss": 3.1272,
"step": 4832
},
{
"epoch": 0.5947346299338963,
"grad_norm": 0.2123003453016281,
"learning_rate": 2.8538888888888893e-05,
"loss": 3.1285,
"step": 4864
},
{
"epoch": 0.5986473577624087,
"grad_norm": 0.20064932107925415,
"learning_rate": 2.836111111111111e-05,
"loss": 3.1289,
"step": 4896
},
{
"epoch": 0.6025600855909212,
"grad_norm": 0.19583889842033386,
"learning_rate": 2.8183333333333335e-05,
"loss": 3.128,
"step": 4928
},
{
"epoch": 0.6064728134194337,
"grad_norm": 0.1817025989294052,
"learning_rate": 2.8005555555555556e-05,
"loss": 3.1263,
"step": 4960
},
{
"epoch": 0.6103855412479462,
"grad_norm": 0.18323124945163727,
"learning_rate": 2.782777777777778e-05,
"loss": 3.1276,
"step": 4992
},
{
"epoch": 0.6142982690764587,
"grad_norm": 0.21348968148231506,
"learning_rate": 2.7650000000000005e-05,
"loss": 3.1262,
"step": 5024
},
{
"epoch": 0.6182109969049712,
"grad_norm": 0.24803143739700317,
"learning_rate": 2.7472222222222222e-05,
"loss": 3.1278,
"step": 5056
},
{
"epoch": 0.6221237247334837,
"grad_norm": 0.27887552976608276,
"learning_rate": 2.7294444444444443e-05,
"loss": 3.1261,
"step": 5088
},
{
"epoch": 0.6260364525619961,
"grad_norm": 0.20992670953273773,
"learning_rate": 2.7116666666666667e-05,
"loss": 3.1248,
"step": 5120
},
{
"epoch": 0.6299491803905086,
"grad_norm": 0.20632390677928925,
"learning_rate": 2.693888888888889e-05,
"loss": 3.1295,
"step": 5152
},
{
"epoch": 0.6338619082190211,
"grad_norm": 0.22720162570476532,
"learning_rate": 2.6761111111111116e-05,
"loss": 3.124,
"step": 5184
},
{
"epoch": 0.6377746360475335,
"grad_norm": 0.20604351162910461,
"learning_rate": 2.6583333333333333e-05,
"loss": 3.1246,
"step": 5216
},
{
"epoch": 0.641687363876046,
"grad_norm": 0.21567173302173615,
"learning_rate": 2.6405555555555554e-05,
"loss": 3.1266,
"step": 5248
},
{
"epoch": 0.6456000917045585,
"grad_norm": 0.22443106770515442,
"learning_rate": 2.622777777777778e-05,
"loss": 3.1265,
"step": 5280
},
{
"epoch": 0.6495128195330709,
"grad_norm": 0.2323237955570221,
"learning_rate": 2.6050000000000003e-05,
"loss": 3.1214,
"step": 5312
},
{
"epoch": 0.6534255473615834,
"grad_norm": 0.21166770160198212,
"learning_rate": 2.5872222222222224e-05,
"loss": 3.125,
"step": 5344
},
{
"epoch": 0.6573382751900959,
"grad_norm": 0.21922937035560608,
"learning_rate": 2.5694444444444445e-05,
"loss": 3.1236,
"step": 5376
},
{
"epoch": 0.6612510030186084,
"grad_norm": 0.19853883981704712,
"learning_rate": 2.5516666666666666e-05,
"loss": 3.1256,
"step": 5408
},
{
"epoch": 0.6651637308471209,
"grad_norm": 0.22357633709907532,
"learning_rate": 2.533888888888889e-05,
"loss": 3.1257,
"step": 5440
},
{
"epoch": 0.6690764586756334,
"grad_norm": 0.22123898565769196,
"learning_rate": 2.5161111111111114e-05,
"loss": 3.1265,
"step": 5472
},
{
"epoch": 0.6729891865041459,
"grad_norm": 0.20758691430091858,
"learning_rate": 2.4983333333333335e-05,
"loss": 3.1244,
"step": 5504
},
{
"epoch": 0.6769019143326583,
"grad_norm": 0.19084863364696503,
"learning_rate": 2.4805555555555556e-05,
"loss": 3.124,
"step": 5536
},
{
"epoch": 0.6808146421611707,
"grad_norm": 0.21082304418087006,
"learning_rate": 2.462777777777778e-05,
"loss": 3.1247,
"step": 5568
},
{
"epoch": 0.6847273699896832,
"grad_norm": 0.19547946751117706,
"learning_rate": 2.445e-05,
"loss": 3.1254,
"step": 5600
},
{
"epoch": 0.6886400978181957,
"grad_norm": 0.20289190113544464,
"learning_rate": 2.4272222222222222e-05,
"loss": 3.1274,
"step": 5632
},
{
"epoch": 0.6925528256467082,
"grad_norm": 0.21069744229316711,
"learning_rate": 2.4094444444444443e-05,
"loss": 3.1235,
"step": 5664
},
{
"epoch": 0.6964655534752207,
"grad_norm": 0.20337700843811035,
"learning_rate": 2.3916666666666668e-05,
"loss": 3.1253,
"step": 5696
},
{
"epoch": 0.7003782813037331,
"grad_norm": 0.2150067836046219,
"learning_rate": 2.3738888888888892e-05,
"loss": 3.1255,
"step": 5728
},
{
"epoch": 0.7042910091322456,
"grad_norm": 0.1990475058555603,
"learning_rate": 2.3561111111111113e-05,
"loss": 3.1247,
"step": 5760
},
{
"epoch": 0.7082037369607581,
"grad_norm": 0.20272456109523773,
"learning_rate": 2.3383333333333334e-05,
"loss": 3.1235,
"step": 5792
},
{
"epoch": 0.7121164647892706,
"grad_norm": 0.21050025522708893,
"learning_rate": 2.3205555555555555e-05,
"loss": 3.1226,
"step": 5824
},
{
"epoch": 0.7160291926177831,
"grad_norm": 0.2530113160610199,
"learning_rate": 2.302777777777778e-05,
"loss": 3.1242,
"step": 5856
},
{
"epoch": 0.7199419204462956,
"grad_norm": 0.2530890703201294,
"learning_rate": 2.2850000000000003e-05,
"loss": 3.1215,
"step": 5888
},
{
"epoch": 0.7238546482748079,
"grad_norm": 0.19028717279434204,
"learning_rate": 2.2672222222222224e-05,
"loss": 3.1236,
"step": 5920
},
{
"epoch": 0.7277673761033204,
"grad_norm": 0.20547839999198914,
"learning_rate": 2.2494444444444445e-05,
"loss": 3.1225,
"step": 5952
},
{
"epoch": 0.7316801039318329,
"grad_norm": 0.19479484856128693,
"learning_rate": 2.231666666666667e-05,
"loss": 3.1248,
"step": 5984
},
{
"epoch": 0.7355928317603454,
"grad_norm": 0.2140408456325531,
"learning_rate": 2.213888888888889e-05,
"loss": 3.1237,
"step": 6016
},
{
"epoch": 0.7395055595888579,
"grad_norm": 0.17809583246707916,
"learning_rate": 2.1961111111111114e-05,
"loss": 3.1243,
"step": 6048
},
{
"epoch": 0.7434182874173704,
"grad_norm": 0.19468888640403748,
"learning_rate": 2.1783333333333332e-05,
"loss": 3.1246,
"step": 6080
},
{
"epoch": 0.7473310152458829,
"grad_norm": 0.2106105089187622,
"learning_rate": 2.1605555555555556e-05,
"loss": 3.1224,
"step": 6112
},
{
"epoch": 0.7512437430743953,
"grad_norm": 0.20489418506622314,
"learning_rate": 2.142777777777778e-05,
"loss": 3.1237,
"step": 6144
},
{
"epoch": 0.7551564709029078,
"grad_norm": 0.2453160136938095,
"learning_rate": 2.125e-05,
"loss": 3.1212,
"step": 6176
},
{
"epoch": 0.7590691987314203,
"grad_norm": 0.2121828943490982,
"learning_rate": 2.1072222222222222e-05,
"loss": 3.1192,
"step": 6208
},
{
"epoch": 0.7629819265599328,
"grad_norm": 0.18198275566101074,
"learning_rate": 2.0894444444444443e-05,
"loss": 3.1213,
"step": 6240
},
{
"epoch": 0.7668946543884452,
"grad_norm": 0.1795693039894104,
"learning_rate": 2.0716666666666668e-05,
"loss": 3.1201,
"step": 6272
},
{
"epoch": 0.7708073822169577,
"grad_norm": 0.24014544486999512,
"learning_rate": 2.0538888888888892e-05,
"loss": 3.122,
"step": 6304
},
{
"epoch": 0.7747201100454701,
"grad_norm": 0.20040743052959442,
"learning_rate": 2.0361111111111113e-05,
"loss": 3.1207,
"step": 6336
},
{
"epoch": 0.7786328378739826,
"grad_norm": 0.2076857089996338,
"learning_rate": 2.0183333333333334e-05,
"loss": 3.1245,
"step": 6368
},
{
"epoch": 0.7825455657024951,
"grad_norm": 0.19411978125572205,
"learning_rate": 2.0005555555555555e-05,
"loss": 3.1216,
"step": 6400
},
{
"epoch": 0.7864582935310076,
"grad_norm": 0.17701873183250427,
"learning_rate": 1.982777777777778e-05,
"loss": 3.1228,
"step": 6432
},
{
"epoch": 0.7903710213595201,
"grad_norm": 0.19787663221359253,
"learning_rate": 1.9650000000000003e-05,
"loss": 3.122,
"step": 6464
},
{
"epoch": 0.7942837491880326,
"grad_norm": 0.18991973996162415,
"learning_rate": 1.947222222222222e-05,
"loss": 3.1211,
"step": 6496
},
{
"epoch": 0.7981964770165451,
"grad_norm": 0.18508349359035492,
"learning_rate": 1.9294444444444445e-05,
"loss": 3.1211,
"step": 6528
},
{
"epoch": 0.8021092048450575,
"grad_norm": 0.17648939788341522,
"learning_rate": 1.911666666666667e-05,
"loss": 3.1237,
"step": 6560
},
{
"epoch": 0.80602193267357,
"grad_norm": 0.20672652125358582,
"learning_rate": 1.893888888888889e-05,
"loss": 3.1213,
"step": 6592
},
{
"epoch": 0.8099346605020824,
"grad_norm": 0.21490968763828278,
"learning_rate": 1.876111111111111e-05,
"loss": 3.1201,
"step": 6624
},
{
"epoch": 0.8138473883305949,
"grad_norm": 0.20175087451934814,
"learning_rate": 1.8583333333333332e-05,
"loss": 3.1184,
"step": 6656
},
{
"epoch": 0.8177601161591074,
"grad_norm": 0.17700786888599396,
"learning_rate": 1.8405555555555556e-05,
"loss": 3.1194,
"step": 6688
},
{
"epoch": 0.8216728439876199,
"grad_norm": 0.19697381556034088,
"learning_rate": 1.822777777777778e-05,
"loss": 3.1208,
"step": 6720
},
{
"epoch": 0.8255855718161323,
"grad_norm": 0.19516746699810028,
"learning_rate": 1.805e-05,
"loss": 3.122,
"step": 6752
},
{
"epoch": 0.8294982996446448,
"grad_norm": 0.19233250617980957,
"learning_rate": 1.7872222222222223e-05,
"loss": 3.1237,
"step": 6784
},
{
"epoch": 0.8334110274731573,
"grad_norm": 0.20740792155265808,
"learning_rate": 1.7694444444444443e-05,
"loss": 3.1227,
"step": 6816
},
{
"epoch": 0.8373237553016698,
"grad_norm": 0.18789739906787872,
"learning_rate": 1.7516666666666668e-05,
"loss": 3.1198,
"step": 6848
},
{
"epoch": 0.8412364831301823,
"grad_norm": 0.17981740832328796,
"learning_rate": 1.7338888888888892e-05,
"loss": 3.121,
"step": 6880
},
{
"epoch": 0.8451492109586948,
"grad_norm": 0.2110264003276825,
"learning_rate": 1.716111111111111e-05,
"loss": 3.1186,
"step": 6912
},
{
"epoch": 0.8490619387872073,
"grad_norm": 0.19858282804489136,
"learning_rate": 1.6983333333333334e-05,
"loss": 3.1236,
"step": 6944
},
{
"epoch": 0.8529746666157196,
"grad_norm": 0.17566311359405518,
"learning_rate": 1.6805555555555558e-05,
"loss": 3.1225,
"step": 6976
},
{
"epoch": 0.8568873944442321,
"grad_norm": 0.19274671375751495,
"learning_rate": 1.662777777777778e-05,
"loss": 3.1197,
"step": 7008
},
{
"epoch": 0.8608001222727446,
"grad_norm": 0.20043255388736725,
"learning_rate": 1.645e-05,
"loss": 3.1221,
"step": 7040
},
{
"epoch": 0.8647128501012571,
"grad_norm": 0.17369119822978973,
"learning_rate": 1.627222222222222e-05,
"loss": 3.119,
"step": 7072
},
{
"epoch": 0.8686255779297696,
"grad_norm": 0.18795572221279144,
"learning_rate": 1.6094444444444445e-05,
"loss": 3.116,
"step": 7104
},
{
"epoch": 0.8725383057582821,
"grad_norm": 0.20084317028522491,
"learning_rate": 1.591666666666667e-05,
"loss": 3.1164,
"step": 7136
},
{
"epoch": 0.8764510335867945,
"grad_norm": 0.1732749342918396,
"learning_rate": 1.573888888888889e-05,
"loss": 3.1184,
"step": 7168
},
{
"epoch": 0.880363761415307,
"grad_norm": 0.18775592744350433,
"learning_rate": 1.556111111111111e-05,
"loss": 3.1186,
"step": 7200
},
{
"epoch": 0.8842764892438195,
"grad_norm": 0.1810338944196701,
"learning_rate": 1.5383333333333332e-05,
"loss": 3.1211,
"step": 7232
},
{
"epoch": 0.888189217072332,
"grad_norm": 0.17264607548713684,
"learning_rate": 1.5205555555555557e-05,
"loss": 3.115,
"step": 7264
},
{
"epoch": 0.8921019449008445,
"grad_norm": 0.18331947922706604,
"learning_rate": 1.502777777777778e-05,
"loss": 3.1176,
"step": 7296
},
{
"epoch": 0.896014672729357,
"grad_norm": 0.1883401870727539,
"learning_rate": 1.485e-05,
"loss": 3.1194,
"step": 7328
},
{
"epoch": 0.8999274005578694,
"grad_norm": 0.17407892644405365,
"learning_rate": 1.4672222222222223e-05,
"loss": 3.1188,
"step": 7360
},
{
"epoch": 0.9038401283863818,
"grad_norm": 0.1941099464893341,
"learning_rate": 1.4494444444444444e-05,
"loss": 3.1211,
"step": 7392
},
{
"epoch": 0.9077528562148943,
"grad_norm": 0.17381389439105988,
"learning_rate": 1.4316666666666668e-05,
"loss": 3.1194,
"step": 7424
},
{
"epoch": 0.9116655840434068,
"grad_norm": 0.18369047343730927,
"learning_rate": 1.413888888888889e-05,
"loss": 3.1165,
"step": 7456
},
{
"epoch": 0.9155783118719193,
"grad_norm": 0.17392371594905853,
"learning_rate": 1.3961111111111111e-05,
"loss": 3.1165,
"step": 7488
},
{
"epoch": 0.9194910397004318,
"grad_norm": 0.17337463796138763,
"learning_rate": 1.3783333333333334e-05,
"loss": 3.1192,
"step": 7520
},
{
"epoch": 0.9234037675289443,
"grad_norm": 0.1813974380493164,
"learning_rate": 1.3605555555555557e-05,
"loss": 3.1158,
"step": 7552
},
{
"epoch": 0.9273164953574567,
"grad_norm": 0.1770683377981186,
"learning_rate": 1.3427777777777778e-05,
"loss": 3.1173,
"step": 7584
},
{
"epoch": 0.9312292231859692,
"grad_norm": 0.18390090763568878,
"learning_rate": 1.3250000000000002e-05,
"loss": 3.1211,
"step": 7616
},
{
"epoch": 0.9351419510144817,
"grad_norm": 0.17356765270233154,
"learning_rate": 1.3072222222222221e-05,
"loss": 3.1187,
"step": 7648
},
{
"epoch": 0.9390546788429942,
"grad_norm": 0.173334538936615,
"learning_rate": 1.2894444444444445e-05,
"loss": 3.1191,
"step": 7680
},
{
"epoch": 0.9429674066715066,
"grad_norm": 0.18598856031894684,
"learning_rate": 1.2716666666666668e-05,
"loss": 3.1192,
"step": 7712
},
{
"epoch": 0.9468801345000191,
"grad_norm": 0.1667858213186264,
"learning_rate": 1.2538888888888889e-05,
"loss": 3.1173,
"step": 7744
},
{
"epoch": 0.9507928623285316,
"grad_norm": 0.17433424293994904,
"learning_rate": 1.2361111111111112e-05,
"loss": 3.1184,
"step": 7776
},
{
"epoch": 0.954705590157044,
"grad_norm": 0.1921132653951645,
"learning_rate": 1.2183333333333334e-05,
"loss": 3.119,
"step": 7808
},
{
"epoch": 0.9586183179855565,
"grad_norm": 0.16437648236751556,
"learning_rate": 1.2005555555555557e-05,
"loss": 3.1179,
"step": 7840
},
{
"epoch": 0.962531045814069,
"grad_norm": 0.17323090136051178,
"learning_rate": 1.1827777777777778e-05,
"loss": 3.1192,
"step": 7872
},
{
"epoch": 0.9664437736425815,
"grad_norm": 0.16646146774291992,
"learning_rate": 1.1650000000000002e-05,
"loss": 3.1176,
"step": 7904
},
{
"epoch": 0.970356501471094,
"grad_norm": 0.18198241293430328,
"learning_rate": 1.1472222222222223e-05,
"loss": 3.1178,
"step": 7936
},
{
"epoch": 0.9742692292996065,
"grad_norm": 0.17490531504154205,
"learning_rate": 1.1294444444444445e-05,
"loss": 3.1161,
"step": 7968
},
{
"epoch": 0.978181957128119,
"grad_norm": 0.17505322396755219,
"learning_rate": 1.1116666666666666e-05,
"loss": 3.1213,
"step": 8000
},
{
"epoch": 0.9820946849566314,
"grad_norm": 0.17005711793899536,
"learning_rate": 1.0938888888888889e-05,
"loss": 3.1187,
"step": 8032
},
{
"epoch": 0.9860074127851438,
"grad_norm": 0.18125712871551514,
"learning_rate": 1.0761111111111112e-05,
"loss": 3.12,
"step": 8064
},
{
"epoch": 0.9899201406136563,
"grad_norm": 0.17013822495937347,
"learning_rate": 1.0583333333333334e-05,
"loss": 3.1157,
"step": 8096
},
{
"epoch": 0.9938328684421688,
"grad_norm": 0.1698048710823059,
"learning_rate": 1.0405555555555555e-05,
"loss": 3.1172,
"step": 8128
},
{
"epoch": 0.9977455962706813,
"grad_norm": 0.17143802344799042,
"learning_rate": 1.0227777777777778e-05,
"loss": 3.1153,
"step": 8160
},
{
"epoch": 1.0015895456803332,
"grad_norm": 0.1739780455827713,
"learning_rate": 1.005e-05,
"loss": 3.1163,
"step": 8192
},
{
"epoch": 1.0055022735088457,
"grad_norm": 0.17907440662384033,
"learning_rate": 9.872222222222223e-06,
"loss": 3.1143,
"step": 8224
},
{
"epoch": 1.0094150013373582,
"grad_norm": 0.17365169525146484,
"learning_rate": 9.694444444444446e-06,
"loss": 3.1157,
"step": 8256
},
{
"epoch": 1.0133277291658707,
"grad_norm": 0.1645737588405609,
"learning_rate": 9.516666666666666e-06,
"loss": 3.1134,
"step": 8288
},
{
"epoch": 1.0172404569943831,
"grad_norm": 0.15174245834350586,
"learning_rate": 9.338888888888889e-06,
"loss": 3.1142,
"step": 8320
},
{
"epoch": 1.0211531848228956,
"grad_norm": 0.16984011232852936,
"learning_rate": 9.161111111111112e-06,
"loss": 3.1142,
"step": 8352
},
{
"epoch": 1.0250659126514081,
"grad_norm": 0.1772463321685791,
"learning_rate": 8.983333333333334e-06,
"loss": 3.1178,
"step": 8384
},
{
"epoch": 1.0289786404799206,
"grad_norm": 0.16304141283035278,
"learning_rate": 8.805555555555555e-06,
"loss": 3.113,
"step": 8416
},
{
"epoch": 1.032891368308433,
"grad_norm": 0.15513816475868225,
"learning_rate": 8.627777777777778e-06,
"loss": 3.1145,
"step": 8448
},
{
"epoch": 1.0368040961369456,
"grad_norm": 0.1862088292837143,
"learning_rate": 8.45e-06,
"loss": 3.1109,
"step": 8480
},
{
"epoch": 1.0407168239654578,
"grad_norm": 0.17995817959308624,
"learning_rate": 8.272222222222223e-06,
"loss": 3.1128,
"step": 8512
},
{
"epoch": 1.0446295517939703,
"grad_norm": 0.1758676916360855,
"learning_rate": 8.094444444444444e-06,
"loss": 3.1128,
"step": 8544
},
{
"epoch": 1.0485422796224828,
"grad_norm": 0.16609688103199005,
"learning_rate": 7.916666666666667e-06,
"loss": 3.114,
"step": 8576
},
{
"epoch": 1.0524550074509953,
"grad_norm": 0.15258896350860596,
"learning_rate": 7.738888888888889e-06,
"loss": 3.1171,
"step": 8608
},
{
"epoch": 1.0563677352795078,
"grad_norm": 0.16240954399108887,
"learning_rate": 7.561111111111112e-06,
"loss": 3.113,
"step": 8640
},
{
"epoch": 1.0602804631080203,
"grad_norm": 0.16423362493515015,
"learning_rate": 7.3833333333333335e-06,
"loss": 3.1154,
"step": 8672
},
{
"epoch": 1.0641931909365328,
"grad_norm": 0.17032068967819214,
"learning_rate": 7.205555555555555e-06,
"loss": 3.1146,
"step": 8704
},
{
"epoch": 1.0681059187650452,
"grad_norm": 0.1564359813928604,
"learning_rate": 7.027777777777778e-06,
"loss": 3.1162,
"step": 8736
},
{
"epoch": 1.0720186465935577,
"grad_norm": 0.15838623046875,
"learning_rate": 6.8500000000000005e-06,
"loss": 3.113,
"step": 8768
},
{
"epoch": 1.0759313744220702,
"grad_norm": 0.17325465381145477,
"learning_rate": 6.672222222222223e-06,
"loss": 3.1153,
"step": 8800
},
{
"epoch": 1.0798441022505827,
"grad_norm": 0.16170760989189148,
"learning_rate": 6.494444444444445e-06,
"loss": 3.115,
"step": 8832
},
{
"epoch": 1.0837568300790952,
"grad_norm": 0.15591956675052643,
"learning_rate": 6.316666666666667e-06,
"loss": 3.1088,
"step": 8864
},
{
"epoch": 1.0876695579076077,
"grad_norm": 0.15115121006965637,
"learning_rate": 6.138888888888889e-06,
"loss": 3.1103,
"step": 8896
},
{
"epoch": 1.0915822857361202,
"grad_norm": 0.1577509045600891,
"learning_rate": 5.961111111111111e-06,
"loss": 3.112,
"step": 8928
},
{
"epoch": 1.0954950135646326,
"grad_norm": 0.1545899361371994,
"learning_rate": 5.783333333333334e-06,
"loss": 3.1108,
"step": 8960
},
{
"epoch": 1.0994077413931451,
"grad_norm": 0.1597297489643097,
"learning_rate": 5.605555555555555e-06,
"loss": 3.1172,
"step": 8992
},
{
"epoch": 1.1033204692216576,
"grad_norm": 0.16016387939453125,
"learning_rate": 5.427777777777778e-06,
"loss": 3.1156,
"step": 9024
},
{
"epoch": 1.10723319705017,
"grad_norm": 0.15304987132549286,
"learning_rate": 5.25e-06,
"loss": 3.1126,
"step": 9056
},
{
"epoch": 1.1111459248786826,
"grad_norm": 0.1560225784778595,
"learning_rate": 5.072222222222222e-06,
"loss": 3.1152,
"step": 9088
},
{
"epoch": 1.115058652707195,
"grad_norm": 0.16613492369651794,
"learning_rate": 4.894444444444445e-06,
"loss": 3.1147,
"step": 9120
},
{
"epoch": 1.1189713805357075,
"grad_norm": 0.15055406093597412,
"learning_rate": 4.7166666666666675e-06,
"loss": 3.1116,
"step": 9152
},
{
"epoch": 1.12288410836422,
"grad_norm": 0.16280752420425415,
"learning_rate": 4.538888888888889e-06,
"loss": 3.1148,
"step": 9184
},
{
"epoch": 1.1267968361927325,
"grad_norm": 0.1523207277059555,
"learning_rate": 4.361111111111112e-06,
"loss": 3.1133,
"step": 9216
},
{
"epoch": 1.1307095640212448,
"grad_norm": 0.1500737965106964,
"learning_rate": 4.183333333333334e-06,
"loss": 3.1177,
"step": 9248
},
{
"epoch": 1.1346222918497573,
"grad_norm": 0.16134943068027496,
"learning_rate": 4.005555555555555e-06,
"loss": 3.1143,
"step": 9280
},
{
"epoch": 1.1385350196782698,
"grad_norm": 0.1499546766281128,
"learning_rate": 3.827777777777778e-06,
"loss": 3.1133,
"step": 9312
},
{
"epoch": 1.1424477475067822,
"grad_norm": 0.15620845556259155,
"learning_rate": 3.6499999999999998e-06,
"loss": 3.1122,
"step": 9344
},
{
"epoch": 1.1463604753352947,
"grad_norm": 0.15544985234737396,
"learning_rate": 3.4722222222222224e-06,
"loss": 3.1146,
"step": 9376
},
{
"epoch": 1.1502732031638072,
"grad_norm": 0.15928788483142853,
"learning_rate": 3.2944444444444446e-06,
"loss": 3.1123,
"step": 9408
},
{
"epoch": 1.1541859309923197,
"grad_norm": 0.14999979734420776,
"learning_rate": 3.1166666666666668e-06,
"loss": 3.1149,
"step": 9440
},
{
"epoch": 1.1580986588208322,
"grad_norm": 0.15014442801475525,
"learning_rate": 2.938888888888889e-06,
"loss": 3.1113,
"step": 9472
},
{
"epoch": 1.1620113866493447,
"grad_norm": 0.14749625325202942,
"learning_rate": 2.761111111111111e-06,
"loss": 3.113,
"step": 9504
},
{
"epoch": 1.1659241144778572,
"grad_norm": 0.14931970834732056,
"learning_rate": 2.5833333333333333e-06,
"loss": 3.1144,
"step": 9536
},
{
"epoch": 1.1698368423063696,
"grad_norm": 0.14572674036026,
"learning_rate": 2.4055555555555555e-06,
"loss": 3.1093,
"step": 9568
},
{
"epoch": 1.1737495701348821,
"grad_norm": 0.15361888706684113,
"learning_rate": 2.227777777777778e-06,
"loss": 3.1138,
"step": 9600
},
{
"epoch": 1.1776622979633946,
"grad_norm": 0.1433536857366562,
"learning_rate": 2.0500000000000003e-06,
"loss": 3.1123,
"step": 9632
},
{
"epoch": 1.181575025791907,
"grad_norm": 0.14533208310604095,
"learning_rate": 1.8722222222222225e-06,
"loss": 3.1116,
"step": 9664
},
{
"epoch": 1.1854877536204196,
"grad_norm": 0.14816279709339142,
"learning_rate": 1.6944444444444446e-06,
"loss": 3.1128,
"step": 9696
},
{
"epoch": 1.189400481448932,
"grad_norm": 0.14798638224601746,
"learning_rate": 1.5166666666666668e-06,
"loss": 3.116,
"step": 9728
},
{
"epoch": 1.1933132092774446,
"grad_norm": 0.1386597454547882,
"learning_rate": 1.338888888888889e-06,
"loss": 3.1145,
"step": 9760
},
{
"epoch": 1.197225937105957,
"grad_norm": 0.14148685336112976,
"learning_rate": 1.161111111111111e-06,
"loss": 3.1115,
"step": 9792
},
{
"epoch": 1.2011386649344695,
"grad_norm": 0.14324016869068146,
"learning_rate": 9.833333333333334e-07,
"loss": 3.1117,
"step": 9824
},
{
"epoch": 1.205051392762982,
"grad_norm": 0.14499281346797943,
"learning_rate": 8.055555555555556e-07,
"loss": 3.1129,
"step": 9856
},
{
"epoch": 1.2089641205914945,
"grad_norm": 0.1464635133743286,
"learning_rate": 6.277777777777778e-07,
"loss": 3.1169,
"step": 9888
},
{
"epoch": 1.2128768484200068,
"grad_norm": 0.14767299592494965,
"learning_rate": 4.5e-07,
"loss": 3.1131,
"step": 9920
},
{
"epoch": 1.2167895762485195,
"grad_norm": 0.14456725120544434,
"learning_rate": 2.722222222222222e-07,
"loss": 3.116,
"step": 9952
},
{
"epoch": 1.2207023040770317,
"grad_norm": 0.1386868953704834,
"learning_rate": 9.444444444444445e-08,
"loss": 3.1151,
"step": 9984
},
{
"epoch": 1.222658667991288,
"step": 10000,
"total_flos": 8.246852548747592e+18,
"train_loss": 1.5593041332244872,
"train_runtime": 85792.9956,
"train_samples_per_second": 238.714,
"train_steps_per_second": 0.117
}
],
"logging_steps": 32,
"max_steps": 10000,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.246852548747592e+18,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}