SCOPE-Direct-v2 / trainer_state.json
Cooolder's picture
Upload folder using huggingface_hub
20ed250 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 6695,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014936798670624918,
"grad_norm": 184.9497833251953,
"learning_rate": 1.3432835820895523e-07,
"loss": 4.6248,
"step": 10
},
{
"epoch": 0.0029873597341249837,
"grad_norm": 123.1812744140625,
"learning_rate": 2.8358208955223886e-07,
"loss": 4.6243,
"step": 20
},
{
"epoch": 0.004481039601187476,
"grad_norm": 195.16688537597656,
"learning_rate": 4.3283582089552244e-07,
"loss": 4.2232,
"step": 30
},
{
"epoch": 0.005974719468249967,
"grad_norm": 90.20713806152344,
"learning_rate": 5.82089552238806e-07,
"loss": 3.2697,
"step": 40
},
{
"epoch": 0.007468399335312459,
"grad_norm": 38.96065139770508,
"learning_rate": 7.313432835820897e-07,
"loss": 2.3537,
"step": 50
},
{
"epoch": 0.008962079202374951,
"grad_norm": 18.04777717590332,
"learning_rate": 8.805970149253732e-07,
"loss": 1.4564,
"step": 60
},
{
"epoch": 0.010455759069437442,
"grad_norm": 14.39755630493164,
"learning_rate": 1.0298507462686568e-06,
"loss": 1.1459,
"step": 70
},
{
"epoch": 0.011949438936499935,
"grad_norm": 14.463386535644531,
"learning_rate": 1.1791044776119403e-06,
"loss": 0.9877,
"step": 80
},
{
"epoch": 0.013443118803562427,
"grad_norm": 15.058645248413086,
"learning_rate": 1.3283582089552241e-06,
"loss": 0.8928,
"step": 90
},
{
"epoch": 0.014936798670624918,
"grad_norm": 14.862004280090332,
"learning_rate": 1.4776119402985075e-06,
"loss": 0.7928,
"step": 100
},
{
"epoch": 0.01643047853768741,
"grad_norm": 14.284353256225586,
"learning_rate": 1.626865671641791e-06,
"loss": 0.7325,
"step": 110
},
{
"epoch": 0.017924158404749903,
"grad_norm": 13.038077354431152,
"learning_rate": 1.7761194029850749e-06,
"loss": 0.6793,
"step": 120
},
{
"epoch": 0.019417838271812395,
"grad_norm": 12.484222412109375,
"learning_rate": 1.9253731343283582e-06,
"loss": 0.6044,
"step": 130
},
{
"epoch": 0.020911518138874884,
"grad_norm": 12.41592788696289,
"learning_rate": 2.074626865671642e-06,
"loss": 0.5792,
"step": 140
},
{
"epoch": 0.022405198005937377,
"grad_norm": 10.647322654724121,
"learning_rate": 2.2238805970149254e-06,
"loss": 0.5225,
"step": 150
},
{
"epoch": 0.02389887787299987,
"grad_norm": 10.59830093383789,
"learning_rate": 2.373134328358209e-06,
"loss": 0.4818,
"step": 160
},
{
"epoch": 0.02539255774006236,
"grad_norm": 8.66019058227539,
"learning_rate": 2.5223880597014925e-06,
"loss": 0.4556,
"step": 170
},
{
"epoch": 0.026886237607124854,
"grad_norm": 7.149711608886719,
"learning_rate": 2.6716417910447763e-06,
"loss": 0.3993,
"step": 180
},
{
"epoch": 0.028379917474187343,
"grad_norm": 5.81325101852417,
"learning_rate": 2.82089552238806e-06,
"loss": 0.4024,
"step": 190
},
{
"epoch": 0.029873597341249836,
"grad_norm": 5.245195388793945,
"learning_rate": 2.9701492537313435e-06,
"loss": 0.4011,
"step": 200
},
{
"epoch": 0.03136727720831233,
"grad_norm": 5.920019149780273,
"learning_rate": 3.1194029850746273e-06,
"loss": 0.3821,
"step": 210
},
{
"epoch": 0.03286095707537482,
"grad_norm": 4.425774097442627,
"learning_rate": 3.2686567164179106e-06,
"loss": 0.3863,
"step": 220
},
{
"epoch": 0.03435463694243731,
"grad_norm": 2.886918067932129,
"learning_rate": 3.417910447761194e-06,
"loss": 0.3741,
"step": 230
},
{
"epoch": 0.035848316809499806,
"grad_norm": 6.619475841522217,
"learning_rate": 3.5671641791044782e-06,
"loss": 0.3674,
"step": 240
},
{
"epoch": 0.037341996676562295,
"grad_norm": 5.154594898223877,
"learning_rate": 3.7164179104477616e-06,
"loss": 0.3615,
"step": 250
},
{
"epoch": 0.03883567654362479,
"grad_norm": 4.2160820960998535,
"learning_rate": 3.865671641791045e-06,
"loss": 0.3373,
"step": 260
},
{
"epoch": 0.04032935641068728,
"grad_norm": 2.998599052429199,
"learning_rate": 4.014925373134328e-06,
"loss": 0.3669,
"step": 270
},
{
"epoch": 0.04182303627774977,
"grad_norm": 4.233543395996094,
"learning_rate": 4.1641791044776125e-06,
"loss": 0.367,
"step": 280
},
{
"epoch": 0.043316716144812265,
"grad_norm": 4.081936359405518,
"learning_rate": 4.313432835820896e-06,
"loss": 0.3581,
"step": 290
},
{
"epoch": 0.044810396011874754,
"grad_norm": 3.0351035594940186,
"learning_rate": 4.462686567164179e-06,
"loss": 0.3378,
"step": 300
},
{
"epoch": 0.04630407587893725,
"grad_norm": 5.17939567565918,
"learning_rate": 4.611940298507463e-06,
"loss": 0.3771,
"step": 310
},
{
"epoch": 0.04779775574599974,
"grad_norm": 2.004800319671631,
"learning_rate": 4.761194029850746e-06,
"loss": 0.334,
"step": 320
},
{
"epoch": 0.04929143561306223,
"grad_norm": 4.026329040527344,
"learning_rate": 4.91044776119403e-06,
"loss": 0.3373,
"step": 330
},
{
"epoch": 0.05078511548012472,
"grad_norm": 3.8416078090667725,
"learning_rate": 5.059701492537314e-06,
"loss": 0.3616,
"step": 340
},
{
"epoch": 0.05227879534718721,
"grad_norm": 2.231009006500244,
"learning_rate": 5.208955223880598e-06,
"loss": 0.3196,
"step": 350
},
{
"epoch": 0.05377247521424971,
"grad_norm": 3.798912286758423,
"learning_rate": 5.358208955223881e-06,
"loss": 0.3638,
"step": 360
},
{
"epoch": 0.0552661550813122,
"grad_norm": 2.688854932785034,
"learning_rate": 5.5074626865671645e-06,
"loss": 0.3411,
"step": 370
},
{
"epoch": 0.056759834948374686,
"grad_norm": 3.7967114448547363,
"learning_rate": 5.656716417910449e-06,
"loss": 0.3717,
"step": 380
},
{
"epoch": 0.05825351481543718,
"grad_norm": 2.219935655593872,
"learning_rate": 5.805970149253732e-06,
"loss": 0.33,
"step": 390
},
{
"epoch": 0.05974719468249967,
"grad_norm": 2.561140775680542,
"learning_rate": 5.9552238805970155e-06,
"loss": 0.3509,
"step": 400
},
{
"epoch": 0.06124087454956217,
"grad_norm": 1.8147597312927246,
"learning_rate": 6.1044776119403e-06,
"loss": 0.3371,
"step": 410
},
{
"epoch": 0.06273455441662466,
"grad_norm": 2.9185450077056885,
"learning_rate": 6.253731343283582e-06,
"loss": 0.3351,
"step": 420
},
{
"epoch": 0.06422823428368715,
"grad_norm": 1.7370891571044922,
"learning_rate": 6.4029850746268664e-06,
"loss": 0.3465,
"step": 430
},
{
"epoch": 0.06572191415074964,
"grad_norm": 1.9567862749099731,
"learning_rate": 6.552238805970151e-06,
"loss": 0.3617,
"step": 440
},
{
"epoch": 0.06721559401781213,
"grad_norm": 3.0918054580688477,
"learning_rate": 6.701492537313433e-06,
"loss": 0.344,
"step": 450
},
{
"epoch": 0.06870927388487462,
"grad_norm": 2.9340105056762695,
"learning_rate": 6.850746268656717e-06,
"loss": 0.3464,
"step": 460
},
{
"epoch": 0.07020295375193712,
"grad_norm": 2.3813934326171875,
"learning_rate": 7e-06,
"loss": 0.3233,
"step": 470
},
{
"epoch": 0.07169663361899961,
"grad_norm": 1.6313637495040894,
"learning_rate": 7.149253731343284e-06,
"loss": 0.3377,
"step": 480
},
{
"epoch": 0.0731903134860621,
"grad_norm": 2.1293253898620605,
"learning_rate": 7.298507462686568e-06,
"loss": 0.3595,
"step": 490
},
{
"epoch": 0.07468399335312459,
"grad_norm": 2.9458706378936768,
"learning_rate": 7.447761194029851e-06,
"loss": 0.3289,
"step": 500
},
{
"epoch": 0.07468399335312459,
"eval_loss": 0.3393873870372772,
"eval_runtime": 77.3397,
"eval_samples_per_second": 6.995,
"eval_steps_per_second": 3.504,
"step": 500
},
{
"epoch": 0.07617767322018708,
"grad_norm": 3.799473762512207,
"learning_rate": 7.597014925373135e-06,
"loss": 0.3188,
"step": 510
},
{
"epoch": 0.07767135308724958,
"grad_norm": 2.0290870666503906,
"learning_rate": 7.746268656716418e-06,
"loss": 0.3592,
"step": 520
},
{
"epoch": 0.07916503295431207,
"grad_norm": 3.1726157665252686,
"learning_rate": 7.895522388059703e-06,
"loss": 0.3284,
"step": 530
},
{
"epoch": 0.08065871282137456,
"grad_norm": 2.264389991760254,
"learning_rate": 8.044776119402986e-06,
"loss": 0.3631,
"step": 540
},
{
"epoch": 0.08215239268843705,
"grad_norm": 2.3615527153015137,
"learning_rate": 8.19402985074627e-06,
"loss": 0.3333,
"step": 550
},
{
"epoch": 0.08364607255549954,
"grad_norm": 2.3802566528320312,
"learning_rate": 8.343283582089553e-06,
"loss": 0.3482,
"step": 560
},
{
"epoch": 0.08513975242256204,
"grad_norm": 2.536975145339966,
"learning_rate": 8.492537313432838e-06,
"loss": 0.3316,
"step": 570
},
{
"epoch": 0.08663343228962453,
"grad_norm": 2.01639723777771,
"learning_rate": 8.64179104477612e-06,
"loss": 0.355,
"step": 580
},
{
"epoch": 0.08812711215668702,
"grad_norm": 2.158482313156128,
"learning_rate": 8.791044776119405e-06,
"loss": 0.3637,
"step": 590
},
{
"epoch": 0.08962079202374951,
"grad_norm": 3.023801326751709,
"learning_rate": 8.940298507462686e-06,
"loss": 0.3663,
"step": 600
},
{
"epoch": 0.091114471890812,
"grad_norm": 1.7491657733917236,
"learning_rate": 9.089552238805971e-06,
"loss": 0.368,
"step": 610
},
{
"epoch": 0.0926081517578745,
"grad_norm": 1.5877282619476318,
"learning_rate": 9.238805970149255e-06,
"loss": 0.3366,
"step": 620
},
{
"epoch": 0.09410183162493699,
"grad_norm": 3.5212433338165283,
"learning_rate": 9.388059701492538e-06,
"loss": 0.3501,
"step": 630
},
{
"epoch": 0.09559551149199948,
"grad_norm": 2.3926730155944824,
"learning_rate": 9.537313432835821e-06,
"loss": 0.3328,
"step": 640
},
{
"epoch": 0.09708919135906197,
"grad_norm": 3.278258800506592,
"learning_rate": 9.686567164179105e-06,
"loss": 0.3635,
"step": 650
},
{
"epoch": 0.09858287122612445,
"grad_norm": 2.390896797180176,
"learning_rate": 9.835820895522388e-06,
"loss": 0.3453,
"step": 660
},
{
"epoch": 0.10007655109318696,
"grad_norm": 2.1486220359802246,
"learning_rate": 9.985074626865673e-06,
"loss": 0.327,
"step": 670
},
{
"epoch": 0.10157023096024945,
"grad_norm": 3.7770419120788574,
"learning_rate": 9.999944943338487e-06,
"loss": 0.3048,
"step": 680
},
{
"epoch": 0.10306391082731194,
"grad_norm": 3.788212776184082,
"learning_rate": 9.999754625571397e-06,
"loss": 0.3593,
"step": 690
},
{
"epoch": 0.10455759069437442,
"grad_norm": 2.0790538787841797,
"learning_rate": 9.999428372160074e-06,
"loss": 0.3782,
"step": 700
},
{
"epoch": 0.10605127056143691,
"grad_norm": 2.0736265182495117,
"learning_rate": 9.998966191974846e-06,
"loss": 0.3522,
"step": 710
},
{
"epoch": 0.10754495042849942,
"grad_norm": 2.3214290142059326,
"learning_rate": 9.998368097581685e-06,
"loss": 0.3844,
"step": 720
},
{
"epoch": 0.1090386302955619,
"grad_norm": 1.3843424320220947,
"learning_rate": 9.997634105241855e-06,
"loss": 0.3387,
"step": 730
},
{
"epoch": 0.1105323101626244,
"grad_norm": 4.11653995513916,
"learning_rate": 9.996764234911483e-06,
"loss": 0.3523,
"step": 740
},
{
"epoch": 0.11202599002968688,
"grad_norm": 1.6446789503097534,
"learning_rate": 9.995758510241003e-06,
"loss": 0.3339,
"step": 750
},
{
"epoch": 0.11351966989674937,
"grad_norm": 1.4377137422561646,
"learning_rate": 9.994616958574526e-06,
"loss": 0.3523,
"step": 760
},
{
"epoch": 0.11501334976381188,
"grad_norm": 1.9575657844543457,
"learning_rate": 9.993339610949084e-06,
"loss": 0.3654,
"step": 770
},
{
"epoch": 0.11650702963087436,
"grad_norm": 1.8258610963821411,
"learning_rate": 9.9919265020938e-06,
"loss": 0.3465,
"step": 780
},
{
"epoch": 0.11800070949793685,
"grad_norm": 2.1197669506073,
"learning_rate": 9.99037767042893e-06,
"loss": 0.36,
"step": 790
},
{
"epoch": 0.11949438936499934,
"grad_norm": 1.671007752418518,
"learning_rate": 9.988693158064826e-06,
"loss": 0.3182,
"step": 800
},
{
"epoch": 0.12098806923206183,
"grad_norm": 2.0421807765960693,
"learning_rate": 9.986873010800792e-06,
"loss": 0.3402,
"step": 810
},
{
"epoch": 0.12248174909912433,
"grad_norm": 2.7439417839050293,
"learning_rate": 9.984917278123832e-06,
"loss": 0.3551,
"step": 820
},
{
"epoch": 0.12397542896618682,
"grad_norm": 1.2339754104614258,
"learning_rate": 9.982826013207314e-06,
"loss": 0.3407,
"step": 830
},
{
"epoch": 0.12546910883324933,
"grad_norm": 3.45686674118042,
"learning_rate": 9.980599272909517e-06,
"loss": 0.3262,
"step": 840
},
{
"epoch": 0.1269627887003118,
"grad_norm": 2.196939468383789,
"learning_rate": 9.978237117772086e-06,
"loss": 0.3537,
"step": 850
},
{
"epoch": 0.1284564685673743,
"grad_norm": 2.2232518196105957,
"learning_rate": 9.975739612018391e-06,
"loss": 0.3621,
"step": 860
},
{
"epoch": 0.12995014843443678,
"grad_norm": 1.7306561470031738,
"learning_rate": 9.973106823551772e-06,
"loss": 0.3207,
"step": 870
},
{
"epoch": 0.13144382830149928,
"grad_norm": 1.6579896211624146,
"learning_rate": 9.970338823953704e-06,
"loss": 0.3399,
"step": 880
},
{
"epoch": 0.13293750816856179,
"grad_norm": 2.4403505325317383,
"learning_rate": 9.96743568848184e-06,
"loss": 0.3616,
"step": 890
},
{
"epoch": 0.13443118803562426,
"grad_norm": 2.051017999649048,
"learning_rate": 9.964397496067972e-06,
"loss": 0.3408,
"step": 900
},
{
"epoch": 0.13592486790268676,
"grad_norm": 1.935581922531128,
"learning_rate": 9.961224329315886e-06,
"loss": 0.3469,
"step": 910
},
{
"epoch": 0.13741854776974924,
"grad_norm": 2.0615079402923584,
"learning_rate": 9.957916274499103e-06,
"loss": 0.3401,
"step": 920
},
{
"epoch": 0.13891222763681174,
"grad_norm": 2.1460647583007812,
"learning_rate": 9.954473421558554e-06,
"loss": 0.328,
"step": 930
},
{
"epoch": 0.14040590750387424,
"grad_norm": 1.6261128187179565,
"learning_rate": 9.950895864100117e-06,
"loss": 0.3483,
"step": 940
},
{
"epoch": 0.14189958737093672,
"grad_norm": 2.0029091835021973,
"learning_rate": 9.947183699392083e-06,
"loss": 0.3655,
"step": 950
},
{
"epoch": 0.14339326723799922,
"grad_norm": 2.068676233291626,
"learning_rate": 9.943337028362503e-06,
"loss": 0.3133,
"step": 960
},
{
"epoch": 0.1448869471050617,
"grad_norm": 2.6636133193969727,
"learning_rate": 9.93935595559645e-06,
"loss": 0.3295,
"step": 970
},
{
"epoch": 0.1463806269721242,
"grad_norm": 1.58219313621521,
"learning_rate": 9.935240589333179e-06,
"loss": 0.3247,
"step": 980
},
{
"epoch": 0.1478743068391867,
"grad_norm": 1.7050349712371826,
"learning_rate": 9.930991041463166e-06,
"loss": 0.3172,
"step": 990
},
{
"epoch": 0.14936798670624918,
"grad_norm": 1.4773019552230835,
"learning_rate": 9.926607427525094e-06,
"loss": 0.3445,
"step": 1000
},
{
"epoch": 0.14936798670624918,
"eval_loss": 0.33145418763160706,
"eval_runtime": 76.2509,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 3.554,
"step": 1000
},
{
"epoch": 0.15086166657331168,
"grad_norm": 2.2282180786132812,
"learning_rate": 9.922089866702685e-06,
"loss": 0.3449,
"step": 1010
},
{
"epoch": 0.15235534644037416,
"grad_norm": 1.7335081100463867,
"learning_rate": 9.917438481821475e-06,
"loss": 0.3664,
"step": 1020
},
{
"epoch": 0.15384902630743666,
"grad_norm": 1.7053015232086182,
"learning_rate": 9.912653399345473e-06,
"loss": 0.3457,
"step": 1030
},
{
"epoch": 0.15534270617449916,
"grad_norm": 1.1435269117355347,
"learning_rate": 9.907734749373712e-06,
"loss": 0.3177,
"step": 1040
},
{
"epoch": 0.15683638604156164,
"grad_norm": 3.1279070377349854,
"learning_rate": 9.90268266563673e-06,
"loss": 0.351,
"step": 1050
},
{
"epoch": 0.15833006590862414,
"grad_norm": 2.90409779548645,
"learning_rate": 9.897497285492919e-06,
"loss": 0.3403,
"step": 1060
},
{
"epoch": 0.15982374577568662,
"grad_norm": 1.5271624326705933,
"learning_rate": 9.892178749924792e-06,
"loss": 0.3039,
"step": 1070
},
{
"epoch": 0.16131742564274912,
"grad_norm": 1.622085452079773,
"learning_rate": 9.886727203535163e-06,
"loss": 0.3323,
"step": 1080
},
{
"epoch": 0.16281110550981162,
"grad_norm": 2.4689345359802246,
"learning_rate": 9.881142794543196e-06,
"loss": 0.3069,
"step": 1090
},
{
"epoch": 0.1643047853768741,
"grad_norm": 1.3529936075210571,
"learning_rate": 9.875425674780388e-06,
"loss": 0.3265,
"step": 1100
},
{
"epoch": 0.1657984652439366,
"grad_norm": 1.7921439409255981,
"learning_rate": 9.86957599968644e-06,
"loss": 0.3439,
"step": 1110
},
{
"epoch": 0.16729214511099907,
"grad_norm": 1.8610994815826416,
"learning_rate": 9.863593928305031e-06,
"loss": 0.323,
"step": 1120
},
{
"epoch": 0.16878582497806158,
"grad_norm": 1.460547685623169,
"learning_rate": 9.857479623279481e-06,
"loss": 0.3502,
"step": 1130
},
{
"epoch": 0.17027950484512408,
"grad_norm": 1.4841840267181396,
"learning_rate": 9.851233250848355e-06,
"loss": 0.332,
"step": 1140
},
{
"epoch": 0.17177318471218656,
"grad_norm": 1.907037377357483,
"learning_rate": 9.844854980840914e-06,
"loss": 0.3251,
"step": 1150
},
{
"epoch": 0.17326686457924906,
"grad_norm": 1.833953857421875,
"learning_rate": 9.838344986672518e-06,
"loss": 0.3628,
"step": 1160
},
{
"epoch": 0.17476054444631153,
"grad_norm": 1.5002996921539307,
"learning_rate": 9.831703445339904e-06,
"loss": 0.3346,
"step": 1170
},
{
"epoch": 0.17625422431337404,
"grad_norm": 1.2558107376098633,
"learning_rate": 9.824930537416372e-06,
"loss": 0.3429,
"step": 1180
},
{
"epoch": 0.17774790418043654,
"grad_norm": 2.025219678878784,
"learning_rate": 9.81802644704688e-06,
"loss": 0.3385,
"step": 1190
},
{
"epoch": 0.17924158404749901,
"grad_norm": 1.8134315013885498,
"learning_rate": 9.810991361943037e-06,
"loss": 0.3362,
"step": 1200
},
{
"epoch": 0.18073526391456152,
"grad_norm": 1.2848788499832153,
"learning_rate": 9.80382547337799e-06,
"loss": 0.3146,
"step": 1210
},
{
"epoch": 0.182228943781624,
"grad_norm": 1.6937299966812134,
"learning_rate": 9.796528976181238e-06,
"loss": 0.3192,
"step": 1220
},
{
"epoch": 0.1837226236486865,
"grad_norm": 1.8493894338607788,
"learning_rate": 9.78910206873333e-06,
"loss": 0.3463,
"step": 1230
},
{
"epoch": 0.185216303515749,
"grad_norm": 1.8786216974258423,
"learning_rate": 9.781544952960458e-06,
"loss": 0.3178,
"step": 1240
},
{
"epoch": 0.18670998338281147,
"grad_norm": 1.6666313409805298,
"learning_rate": 9.773857834328992e-06,
"loss": 0.3263,
"step": 1250
},
{
"epoch": 0.18820366324987398,
"grad_norm": 1.6735985279083252,
"learning_rate": 9.766040921839867e-06,
"loss": 0.3435,
"step": 1260
},
{
"epoch": 0.18969734311693645,
"grad_norm": 1.4434776306152344,
"learning_rate": 9.758094428022927e-06,
"loss": 0.3291,
"step": 1270
},
{
"epoch": 0.19119102298399895,
"grad_norm": 2.4167513847351074,
"learning_rate": 9.750018568931122e-06,
"loss": 0.3433,
"step": 1280
},
{
"epoch": 0.19268470285106146,
"grad_norm": 1.7800685167312622,
"learning_rate": 9.741813564134647e-06,
"loss": 0.3223,
"step": 1290
},
{
"epoch": 0.19417838271812393,
"grad_norm": 1.9175775051116943,
"learning_rate": 9.733479636714978e-06,
"loss": 0.3549,
"step": 1300
},
{
"epoch": 0.19567206258518643,
"grad_norm": 2.1426539421081543,
"learning_rate": 9.725017013258789e-06,
"loss": 0.3243,
"step": 1310
},
{
"epoch": 0.1971657424522489,
"grad_norm": 1.8812899589538574,
"learning_rate": 9.716425923851804e-06,
"loss": 0.3312,
"step": 1320
},
{
"epoch": 0.1986594223193114,
"grad_norm": 1.4907119274139404,
"learning_rate": 9.707706602072547e-06,
"loss": 0.3499,
"step": 1330
},
{
"epoch": 0.20015310218637392,
"grad_norm": 1.8211297988891602,
"learning_rate": 9.69885928498597e-06,
"loss": 0.3289,
"step": 1340
},
{
"epoch": 0.2016467820534364,
"grad_norm": 1.4706814289093018,
"learning_rate": 9.689884213137033e-06,
"loss": 0.3252,
"step": 1350
},
{
"epoch": 0.2031404619204989,
"grad_norm": 2.1436257362365723,
"learning_rate": 9.68078163054414e-06,
"loss": 0.3314,
"step": 1360
},
{
"epoch": 0.20463414178756137,
"grad_norm": 2.100780725479126,
"learning_rate": 9.671551784692529e-06,
"loss": 0.3227,
"step": 1370
},
{
"epoch": 0.20612782165462387,
"grad_norm": 1.4741297960281372,
"learning_rate": 9.662194926527517e-06,
"loss": 0.3467,
"step": 1380
},
{
"epoch": 0.20762150152168637,
"grad_norm": 2.250545024871826,
"learning_rate": 9.6527113104477e-06,
"loss": 0.3504,
"step": 1390
},
{
"epoch": 0.20911518138874885,
"grad_norm": 2.133129835128784,
"learning_rate": 9.643101194298023e-06,
"loss": 0.3535,
"step": 1400
},
{
"epoch": 0.21060886125581135,
"grad_norm": 2.9924333095550537,
"learning_rate": 9.633364839362777e-06,
"loss": 0.3501,
"step": 1410
},
{
"epoch": 0.21210254112287383,
"grad_norm": 2.5759615898132324,
"learning_rate": 9.623502510358488e-06,
"loss": 0.3427,
"step": 1420
},
{
"epoch": 0.21359622098993633,
"grad_norm": 1.1932740211486816,
"learning_rate": 9.613514475426722e-06,
"loss": 0.3381,
"step": 1430
},
{
"epoch": 0.21508990085699883,
"grad_norm": 1.5130189657211304,
"learning_rate": 9.6034010061268e-06,
"loss": 0.3297,
"step": 1440
},
{
"epoch": 0.2165835807240613,
"grad_norm": 1.248481035232544,
"learning_rate": 9.59316237742841e-06,
"loss": 0.3251,
"step": 1450
},
{
"epoch": 0.2180772605911238,
"grad_norm": 1.7967370748519897,
"learning_rate": 9.582798867704131e-06,
"loss": 0.3398,
"step": 1460
},
{
"epoch": 0.2195709404581863,
"grad_norm": 1.2705239057540894,
"learning_rate": 9.572310758721864e-06,
"loss": 0.3053,
"step": 1470
},
{
"epoch": 0.2210646203252488,
"grad_norm": 1.6166293621063232,
"learning_rate": 9.561698335637171e-06,
"loss": 0.3424,
"step": 1480
},
{
"epoch": 0.2225583001923113,
"grad_norm": 1.8217055797576904,
"learning_rate": 9.550961886985528e-06,
"loss": 0.347,
"step": 1490
},
{
"epoch": 0.22405198005937377,
"grad_norm": 1.6405028104782104,
"learning_rate": 9.540101704674473e-06,
"loss": 0.3383,
"step": 1500
},
{
"epoch": 0.22405198005937377,
"eval_loss": 0.326803594827652,
"eval_runtime": 76.2278,
"eval_samples_per_second": 7.097,
"eval_steps_per_second": 3.555,
"step": 1500
},
{
"epoch": 0.22554565992643627,
"grad_norm": 2.2624378204345703,
"learning_rate": 9.529118083975672e-06,
"loss": 0.335,
"step": 1510
},
{
"epoch": 0.22703933979349875,
"grad_norm": 1.2416856288909912,
"learning_rate": 9.518011323516892e-06,
"loss": 0.342,
"step": 1520
},
{
"epoch": 0.22853301966056125,
"grad_norm": 1.1462935209274292,
"learning_rate": 9.506781725273879e-06,
"loss": 0.3226,
"step": 1530
},
{
"epoch": 0.23002669952762375,
"grad_norm": 1.9097304344177246,
"learning_rate": 9.495429594562151e-06,
"loss": 0.3213,
"step": 1540
},
{
"epoch": 0.23152037939468623,
"grad_norm": 1.6176527738571167,
"learning_rate": 9.483955240028695e-06,
"loss": 0.3348,
"step": 1550
},
{
"epoch": 0.23301405926174873,
"grad_norm": 1.6169483661651611,
"learning_rate": 9.472358973643576e-06,
"loss": 0.3237,
"step": 1560
},
{
"epoch": 0.2345077391288112,
"grad_norm": 1.86874258518219,
"learning_rate": 9.460641110691456e-06,
"loss": 0.3475,
"step": 1570
},
{
"epoch": 0.2360014189958737,
"grad_norm": 1.534540057182312,
"learning_rate": 9.448801969763016e-06,
"loss": 0.3487,
"step": 1580
},
{
"epoch": 0.2374950988629362,
"grad_norm": 1.68146550655365,
"learning_rate": 9.436841872746309e-06,
"loss": 0.3128,
"step": 1590
},
{
"epoch": 0.23898877872999869,
"grad_norm": 1.0647422075271606,
"learning_rate": 9.424761144817987e-06,
"loss": 0.3437,
"step": 1600
},
{
"epoch": 0.2404824585970612,
"grad_norm": 1.4840996265411377,
"learning_rate": 9.412560114434477e-06,
"loss": 0.3483,
"step": 1610
},
{
"epoch": 0.24197613846412366,
"grad_norm": 2.902223587036133,
"learning_rate": 9.400239113323042e-06,
"loss": 0.3654,
"step": 1620
},
{
"epoch": 0.24346981833118617,
"grad_norm": 1.711083173751831,
"learning_rate": 9.387798476472766e-06,
"loss": 0.3369,
"step": 1630
},
{
"epoch": 0.24496349819824867,
"grad_norm": 1.4812666177749634,
"learning_rate": 9.37523854212545e-06,
"loss": 0.3521,
"step": 1640
},
{
"epoch": 0.24645717806531114,
"grad_norm": 1.1067218780517578,
"learning_rate": 9.362559651766402e-06,
"loss": 0.302,
"step": 1650
},
{
"epoch": 0.24795085793237365,
"grad_norm": 1.2541941404342651,
"learning_rate": 9.349762150115163e-06,
"loss": 0.3348,
"step": 1660
},
{
"epoch": 0.24944453779943612,
"grad_norm": 1.125554084777832,
"learning_rate": 9.336846385116138e-06,
"loss": 0.3444,
"step": 1670
},
{
"epoch": 0.25093821766649865,
"grad_norm": 1.8702635765075684,
"learning_rate": 9.323812707929126e-06,
"loss": 0.3092,
"step": 1680
},
{
"epoch": 0.2524318975335611,
"grad_norm": 1.7931956052780151,
"learning_rate": 9.31066147291978e-06,
"loss": 0.3416,
"step": 1690
},
{
"epoch": 0.2539255774006236,
"grad_norm": 1.8561383485794067,
"learning_rate": 9.297393037649965e-06,
"loss": 0.3521,
"step": 1700
},
{
"epoch": 0.2554192572676861,
"grad_norm": 1.8236286640167236,
"learning_rate": 9.284007762868047e-06,
"loss": 0.3025,
"step": 1710
},
{
"epoch": 0.2569129371347486,
"grad_norm": 1.4247581958770752,
"learning_rate": 9.270506012499072e-06,
"loss": 0.336,
"step": 1720
},
{
"epoch": 0.2584066170018111,
"grad_norm": 1.79005765914917,
"learning_rate": 9.256888153634888e-06,
"loss": 0.3153,
"step": 1730
},
{
"epoch": 0.25990029686887356,
"grad_norm": 2.0392203330993652,
"learning_rate": 9.243154556524144e-06,
"loss": 0.3462,
"step": 1740
},
{
"epoch": 0.26139397673593606,
"grad_norm": 1.978434681892395,
"learning_rate": 9.229305594562236e-06,
"loss": 0.3491,
"step": 1750
},
{
"epoch": 0.26288765660299857,
"grad_norm": 2.794302463531494,
"learning_rate": 9.215341644281161e-06,
"loss": 0.3432,
"step": 1760
},
{
"epoch": 0.26438133647006107,
"grad_norm": 2.9479925632476807,
"learning_rate": 9.201263085339266e-06,
"loss": 0.3267,
"step": 1770
},
{
"epoch": 0.26587501633712357,
"grad_norm": 1.6784942150115967,
"learning_rate": 9.187070300510927e-06,
"loss": 0.3403,
"step": 1780
},
{
"epoch": 0.267368696204186,
"grad_norm": 1.38883638381958,
"learning_rate": 9.172763675676153e-06,
"loss": 0.3242,
"step": 1790
},
{
"epoch": 0.2688623760712485,
"grad_norm": 2.5442349910736084,
"learning_rate": 9.158343599810087e-06,
"loss": 0.3369,
"step": 1800
},
{
"epoch": 0.270356055938311,
"grad_norm": 1.3056666851043701,
"learning_rate": 9.143810464972429e-06,
"loss": 0.3129,
"step": 1810
},
{
"epoch": 0.2718497358053735,
"grad_norm": 1.8842471837997437,
"learning_rate": 9.12916466629678e-06,
"loss": 0.3257,
"step": 1820
},
{
"epoch": 0.27334341567243603,
"grad_norm": 0.9923204183578491,
"learning_rate": 9.114406601979895e-06,
"loss": 0.3208,
"step": 1830
},
{
"epoch": 0.2748370955394985,
"grad_norm": 1.6141374111175537,
"learning_rate": 9.099536673270864e-06,
"loss": 0.3253,
"step": 1840
},
{
"epoch": 0.276330775406561,
"grad_norm": 2.0269787311553955,
"learning_rate": 9.084555284460192e-06,
"loss": 0.3179,
"step": 1850
},
{
"epoch": 0.2778244552736235,
"grad_norm": 1.620477557182312,
"learning_rate": 9.06946284286882e-06,
"loss": 0.3224,
"step": 1860
},
{
"epoch": 0.279318135140686,
"grad_norm": 1.725224494934082,
"learning_rate": 9.054259758837038e-06,
"loss": 0.3288,
"step": 1870
},
{
"epoch": 0.2808118150077485,
"grad_norm": 2.209329605102539,
"learning_rate": 9.038946445713335e-06,
"loss": 0.3421,
"step": 1880
},
{
"epoch": 0.28230549487481094,
"grad_norm": 1.3899812698364258,
"learning_rate": 9.02352331984316e-06,
"loss": 0.3255,
"step": 1890
},
{
"epoch": 0.28379917474187344,
"grad_norm": 1.5803393125534058,
"learning_rate": 9.007990800557601e-06,
"loss": 0.3147,
"step": 1900
},
{
"epoch": 0.28529285460893594,
"grad_norm": 1.134922742843628,
"learning_rate": 8.992349310161989e-06,
"loss": 0.3412,
"step": 1910
},
{
"epoch": 0.28678653447599844,
"grad_norm": 1.9992294311523438,
"learning_rate": 8.976599273924406e-06,
"loss": 0.3429,
"step": 1920
},
{
"epoch": 0.28828021434306095,
"grad_norm": 1.468029260635376,
"learning_rate": 8.960741120064131e-06,
"loss": 0.3279,
"step": 1930
},
{
"epoch": 0.2897738942101234,
"grad_norm": 1.7822861671447754,
"learning_rate": 8.944775279739996e-06,
"loss": 0.3192,
"step": 1940
},
{
"epoch": 0.2912675740771859,
"grad_norm": 1.5257068872451782,
"learning_rate": 8.928702187038665e-06,
"loss": 0.3359,
"step": 1950
},
{
"epoch": 0.2927612539442484,
"grad_norm": 1.5627810955047607,
"learning_rate": 8.91252227896282e-06,
"loss": 0.3255,
"step": 1960
},
{
"epoch": 0.2942549338113109,
"grad_norm": 1.1691981554031372,
"learning_rate": 8.8962359954193e-06,
"loss": 0.3398,
"step": 1970
},
{
"epoch": 0.2957486136783734,
"grad_norm": 2.4454123973846436,
"learning_rate": 8.879843779207123e-06,
"loss": 0.3137,
"step": 1980
},
{
"epoch": 0.29724229354543585,
"grad_norm": 1.4002143144607544,
"learning_rate": 8.863346076005452e-06,
"loss": 0.3262,
"step": 1990
},
{
"epoch": 0.29873597341249836,
"grad_norm": 1.3549312353134155,
"learning_rate": 8.846743334361486e-06,
"loss": 0.3352,
"step": 2000
},
{
"epoch": 0.29873597341249836,
"eval_loss": 0.32243964076042175,
"eval_runtime": 76.2222,
"eval_samples_per_second": 7.098,
"eval_steps_per_second": 3.555,
"step": 2000
},
{
"epoch": 0.30022965327956086,
"grad_norm": 1.2843849658966064,
"learning_rate": 8.830036005678253e-06,
"loss": 0.3178,
"step": 2010
},
{
"epoch": 0.30172333314662336,
"grad_norm": 1.5276010036468506,
"learning_rate": 8.81322454420234e-06,
"loss": 0.337,
"step": 2020
},
{
"epoch": 0.30321701301368587,
"grad_norm": 1.4595482349395752,
"learning_rate": 8.796309407011553e-06,
"loss": 0.3196,
"step": 2030
},
{
"epoch": 0.3047106928807483,
"grad_norm": 1.7560086250305176,
"learning_rate": 8.779291054002468e-06,
"loss": 0.3407,
"step": 2040
},
{
"epoch": 0.3062043727478108,
"grad_norm": 1.4491099119186401,
"learning_rate": 8.762169947877951e-06,
"loss": 0.3225,
"step": 2050
},
{
"epoch": 0.3076980526148733,
"grad_norm": 1.2083287239074707,
"learning_rate": 8.74494655413457e-06,
"loss": 0.3135,
"step": 2060
},
{
"epoch": 0.3091917324819358,
"grad_norm": 1.734601616859436,
"learning_rate": 8.727621341049924e-06,
"loss": 0.3435,
"step": 2070
},
{
"epoch": 0.3106854123489983,
"grad_norm": 1.7759486436843872,
"learning_rate": 8.710194779669932e-06,
"loss": 0.3192,
"step": 2080
},
{
"epoch": 0.31217909221606077,
"grad_norm": 1.632818579673767,
"learning_rate": 8.692667343796013e-06,
"loss": 0.334,
"step": 2090
},
{
"epoch": 0.3136727720831233,
"grad_norm": 1.8493646383285522,
"learning_rate": 8.675039509972216e-06,
"loss": 0.3345,
"step": 2100
},
{
"epoch": 0.3151664519501858,
"grad_norm": 2.082334280014038,
"learning_rate": 8.657311757472247e-06,
"loss": 0.3551,
"step": 2110
},
{
"epoch": 0.3166601318172483,
"grad_norm": 2.2276527881622314,
"learning_rate": 8.639484568286451e-06,
"loss": 0.3335,
"step": 2120
},
{
"epoch": 0.3181538116843108,
"grad_norm": 1.907583475112915,
"learning_rate": 8.621558427108705e-06,
"loss": 0.3219,
"step": 2130
},
{
"epoch": 0.31964749155137323,
"grad_norm": 1.540067434310913,
"learning_rate": 8.603533821323238e-06,
"loss": 0.322,
"step": 2140
},
{
"epoch": 0.32114117141843573,
"grad_norm": 1.2835497856140137,
"learning_rate": 8.585411240991378e-06,
"loss": 0.3143,
"step": 2150
},
{
"epoch": 0.32263485128549824,
"grad_norm": 1.189209222793579,
"learning_rate": 8.56719117883823e-06,
"loss": 0.3333,
"step": 2160
},
{
"epoch": 0.32412853115256074,
"grad_norm": 1.5749543905258179,
"learning_rate": 8.548874130239286e-06,
"loss": 0.3257,
"step": 2170
},
{
"epoch": 0.32562221101962324,
"grad_norm": 2.1032919883728027,
"learning_rate": 8.530460593206942e-06,
"loss": 0.3155,
"step": 2180
},
{
"epoch": 0.3271158908866857,
"grad_norm": 1.9780592918395996,
"learning_rate": 8.511951068376975e-06,
"loss": 0.3199,
"step": 2190
},
{
"epoch": 0.3286095707537482,
"grad_norm": 1.8979151248931885,
"learning_rate": 8.493346058994916e-06,
"loss": 0.3323,
"step": 2200
},
{
"epoch": 0.3301032506208107,
"grad_norm": 1.7598830461502075,
"learning_rate": 8.474646070902376e-06,
"loss": 0.3202,
"step": 2210
},
{
"epoch": 0.3315969304878732,
"grad_norm": 1.7886403799057007,
"learning_rate": 8.455851612523291e-06,
"loss": 0.3319,
"step": 2220
},
{
"epoch": 0.3330906103549357,
"grad_norm": 1.9333144426345825,
"learning_rate": 8.4369631948501e-06,
"loss": 0.3377,
"step": 2230
},
{
"epoch": 0.33458429022199815,
"grad_norm": 1.5406423807144165,
"learning_rate": 8.417981331429855e-06,
"loss": 0.3359,
"step": 2240
},
{
"epoch": 0.33607797008906065,
"grad_norm": 1.1198780536651611,
"learning_rate": 8.39890653835024e-06,
"loss": 0.3423,
"step": 2250
},
{
"epoch": 0.33757164995612315,
"grad_norm": 1.867664098739624,
"learning_rate": 8.379739334225571e-06,
"loss": 0.3274,
"step": 2260
},
{
"epoch": 0.33906532982318566,
"grad_norm": 1.5488725900650024,
"learning_rate": 8.360480240182666e-06,
"loss": 0.3366,
"step": 2270
},
{
"epoch": 0.34055900969024816,
"grad_norm": 1.5203229188919067,
"learning_rate": 8.341129779846695e-06,
"loss": 0.3229,
"step": 2280
},
{
"epoch": 0.3420526895573106,
"grad_norm": 1.774835228919983,
"learning_rate": 8.321688479326935e-06,
"loss": 0.3307,
"step": 2290
},
{
"epoch": 0.3435463694243731,
"grad_norm": 1.333151936531067,
"learning_rate": 8.302156867202468e-06,
"loss": 0.3216,
"step": 2300
},
{
"epoch": 0.3450400492914356,
"grad_norm": 1.3206020593643188,
"learning_rate": 8.28253547450781e-06,
"loss": 0.3125,
"step": 2310
},
{
"epoch": 0.3465337291584981,
"grad_norm": 1.8065084218978882,
"learning_rate": 8.262824834718471e-06,
"loss": 0.3201,
"step": 2320
},
{
"epoch": 0.3480274090255606,
"grad_norm": 2.162179708480835,
"learning_rate": 8.243025483736458e-06,
"loss": 0.3156,
"step": 2330
},
{
"epoch": 0.34952108889262307,
"grad_norm": 1.118371844291687,
"learning_rate": 8.22313795987569e-06,
"loss": 0.3433,
"step": 2340
},
{
"epoch": 0.35101476875968557,
"grad_norm": 1.838300347328186,
"learning_rate": 8.20316280384738e-06,
"loss": 0.3154,
"step": 2350
},
{
"epoch": 0.35250844862674807,
"grad_norm": 1.6531926393508911,
"learning_rate": 8.183100558745317e-06,
"loss": 0.3072,
"step": 2360
},
{
"epoch": 0.3540021284938106,
"grad_norm": 2.1075356006622314,
"learning_rate": 8.162951770031116e-06,
"loss": 0.3291,
"step": 2370
},
{
"epoch": 0.3554958083608731,
"grad_norm": 1.7505310773849487,
"learning_rate": 8.142716985519373e-06,
"loss": 0.3222,
"step": 2380
},
{
"epoch": 0.3569894882279355,
"grad_norm": 1.5103789567947388,
"learning_rate": 8.122396755362782e-06,
"loss": 0.3086,
"step": 2390
},
{
"epoch": 0.35848316809499803,
"grad_norm": 1.8631788492202759,
"learning_rate": 8.10199163203717e-06,
"loss": 0.3312,
"step": 2400
},
{
"epoch": 0.35997684796206053,
"grad_norm": 1.6605143547058105,
"learning_rate": 8.081502170326478e-06,
"loss": 0.3228,
"step": 2410
},
{
"epoch": 0.36147052782912303,
"grad_norm": 1.1152336597442627,
"learning_rate": 8.060928927307687e-06,
"loss": 0.3307,
"step": 2420
},
{
"epoch": 0.36296420769618554,
"grad_norm": 1.3379615545272827,
"learning_rate": 8.040272462335648e-06,
"loss": 0.323,
"step": 2430
},
{
"epoch": 0.364457887563248,
"grad_norm": 2.2633602619171143,
"learning_rate": 8.019533337027903e-06,
"loss": 0.3195,
"step": 2440
},
{
"epoch": 0.3659515674303105,
"grad_norm": 1.8531728982925415,
"learning_rate": 7.998712115249391e-06,
"loss": 0.3531,
"step": 2450
},
{
"epoch": 0.367445247297373,
"grad_norm": 1.6278972625732422,
"learning_rate": 7.977809363097135e-06,
"loss": 0.3373,
"step": 2460
},
{
"epoch": 0.3689389271644355,
"grad_norm": 1.7813271284103394,
"learning_rate": 7.956825648884842e-06,
"loss": 0.3506,
"step": 2470
},
{
"epoch": 0.370432607031498,
"grad_norm": 2.0010931491851807,
"learning_rate": 7.935761543127449e-06,
"loss": 0.3166,
"step": 2480
},
{
"epoch": 0.37192628689856044,
"grad_norm": 2.6339111328125,
"learning_rate": 7.91461761852562e-06,
"loss": 0.32,
"step": 2490
},
{
"epoch": 0.37341996676562295,
"grad_norm": 1.8536508083343506,
"learning_rate": 7.893394449950166e-06,
"loss": 0.3027,
"step": 2500
},
{
"epoch": 0.37341996676562295,
"eval_loss": 0.31971076130867004,
"eval_runtime": 76.1507,
"eval_samples_per_second": 7.104,
"eval_steps_per_second": 3.559,
"step": 2500
},
{
"epoch": 0.37491364663268545,
"grad_norm": 1.504650592803955,
"learning_rate": 7.87209261442643e-06,
"loss": 0.3075,
"step": 2510
},
{
"epoch": 0.37640732649974795,
"grad_norm": 1.0728139877319336,
"learning_rate": 7.850712691118577e-06,
"loss": 0.3329,
"step": 2520
},
{
"epoch": 0.37790100636681045,
"grad_norm": 1.5715535879135132,
"learning_rate": 7.829255261313862e-06,
"loss": 0.3105,
"step": 2530
},
{
"epoch": 0.3793946862338729,
"grad_norm": 0.8371075987815857,
"learning_rate": 7.807720908406826e-06,
"loss": 0.3318,
"step": 2540
},
{
"epoch": 0.3808883661009354,
"grad_norm": 2.6301848888397217,
"learning_rate": 7.786110217883429e-06,
"loss": 0.3471,
"step": 2550
},
{
"epoch": 0.3823820459679979,
"grad_norm": 1.0217111110687256,
"learning_rate": 7.764423777305132e-06,
"loss": 0.2987,
"step": 2560
},
{
"epoch": 0.3838757258350604,
"grad_norm": 1.5058764219284058,
"learning_rate": 7.742662176292926e-06,
"loss": 0.301,
"step": 2570
},
{
"epoch": 0.3853694057021229,
"grad_norm": 1.2323505878448486,
"learning_rate": 7.720826006511297e-06,
"loss": 0.3135,
"step": 2580
},
{
"epoch": 0.38686308556918536,
"grad_norm": 1.6528573036193848,
"learning_rate": 7.698915861652139e-06,
"loss": 0.3357,
"step": 2590
},
{
"epoch": 0.38835676543624786,
"grad_norm": 1.556429386138916,
"learning_rate": 7.676932337418624e-06,
"loss": 0.3063,
"step": 2600
},
{
"epoch": 0.38985044530331037,
"grad_norm": 1.9085198640823364,
"learning_rate": 7.654876031508981e-06,
"loss": 0.3214,
"step": 2610
},
{
"epoch": 0.39134412517037287,
"grad_norm": 1.279447078704834,
"learning_rate": 7.63274754360028e-06,
"loss": 0.3206,
"step": 2620
},
{
"epoch": 0.3928378050374354,
"grad_norm": 2.345536231994629,
"learning_rate": 7.610547475332089e-06,
"loss": 0.3254,
"step": 2630
},
{
"epoch": 0.3943314849044978,
"grad_norm": 0.9263664484024048,
"learning_rate": 7.588276430290151e-06,
"loss": 0.3234,
"step": 2640
},
{
"epoch": 0.3958251647715603,
"grad_norm": 1.5908204317092896,
"learning_rate": 7.56593501398995e-06,
"loss": 0.3246,
"step": 2650
},
{
"epoch": 0.3973188446386228,
"grad_norm": 1.5689475536346436,
"learning_rate": 7.5435238338602604e-06,
"loss": 0.3183,
"step": 2660
},
{
"epoch": 0.39881252450568533,
"grad_norm": 0.8952176570892334,
"learning_rate": 7.521043499226625e-06,
"loss": 0.3019,
"step": 2670
},
{
"epoch": 0.40030620437274783,
"grad_norm": 1.4977798461914062,
"learning_rate": 7.498494621294796e-06,
"loss": 0.347,
"step": 2680
},
{
"epoch": 0.4017998842398103,
"grad_norm": 1.0641767978668213,
"learning_rate": 7.475877813134106e-06,
"loss": 0.341,
"step": 2690
},
{
"epoch": 0.4032935641068728,
"grad_norm": 1.3907352685928345,
"learning_rate": 7.453193689660811e-06,
"loss": 0.3206,
"step": 2700
},
{
"epoch": 0.4047872439739353,
"grad_norm": 1.4206258058547974,
"learning_rate": 7.430442867621365e-06,
"loss": 0.3058,
"step": 2710
},
{
"epoch": 0.4062809238409978,
"grad_norm": 1.0893877744674683,
"learning_rate": 7.407625965575656e-06,
"loss": 0.306,
"step": 2720
},
{
"epoch": 0.4077746037080603,
"grad_norm": 1.5306363105773926,
"learning_rate": 7.384743603880181e-06,
"loss": 0.3395,
"step": 2730
},
{
"epoch": 0.40926828357512274,
"grad_norm": 1.5694290399551392,
"learning_rate": 7.361796404671187e-06,
"loss": 0.3044,
"step": 2740
},
{
"epoch": 0.41076196344218524,
"grad_norm": 1.862500786781311,
"learning_rate": 7.338784991847755e-06,
"loss": 0.3307,
"step": 2750
},
{
"epoch": 0.41225564330924774,
"grad_norm": 1.3926466703414917,
"learning_rate": 7.315709991054832e-06,
"loss": 0.3052,
"step": 2760
},
{
"epoch": 0.41374932317631025,
"grad_norm": 1.6417464017868042,
"learning_rate": 7.292572029666228e-06,
"loss": 0.3108,
"step": 2770
},
{
"epoch": 0.41524300304337275,
"grad_norm": 2.3295059204101562,
"learning_rate": 7.269371736767552e-06,
"loss": 0.3299,
"step": 2780
},
{
"epoch": 0.4167366829104352,
"grad_norm": 1.707053303718567,
"learning_rate": 7.246109743139111e-06,
"loss": 0.3129,
"step": 2790
},
{
"epoch": 0.4182303627774977,
"grad_norm": 1.233490228652954,
"learning_rate": 7.222786681238762e-06,
"loss": 0.3234,
"step": 2800
},
{
"epoch": 0.4197240426445602,
"grad_norm": 0.8047583699226379,
"learning_rate": 7.1994031851847125e-06,
"loss": 0.3038,
"step": 2810
},
{
"epoch": 0.4212177225116227,
"grad_norm": 1.466469168663025,
"learning_rate": 7.175959890738282e-06,
"loss": 0.3382,
"step": 2820
},
{
"epoch": 0.4227114023786852,
"grad_norm": 1.0184977054595947,
"learning_rate": 7.152457435286619e-06,
"loss": 0.3143,
"step": 2830
},
{
"epoch": 0.42420508224574766,
"grad_norm": 1.102300763130188,
"learning_rate": 7.128896457825364e-06,
"loss": 0.3228,
"step": 2840
},
{
"epoch": 0.42569876211281016,
"grad_norm": 1.8604798316955566,
"learning_rate": 7.1052775989412855e-06,
"loss": 0.2981,
"step": 2850
},
{
"epoch": 0.42719244197987266,
"grad_norm": 1.1831308603286743,
"learning_rate": 7.081601500794857e-06,
"loss": 0.3297,
"step": 2860
},
{
"epoch": 0.42868612184693516,
"grad_norm": 1.7088931798934937,
"learning_rate": 7.057868807102799e-06,
"loss": 0.3101,
"step": 2870
},
{
"epoch": 0.43017980171399767,
"grad_norm": 1.3123115301132202,
"learning_rate": 7.034080163120579e-06,
"loss": 0.3258,
"step": 2880
},
{
"epoch": 0.4316734815810601,
"grad_norm": 1.3527169227600098,
"learning_rate": 7.010236215624867e-06,
"loss": 0.3029,
"step": 2890
},
{
"epoch": 0.4331671614481226,
"grad_norm": 1.361512541770935,
"learning_rate": 6.986337612895949e-06,
"loss": 0.3392,
"step": 2900
},
{
"epoch": 0.4346608413151851,
"grad_norm": 1.4390591382980347,
"learning_rate": 6.962385004700105e-06,
"loss": 0.3351,
"step": 2910
},
{
"epoch": 0.4361545211822476,
"grad_norm": 1.67287278175354,
"learning_rate": 6.938379042271939e-06,
"loss": 0.3255,
"step": 2920
},
{
"epoch": 0.4376482010493101,
"grad_norm": 1.2548269033432007,
"learning_rate": 6.914320378296674e-06,
"loss": 0.3262,
"step": 2930
},
{
"epoch": 0.4391418809163726,
"grad_norm": 1.2193247079849243,
"learning_rate": 6.89020966689241e-06,
"loss": 0.3412,
"step": 2940
},
{
"epoch": 0.4406355607834351,
"grad_norm": 1.1901212930679321,
"learning_rate": 6.866047563592334e-06,
"loss": 0.3002,
"step": 2950
},
{
"epoch": 0.4421292406504976,
"grad_norm": 1.65078866481781,
"learning_rate": 6.841834725326899e-06,
"loss": 0.3172,
"step": 2960
},
{
"epoch": 0.4436229205175601,
"grad_norm": 1.3838766813278198,
"learning_rate": 6.817571810405967e-06,
"loss": 0.3215,
"step": 2970
},
{
"epoch": 0.4451166003846226,
"grad_norm": 1.2286713123321533,
"learning_rate": 6.793259478500907e-06,
"loss": 0.3208,
"step": 2980
},
{
"epoch": 0.44661028025168503,
"grad_norm": 0.9910550713539124,
"learning_rate": 6.7688983906266544e-06,
"loss": 0.3293,
"step": 2990
},
{
"epoch": 0.44810396011874754,
"grad_norm": 1.6711299419403076,
"learning_rate": 6.74448920912375e-06,
"loss": 0.3272,
"step": 3000
},
{
"epoch": 0.44810396011874754,
"eval_loss": 0.31658411026000977,
"eval_runtime": 76.2262,
"eval_samples_per_second": 7.097,
"eval_steps_per_second": 3.555,
"step": 3000
},
{
"epoch": 0.44959763998581004,
"grad_norm": 1.8898260593414307,
"learning_rate": 6.720032597640326e-06,
"loss": 0.332,
"step": 3010
},
{
"epoch": 0.45109131985287254,
"grad_norm": 1.8445961475372314,
"learning_rate": 6.695529221114059e-06,
"loss": 0.3165,
"step": 3020
},
{
"epoch": 0.45258499971993504,
"grad_norm": 1.3706282377243042,
"learning_rate": 6.670979745754101e-06,
"loss": 0.3165,
"step": 3030
},
{
"epoch": 0.4540786795869975,
"grad_norm": 1.7057021856307983,
"learning_rate": 6.646384839022955e-06,
"loss": 0.3045,
"step": 3040
},
{
"epoch": 0.45557235945406,
"grad_norm": 1.5170303583145142,
"learning_rate": 6.621745169618337e-06,
"loss": 0.3061,
"step": 3050
},
{
"epoch": 0.4570660393211225,
"grad_norm": 2.1427805423736572,
"learning_rate": 6.597061407454987e-06,
"loss": 0.31,
"step": 3060
},
{
"epoch": 0.458559719188185,
"grad_norm": 1.1289193630218506,
"learning_rate": 6.572334223646468e-06,
"loss": 0.3388,
"step": 3070
},
{
"epoch": 0.4600533990552475,
"grad_norm": 1.3998080492019653,
"learning_rate": 6.5475642904869004e-06,
"loss": 0.3296,
"step": 3080
},
{
"epoch": 0.46154707892230995,
"grad_norm": 1.4870209693908691,
"learning_rate": 6.5227522814327e-06,
"loss": 0.3441,
"step": 3090
},
{
"epoch": 0.46304075878937245,
"grad_norm": 1.3324146270751953,
"learning_rate": 6.4978988710842585e-06,
"loss": 0.3072,
"step": 3100
},
{
"epoch": 0.46453443865643496,
"grad_norm": 2.6711628437042236,
"learning_rate": 6.473004735167605e-06,
"loss": 0.3199,
"step": 3110
},
{
"epoch": 0.46602811852349746,
"grad_norm": 1.2170815467834473,
"learning_rate": 6.44807055051604e-06,
"loss": 0.3184,
"step": 3120
},
{
"epoch": 0.46752179839055996,
"grad_norm": 1.2690013647079468,
"learning_rate": 6.423096995051722e-06,
"loss": 0.3292,
"step": 3130
},
{
"epoch": 0.4690154782576224,
"grad_norm": 1.5772716999053955,
"learning_rate": 6.398084747767241e-06,
"loss": 0.3219,
"step": 3140
},
{
"epoch": 0.4705091581246849,
"grad_norm": 1.8444935083389282,
"learning_rate": 6.373034488707159e-06,
"loss": 0.3282,
"step": 3150
},
{
"epoch": 0.4720028379917474,
"grad_norm": 1.8097927570343018,
"learning_rate": 6.347946898949524e-06,
"loss": 0.3426,
"step": 3160
},
{
"epoch": 0.4734965178588099,
"grad_norm": 1.232932209968567,
"learning_rate": 6.322822660587343e-06,
"loss": 0.3195,
"step": 3170
},
{
"epoch": 0.4749901977258724,
"grad_norm": 1.4135682582855225,
"learning_rate": 6.297662456710043e-06,
"loss": 0.3125,
"step": 3180
},
{
"epoch": 0.47648387759293487,
"grad_norm": 1.2826404571533203,
"learning_rate": 6.272466971384902e-06,
"loss": 0.3418,
"step": 3190
},
{
"epoch": 0.47797755745999737,
"grad_norm": 1.1015794277191162,
"learning_rate": 6.24723688963844e-06,
"loss": 0.3114,
"step": 3200
},
{
"epoch": 0.4794712373270599,
"grad_norm": 1.6762737035751343,
"learning_rate": 6.221972897437804e-06,
"loss": 0.3315,
"step": 3210
},
{
"epoch": 0.4809649171941224,
"grad_norm": 1.5286458730697632,
"learning_rate": 6.1966756816721195e-06,
"loss": 0.3081,
"step": 3220
},
{
"epoch": 0.4824585970611849,
"grad_norm": 2.174837827682495,
"learning_rate": 6.171345930133798e-06,
"loss": 0.3251,
"step": 3230
},
{
"epoch": 0.4839522769282473,
"grad_norm": 1.966800570487976,
"learning_rate": 6.145984331499859e-06,
"loss": 0.33,
"step": 3240
},
{
"epoch": 0.48544595679530983,
"grad_norm": 1.1323667764663696,
"learning_rate": 6.120591575313189e-06,
"loss": 0.322,
"step": 3250
},
{
"epoch": 0.48693963666237233,
"grad_norm": 1.1445270776748657,
"learning_rate": 6.095168351963805e-06,
"loss": 0.3066,
"step": 3260
},
{
"epoch": 0.48843331652943484,
"grad_norm": 0.9923702478408813,
"learning_rate": 6.069715352670076e-06,
"loss": 0.3006,
"step": 3270
},
{
"epoch": 0.48992699639649734,
"grad_norm": 1.8956522941589355,
"learning_rate": 6.044233269459935e-06,
"loss": 0.3309,
"step": 3280
},
{
"epoch": 0.4914206762635598,
"grad_norm": 1.9034560918807983,
"learning_rate": 6.018722795152062e-06,
"loss": 0.3168,
"step": 3290
},
{
"epoch": 0.4929143561306223,
"grad_norm": 1.3808101415634155,
"learning_rate": 5.993184623337045e-06,
"loss": 0.3148,
"step": 3300
},
{
"epoch": 0.4944080359976848,
"grad_norm": 1.3605296611785889,
"learning_rate": 5.967619448358529e-06,
"loss": 0.3128,
"step": 3310
},
{
"epoch": 0.4959017158647473,
"grad_norm": 1.7083598375320435,
"learning_rate": 5.942027965294329e-06,
"loss": 0.3224,
"step": 3320
},
{
"epoch": 0.4973953957318098,
"grad_norm": 2.0568454265594482,
"learning_rate": 5.916410869937541e-06,
"loss": 0.3199,
"step": 3330
},
{
"epoch": 0.49888907559887224,
"grad_norm": 1.6961411237716675,
"learning_rate": 5.890768858777613e-06,
"loss": 0.3356,
"step": 3340
},
{
"epoch": 0.5003827554659348,
"grad_norm": 1.5090399980545044,
"learning_rate": 5.865102628981424e-06,
"loss": 0.3014,
"step": 3350
},
{
"epoch": 0.5018764353329973,
"grad_norm": 1.3667364120483398,
"learning_rate": 5.839412878374313e-06,
"loss": 0.3386,
"step": 3360
},
{
"epoch": 0.5033701152000597,
"grad_norm": 1.5758767127990723,
"learning_rate": 5.813700305421119e-06,
"loss": 0.2939,
"step": 3370
},
{
"epoch": 0.5048637950671222,
"grad_norm": 1.0520446300506592,
"learning_rate": 5.787965609207184e-06,
"loss": 0.2978,
"step": 3380
},
{
"epoch": 0.5063574749341847,
"grad_norm": 1.4224300384521484,
"learning_rate": 5.762209489419343e-06,
"loss": 0.3168,
"step": 3390
},
{
"epoch": 0.5078511548012472,
"grad_norm": 1.1233537197113037,
"learning_rate": 5.736432646326911e-06,
"loss": 0.3219,
"step": 3400
},
{
"epoch": 0.5093448346683097,
"grad_norm": 1.480785608291626,
"learning_rate": 5.710635780762639e-06,
"loss": 0.3227,
"step": 3410
},
{
"epoch": 0.5108385145353722,
"grad_norm": 1.2440319061279297,
"learning_rate": 5.68481959410365e-06,
"loss": 0.3391,
"step": 3420
},
{
"epoch": 0.5123321944024347,
"grad_norm": 1.2866686582565308,
"learning_rate": 5.658984788252384e-06,
"loss": 0.2983,
"step": 3430
},
{
"epoch": 0.5138258742694972,
"grad_norm": 1.2832037210464478,
"learning_rate": 5.633132065617509e-06,
"loss": 0.3066,
"step": 3440
},
{
"epoch": 0.5153195541365597,
"grad_norm": 1.5093879699707031,
"learning_rate": 5.607262129094819e-06,
"loss": 0.3198,
"step": 3450
},
{
"epoch": 0.5168132340036222,
"grad_norm": 1.5857967138290405,
"learning_rate": 5.581375682048131e-06,
"loss": 0.3187,
"step": 3460
},
{
"epoch": 0.5183069138706846,
"grad_norm": 1.2231806516647339,
"learning_rate": 5.555473428290154e-06,
"loss": 0.3029,
"step": 3470
},
{
"epoch": 0.5198005937377471,
"grad_norm": 1.2822185754776,
"learning_rate": 5.5295560720633575e-06,
"loss": 0.3046,
"step": 3480
},
{
"epoch": 0.5212942736048096,
"grad_norm": 1.2995489835739136,
"learning_rate": 5.503624318020829e-06,
"loss": 0.3295,
"step": 3490
},
{
"epoch": 0.5227879534718721,
"grad_norm": 1.9352302551269531,
"learning_rate": 5.477678871207105e-06,
"loss": 0.3216,
"step": 3500
},
{
"epoch": 0.5227879534718721,
"eval_loss": 0.31384068727493286,
"eval_runtime": 76.2107,
"eval_samples_per_second": 7.099,
"eval_steps_per_second": 3.556,
"step": 3500
},
{
"epoch": 0.5242816333389346,
"grad_norm": 1.2406340837478638,
"learning_rate": 5.4517204370390086e-06,
"loss": 0.3009,
"step": 3510
},
{
"epoch": 0.5257753132059971,
"grad_norm": 1.5014777183532715,
"learning_rate": 5.425749721286471e-06,
"loss": 0.3138,
"step": 3520
},
{
"epoch": 0.5272689930730596,
"grad_norm": 2.0441832542419434,
"learning_rate": 5.399767430053338e-06,
"loss": 0.3317,
"step": 3530
},
{
"epoch": 0.5287626729401221,
"grad_norm": 1.4949225187301636,
"learning_rate": 5.373774269758178e-06,
"loss": 0.3156,
"step": 3540
},
{
"epoch": 0.5302563528071846,
"grad_norm": 1.5877892971038818,
"learning_rate": 5.3477709471150716e-06,
"loss": 0.2948,
"step": 3550
},
{
"epoch": 0.5317500326742471,
"grad_norm": 1.809065580368042,
"learning_rate": 5.321758169114396e-06,
"loss": 0.3177,
"step": 3560
},
{
"epoch": 0.5332437125413095,
"grad_norm": 1.4500436782836914,
"learning_rate": 5.295736643003605e-06,
"loss": 0.2974,
"step": 3570
},
{
"epoch": 0.534737392408372,
"grad_norm": 1.2693545818328857,
"learning_rate": 5.269707076268005e-06,
"loss": 0.2848,
"step": 3580
},
{
"epoch": 0.5362310722754345,
"grad_norm": 1.21388578414917,
"learning_rate": 5.243670176611509e-06,
"loss": 0.3199,
"step": 3590
},
{
"epoch": 0.537724752142497,
"grad_norm": 1.2438586950302124,
"learning_rate": 5.217626651937404e-06,
"loss": 0.3064,
"step": 3600
},
{
"epoch": 0.5392184320095595,
"grad_norm": 2.074819326400757,
"learning_rate": 5.1915772103291e-06,
"loss": 0.3081,
"step": 3610
},
{
"epoch": 0.540712111876622,
"grad_norm": 1.8442296981811523,
"learning_rate": 5.1655225600308765e-06,
"loss": 0.3303,
"step": 3620
},
{
"epoch": 0.5422057917436846,
"grad_norm": 1.3743780851364136,
"learning_rate": 5.139463409428635e-06,
"loss": 0.3368,
"step": 3630
},
{
"epoch": 0.543699471610747,
"grad_norm": 1.440290927886963,
"learning_rate": 5.113400467030632e-06,
"loss": 0.3332,
"step": 3640
},
{
"epoch": 0.5451931514778096,
"grad_norm": 1.7156881093978882,
"learning_rate": 5.087334441448213e-06,
"loss": 0.3164,
"step": 3650
},
{
"epoch": 0.5466868313448721,
"grad_norm": 0.9851483106613159,
"learning_rate": 5.061266041376553e-06,
"loss": 0.3407,
"step": 3660
},
{
"epoch": 0.5481805112119345,
"grad_norm": 1.2339802980422974,
"learning_rate": 5.035195975575387e-06,
"loss": 0.3115,
"step": 3670
},
{
"epoch": 0.549674191078997,
"grad_norm": 1.1633639335632324,
"learning_rate": 5.0091249528497374e-06,
"loss": 0.3215,
"step": 3680
},
{
"epoch": 0.5511678709460595,
"grad_norm": 1.794061303138733,
"learning_rate": 4.983053682030642e-06,
"loss": 0.3222,
"step": 3690
},
{
"epoch": 0.552661550813122,
"grad_norm": 1.7158312797546387,
"learning_rate": 4.95698287195589e-06,
"loss": 0.3021,
"step": 3700
},
{
"epoch": 0.5541552306801845,
"grad_norm": 1.7610148191452026,
"learning_rate": 4.930913231450737e-06,
"loss": 0.2871,
"step": 3710
},
{
"epoch": 0.555648910547247,
"grad_norm": 1.8225277662277222,
"learning_rate": 4.904845469308642e-06,
"loss": 0.2988,
"step": 3720
},
{
"epoch": 0.5571425904143095,
"grad_norm": 1.7422287464141846,
"learning_rate": 4.8787802942719955e-06,
"loss": 0.3258,
"step": 3730
},
{
"epoch": 0.558636270281372,
"grad_norm": 1.324690818786621,
"learning_rate": 4.8527184150128475e-06,
"loss": 0.3182,
"step": 3740
},
{
"epoch": 0.5601299501484345,
"grad_norm": 1.0865528583526611,
"learning_rate": 4.82666054011364e-06,
"loss": 0.309,
"step": 3750
},
{
"epoch": 0.561623630015497,
"grad_norm": 1.5340676307678223,
"learning_rate": 4.800607378047944e-06,
"loss": 0.3356,
"step": 3760
},
{
"epoch": 0.5631173098825594,
"grad_norm": 1.476318359375,
"learning_rate": 4.774559637161197e-06,
"loss": 0.31,
"step": 3770
},
{
"epoch": 0.5646109897496219,
"grad_norm": 1.3898128271102905,
"learning_rate": 4.74851802565144e-06,
"loss": 0.3202,
"step": 3780
},
{
"epoch": 0.5661046696166844,
"grad_norm": 1.4143530130386353,
"learning_rate": 4.722483251550067e-06,
"loss": 0.3445,
"step": 3790
},
{
"epoch": 0.5675983494837469,
"grad_norm": 0.8102360963821411,
"learning_rate": 4.696456022702574e-06,
"loss": 0.3087,
"step": 3800
},
{
"epoch": 0.5690920293508094,
"grad_norm": 1.0995668172836304,
"learning_rate": 4.670437046749312e-06,
"loss": 0.3077,
"step": 3810
},
{
"epoch": 0.5705857092178719,
"grad_norm": 1.710694432258606,
"learning_rate": 4.6444270311062496e-06,
"loss": 0.3123,
"step": 3820
},
{
"epoch": 0.5720793890849344,
"grad_norm": 1.50558602809906,
"learning_rate": 4.618426682945736e-06,
"loss": 0.3142,
"step": 3830
},
{
"epoch": 0.5735730689519969,
"grad_norm": 1.3168991804122925,
"learning_rate": 4.59243670917728e-06,
"loss": 0.3349,
"step": 3840
},
{
"epoch": 0.5750667488190594,
"grad_norm": 1.0681779384613037,
"learning_rate": 4.566457816428326e-06,
"loss": 0.3153,
"step": 3850
},
{
"epoch": 0.5765604286861219,
"grad_norm": 1.5274810791015625,
"learning_rate": 4.5404907110250364e-06,
"loss": 0.3263,
"step": 3860
},
{
"epoch": 0.5780541085531843,
"grad_norm": 1.5444824695587158,
"learning_rate": 4.514536098973105e-06,
"loss": 0.306,
"step": 3870
},
{
"epoch": 0.5795477884202468,
"grad_norm": 1.126636028289795,
"learning_rate": 4.488594685938541e-06,
"loss": 0.3122,
"step": 3880
},
{
"epoch": 0.5810414682873093,
"grad_norm": 1.2185169458389282,
"learning_rate": 4.462667177228496e-06,
"loss": 0.2975,
"step": 3890
},
{
"epoch": 0.5825351481543718,
"grad_norm": 1.721125602722168,
"learning_rate": 4.4367542777720854e-06,
"loss": 0.3174,
"step": 3900
},
{
"epoch": 0.5840288280214343,
"grad_norm": 1.476317048072815,
"learning_rate": 4.410856692101219e-06,
"loss": 0.3093,
"step": 3910
},
{
"epoch": 0.5855225078884968,
"grad_norm": 1.5350698232650757,
"learning_rate": 4.384975124331451e-06,
"loss": 0.3243,
"step": 3920
},
{
"epoch": 0.5870161877555593,
"grad_norm": 1.8953022956848145,
"learning_rate": 4.35911027814283e-06,
"loss": 0.319,
"step": 3930
},
{
"epoch": 0.5885098676226218,
"grad_norm": 1.768258810043335,
"learning_rate": 4.333262856760774e-06,
"loss": 0.3073,
"step": 3940
},
{
"epoch": 0.5900035474896843,
"grad_norm": 0.974807858467102,
"learning_rate": 4.3074335629369455e-06,
"loss": 0.3208,
"step": 3950
},
{
"epoch": 0.5914972273567468,
"grad_norm": 1.3250782489776611,
"learning_rate": 4.281623098930148e-06,
"loss": 0.2884,
"step": 3960
},
{
"epoch": 0.5929909072238092,
"grad_norm": 1.5974177122116089,
"learning_rate": 4.25583216648723e-06,
"loss": 0.2861,
"step": 3970
},
{
"epoch": 0.5944845870908717,
"grad_norm": 1.2887296676635742,
"learning_rate": 4.2300614668240065e-06,
"loss": 0.3445,
"step": 3980
},
{
"epoch": 0.5959782669579342,
"grad_norm": 2.0698602199554443,
"learning_rate": 4.204311700606195e-06,
"loss": 0.3091,
"step": 3990
},
{
"epoch": 0.5974719468249967,
"grad_norm": 1.6275320053100586,
"learning_rate": 4.1785835679303635e-06,
"loss": 0.3223,
"step": 4000
},
{
"epoch": 0.5974719468249967,
"eval_loss": 0.31077098846435547,
"eval_runtime": 76.2168,
"eval_samples_per_second": 7.098,
"eval_steps_per_second": 3.556,
"step": 4000
},
{
"epoch": 0.5989656266920592,
"grad_norm": 1.140994906425476,
"learning_rate": 4.152877768304898e-06,
"loss": 0.316,
"step": 4010
},
{
"epoch": 0.6004593065591217,
"grad_norm": 1.963865041732788,
"learning_rate": 4.127195000630987e-06,
"loss": 0.3173,
"step": 4020
},
{
"epoch": 0.6019529864261842,
"grad_norm": 1.7010706663131714,
"learning_rate": 4.1015359631836085e-06,
"loss": 0.3318,
"step": 4030
},
{
"epoch": 0.6034466662932467,
"grad_norm": 1.9144036769866943,
"learning_rate": 4.0759013535925575e-06,
"loss": 0.3229,
"step": 4040
},
{
"epoch": 0.6049403461603092,
"grad_norm": 1.5420873165130615,
"learning_rate": 4.050291868823469e-06,
"loss": 0.2952,
"step": 4050
},
{
"epoch": 0.6064340260273717,
"grad_norm": 1.2293835878372192,
"learning_rate": 4.0247082051588794e-06,
"loss": 0.3273,
"step": 4060
},
{
"epoch": 0.6079277058944341,
"grad_norm": 1.203016996383667,
"learning_rate": 3.999151058179283e-06,
"loss": 0.3301,
"step": 4070
},
{
"epoch": 0.6094213857614966,
"grad_norm": 1.7640305757522583,
"learning_rate": 3.973621122744226e-06,
"loss": 0.3217,
"step": 4080
},
{
"epoch": 0.6109150656285591,
"grad_norm": 0.998776912689209,
"learning_rate": 3.9481190929734185e-06,
"loss": 0.2961,
"step": 4090
},
{
"epoch": 0.6124087454956216,
"grad_norm": 1.3041551113128662,
"learning_rate": 3.922645662227854e-06,
"loss": 0.3178,
"step": 4100
},
{
"epoch": 0.6139024253626841,
"grad_norm": 1.2349125146865845,
"learning_rate": 3.897201523090967e-06,
"loss": 0.2985,
"step": 4110
},
{
"epoch": 0.6153961052297466,
"grad_norm": 1.5378309488296509,
"learning_rate": 3.8717873673497945e-06,
"loss": 0.2987,
"step": 4120
},
{
"epoch": 0.6168897850968091,
"grad_norm": 1.2633869647979736,
"learning_rate": 3.846403885976175e-06,
"loss": 0.2989,
"step": 4130
},
{
"epoch": 0.6183834649638716,
"grad_norm": 1.7205194234848022,
"learning_rate": 3.821051769107952e-06,
"loss": 0.3105,
"step": 4140
},
{
"epoch": 0.6198771448309341,
"grad_norm": 2.0533735752105713,
"learning_rate": 3.7957317060302225e-06,
"loss": 0.3204,
"step": 4150
},
{
"epoch": 0.6213708246979966,
"grad_norm": 1.609309434890747,
"learning_rate": 3.770444385156587e-06,
"loss": 0.3107,
"step": 4160
},
{
"epoch": 0.622864504565059,
"grad_norm": 1.2998265027999878,
"learning_rate": 3.745190494010436e-06,
"loss": 0.3101,
"step": 4170
},
{
"epoch": 0.6243581844321215,
"grad_norm": 2.1723592281341553,
"learning_rate": 3.7199707192062578e-06,
"loss": 0.2887,
"step": 4180
},
{
"epoch": 0.625851864299184,
"grad_norm": 1.8159315586090088,
"learning_rate": 3.6947857464309695e-06,
"loss": 0.3088,
"step": 4190
},
{
"epoch": 0.6273455441662465,
"grad_norm": 1.644352912902832,
"learning_rate": 3.6696362604252734e-06,
"loss": 0.3128,
"step": 4200
},
{
"epoch": 0.628839224033309,
"grad_norm": 2.0270018577575684,
"learning_rate": 3.6445229449650443e-06,
"loss": 0.3324,
"step": 4210
},
{
"epoch": 0.6303329039003716,
"grad_norm": 1.4132014513015747,
"learning_rate": 3.6194464828427324e-06,
"loss": 0.3078,
"step": 4220
},
{
"epoch": 0.6318265837674341,
"grad_norm": 1.3948049545288086,
"learning_rate": 3.5944075558488e-06,
"loss": 0.315,
"step": 4230
},
{
"epoch": 0.6333202636344966,
"grad_norm": 1.699584722518921,
"learning_rate": 3.569406844753196e-06,
"loss": 0.3218,
"step": 4240
},
{
"epoch": 0.6348139435015591,
"grad_norm": 1.175880789756775,
"learning_rate": 3.544445029286829e-06,
"loss": 0.3271,
"step": 4250
},
{
"epoch": 0.6363076233686216,
"grad_norm": 1.1815729141235352,
"learning_rate": 3.5195227881230985e-06,
"loss": 0.3202,
"step": 4260
},
{
"epoch": 0.637801303235684,
"grad_norm": 1.096339464187622,
"learning_rate": 3.4946407988594394e-06,
"loss": 0.3212,
"step": 4270
},
{
"epoch": 0.6392949831027465,
"grad_norm": 2.088253974914551,
"learning_rate": 3.4697997379988983e-06,
"loss": 0.3117,
"step": 4280
},
{
"epoch": 0.640788662969809,
"grad_norm": 1.1896518468856812,
"learning_rate": 3.445000280931743e-06,
"loss": 0.3055,
"step": 4290
},
{
"epoch": 0.6422823428368715,
"grad_norm": 1.6314555406570435,
"learning_rate": 3.4202431019170964e-06,
"loss": 0.313,
"step": 4300
},
{
"epoch": 0.643776022703934,
"grad_norm": 1.4211452007293701,
"learning_rate": 3.3955288740646064e-06,
"loss": 0.2967,
"step": 4310
},
{
"epoch": 0.6452697025709965,
"grad_norm": 1.8873090744018555,
"learning_rate": 3.3708582693161473e-06,
"loss": 0.3218,
"step": 4320
},
{
"epoch": 0.646763382438059,
"grad_norm": 1.0354822874069214,
"learning_rate": 3.346231958427546e-06,
"loss": 0.3155,
"step": 4330
},
{
"epoch": 0.6482570623051215,
"grad_norm": 1.7193752527236938,
"learning_rate": 3.3216506109503478e-06,
"loss": 0.2933,
"step": 4340
},
{
"epoch": 0.649750742172184,
"grad_norm": 1.969494104385376,
"learning_rate": 3.297114895213611e-06,
"loss": 0.3086,
"step": 4350
},
{
"epoch": 0.6512444220392465,
"grad_norm": 1.3515018224716187,
"learning_rate": 3.2726254783057388e-06,
"loss": 0.3012,
"step": 4360
},
{
"epoch": 0.6527381019063089,
"grad_norm": 1.2439565658569336,
"learning_rate": 3.2481830260563393e-06,
"loss": 0.3175,
"step": 4370
},
{
"epoch": 0.6542317817733714,
"grad_norm": 2.0741679668426514,
"learning_rate": 3.2237882030181227e-06,
"loss": 0.3281,
"step": 4380
},
{
"epoch": 0.6557254616404339,
"grad_norm": 1.3941818475723267,
"learning_rate": 3.199441672448838e-06,
"loss": 0.3179,
"step": 4390
},
{
"epoch": 0.6572191415074964,
"grad_norm": 1.3664950132369995,
"learning_rate": 3.1751440962932324e-06,
"loss": 0.3252,
"step": 4400
},
{
"epoch": 0.6587128213745589,
"grad_norm": 1.3657866716384888,
"learning_rate": 3.150896135165059e-06,
"loss": 0.3274,
"step": 4410
},
{
"epoch": 0.6602065012416214,
"grad_norm": 1.4565297365188599,
"learning_rate": 3.126698448329112e-06,
"loss": 0.319,
"step": 4420
},
{
"epoch": 0.6617001811086839,
"grad_norm": 1.585686445236206,
"learning_rate": 3.1025516936833122e-06,
"loss": 0.2937,
"step": 4430
},
{
"epoch": 0.6631938609757464,
"grad_norm": 1.6105479001998901,
"learning_rate": 3.0784565277408063e-06,
"loss": 0.3247,
"step": 4440
},
{
"epoch": 0.6646875408428089,
"grad_norm": 1.0377700328826904,
"learning_rate": 3.0544136056121232e-06,
"loss": 0.3215,
"step": 4450
},
{
"epoch": 0.6661812207098714,
"grad_norm": 1.4693603515625,
"learning_rate": 3.0304235809873654e-06,
"loss": 0.3016,
"step": 4460
},
{
"epoch": 0.6676749005769338,
"grad_norm": 1.3283636569976807,
"learning_rate": 3.006487106118433e-06,
"loss": 0.3024,
"step": 4470
},
{
"epoch": 0.6691685804439963,
"grad_norm": 1.0531262159347534,
"learning_rate": 2.982604831801289e-06,
"loss": 0.3287,
"step": 4480
},
{
"epoch": 0.6706622603110588,
"grad_norm": 1.6268073320388794,
"learning_rate": 2.9587774073582677e-06,
"loss": 0.306,
"step": 4490
},
{
"epoch": 0.6721559401781213,
"grad_norm": 1.7072473764419556,
"learning_rate": 2.9350054806204214e-06,
"loss": 0.3346,
"step": 4500
},
{
"epoch": 0.6721559401781213,
"eval_loss": 0.307580828666687,
"eval_runtime": 76.4751,
"eval_samples_per_second": 7.074,
"eval_steps_per_second": 3.544,
"step": 4500
},
{
"epoch": 0.6736496200451838,
"grad_norm": 1.3722172975540161,
"learning_rate": 2.9112896979099037e-06,
"loss": 0.3213,
"step": 4510
},
{
"epoch": 0.6751432999122463,
"grad_norm": 0.8439034819602966,
"learning_rate": 2.8876307040223956e-06,
"loss": 0.3102,
"step": 4520
},
{
"epoch": 0.6766369797793088,
"grad_norm": 2.1569015979766846,
"learning_rate": 2.864029142209579e-06,
"loss": 0.3189,
"step": 4530
},
{
"epoch": 0.6781306596463713,
"grad_norm": 0.9060570597648621,
"learning_rate": 2.840485654161651e-06,
"loss": 0.2811,
"step": 4540
},
{
"epoch": 0.6796243395134338,
"grad_norm": 1.4373691082000732,
"learning_rate": 2.817000879989866e-06,
"loss": 0.3052,
"step": 4550
},
{
"epoch": 0.6811180193804963,
"grad_norm": 1.3326523303985596,
"learning_rate": 2.7935754582091413e-06,
"loss": 0.3184,
"step": 4560
},
{
"epoch": 0.6826116992475587,
"grad_norm": 1.3754558563232422,
"learning_rate": 2.770210025720691e-06,
"loss": 0.3192,
"step": 4570
},
{
"epoch": 0.6841053791146212,
"grad_norm": 2.0747873783111572,
"learning_rate": 2.746905217794715e-06,
"loss": 0.3408,
"step": 4580
},
{
"epoch": 0.6855990589816837,
"grad_norm": 1.3364531993865967,
"learning_rate": 2.7236616680531256e-06,
"loss": 0.3005,
"step": 4590
},
{
"epoch": 0.6870927388487462,
"grad_norm": 1.6091829538345337,
"learning_rate": 2.7004800084523166e-06,
"loss": 0.3288,
"step": 4600
},
{
"epoch": 0.6885864187158087,
"grad_norm": 1.1656900644302368,
"learning_rate": 2.6773608692659825e-06,
"loss": 0.2837,
"step": 4610
},
{
"epoch": 0.6900800985828712,
"grad_norm": 1.4220030307769775,
"learning_rate": 2.6543048790679915e-06,
"loss": 0.3119,
"step": 4620
},
{
"epoch": 0.6915737784499337,
"grad_norm": 1.688082218170166,
"learning_rate": 2.63131266471528e-06,
"loss": 0.3282,
"step": 4630
},
{
"epoch": 0.6930674583169962,
"grad_norm": 1.2834751605987549,
"learning_rate": 2.60838485133082e-06,
"loss": 0.3018,
"step": 4640
},
{
"epoch": 0.6945611381840587,
"grad_norm": 1.5603129863739014,
"learning_rate": 2.5855220622866197e-06,
"loss": 0.3035,
"step": 4650
},
{
"epoch": 0.6960548180511212,
"grad_norm": 1.6552413702011108,
"learning_rate": 2.562724919186777e-06,
"loss": 0.321,
"step": 4660
},
{
"epoch": 0.6975484979181836,
"grad_norm": 1.5287736654281616,
"learning_rate": 2.5399940418505754e-06,
"loss": 0.3229,
"step": 4670
},
{
"epoch": 0.6990421777852461,
"grad_norm": 1.5035234689712524,
"learning_rate": 2.5173300482956346e-06,
"loss": 0.2946,
"step": 4680
},
{
"epoch": 0.7005358576523086,
"grad_norm": 2.163083791732788,
"learning_rate": 2.4947335547211083e-06,
"loss": 0.3239,
"step": 4690
},
{
"epoch": 0.7020295375193711,
"grad_norm": 1.5969173908233643,
"learning_rate": 2.472205175490928e-06,
"loss": 0.3033,
"step": 4700
},
{
"epoch": 0.7035232173864336,
"grad_norm": 1.2077685594558716,
"learning_rate": 2.4497455231171003e-06,
"loss": 0.3142,
"step": 4710
},
{
"epoch": 0.7050168972534961,
"grad_norm": 1.0711603164672852,
"learning_rate": 2.4273552082430586e-06,
"loss": 0.292,
"step": 4720
},
{
"epoch": 0.7065105771205586,
"grad_norm": 1.1325751543045044,
"learning_rate": 2.405034839627051e-06,
"loss": 0.3309,
"step": 4730
},
{
"epoch": 0.7080042569876212,
"grad_norm": 1.3801145553588867,
"learning_rate": 2.3827850241255974e-06,
"loss": 0.3266,
"step": 4740
},
{
"epoch": 0.7094979368546837,
"grad_norm": 1.4642720222473145,
"learning_rate": 2.3606063666769846e-06,
"loss": 0.2985,
"step": 4750
},
{
"epoch": 0.7109916167217462,
"grad_norm": 1.8076415061950684,
"learning_rate": 2.3384994702848234e-06,
"loss": 0.3185,
"step": 4760
},
{
"epoch": 0.7124852965888087,
"grad_norm": 1.7433451414108276,
"learning_rate": 2.3164649360016505e-06,
"loss": 0.3004,
"step": 4770
},
{
"epoch": 0.713978976455871,
"grad_norm": 1.4180279970169067,
"learning_rate": 2.294503362912589e-06,
"loss": 0.3193,
"step": 4780
},
{
"epoch": 0.7154726563229336,
"grad_norm": 1.5062882900238037,
"learning_rate": 2.2726153481190588e-06,
"loss": 0.3233,
"step": 4790
},
{
"epoch": 0.7169663361899961,
"grad_norm": 1.4006506204605103,
"learning_rate": 2.250801486722541e-06,
"loss": 0.3125,
"step": 4800
},
{
"epoch": 0.7184600160570586,
"grad_norm": 1.7776737213134766,
"learning_rate": 2.2290623718084052e-06,
"loss": 0.2971,
"step": 4810
},
{
"epoch": 0.7199536959241211,
"grad_norm": 1.5043376684188843,
"learning_rate": 2.207398594429773e-06,
"loss": 0.2992,
"step": 4820
},
{
"epoch": 0.7214473757911836,
"grad_norm": 2.0632483959198,
"learning_rate": 2.185810743591458e-06,
"loss": 0.3223,
"step": 4830
},
{
"epoch": 0.7229410556582461,
"grad_norm": 1.3994874954223633,
"learning_rate": 2.1642994062339458e-06,
"loss": 0.3374,
"step": 4840
},
{
"epoch": 0.7244347355253086,
"grad_norm": 1.8748818635940552,
"learning_rate": 2.1428651672174382e-06,
"loss": 0.308,
"step": 4850
},
{
"epoch": 0.7259284153923711,
"grad_norm": 1.1066455841064453,
"learning_rate": 2.1215086093059527e-06,
"loss": 0.2935,
"step": 4860
},
{
"epoch": 0.7274220952594336,
"grad_norm": 1.480527400970459,
"learning_rate": 2.100230313151476e-06,
"loss": 0.3537,
"step": 4870
},
{
"epoch": 0.728915775126496,
"grad_norm": 1.5903476476669312,
"learning_rate": 2.079030857278179e-06,
"loss": 0.3039,
"step": 4880
},
{
"epoch": 0.7304094549935585,
"grad_norm": 1.1910865306854248,
"learning_rate": 2.057910818066684e-06,
"loss": 0.3233,
"step": 4890
},
{
"epoch": 0.731903134860621,
"grad_norm": 1.305713176727295,
"learning_rate": 2.036870769738401e-06,
"loss": 0.3295,
"step": 4900
},
{
"epoch": 0.7333968147276835,
"grad_norm": 1.8359174728393555,
"learning_rate": 2.0159112843399066e-06,
"loss": 0.3121,
"step": 4910
},
{
"epoch": 0.734890494594746,
"grad_norm": 1.200527548789978,
"learning_rate": 1.995032931727396e-06,
"loss": 0.3155,
"step": 4920
},
{
"epoch": 0.7363841744618085,
"grad_norm": 0.9834126234054565,
"learning_rate": 1.97423627955119e-06,
"loss": 0.3086,
"step": 4930
},
{
"epoch": 0.737877854328871,
"grad_norm": 1.8965601921081543,
"learning_rate": 1.9535218932402987e-06,
"loss": 0.296,
"step": 4940
},
{
"epoch": 0.7393715341959335,
"grad_norm": 1.8559459447860718,
"learning_rate": 1.9328903359870504e-06,
"loss": 0.2943,
"step": 4950
},
{
"epoch": 0.740865214062996,
"grad_norm": 1.5035439729690552,
"learning_rate": 1.9123421687317784e-06,
"loss": 0.3121,
"step": 4960
},
{
"epoch": 0.7423588939300585,
"grad_norm": 1.1852291822433472,
"learning_rate": 1.8918779501475708e-06,
"loss": 0.3158,
"step": 4970
},
{
"epoch": 0.7438525737971209,
"grad_norm": 1.259185791015625,
"learning_rate": 1.8714982366250796e-06,
"loss": 0.2938,
"step": 4980
},
{
"epoch": 0.7453462536641834,
"grad_norm": 1.5034098625183105,
"learning_rate": 1.8512035822573915e-06,
"loss": 0.2949,
"step": 4990
},
{
"epoch": 0.7468399335312459,
"grad_norm": 1.282578706741333,
"learning_rate": 1.8309945388249733e-06,
"loss": 0.3098,
"step": 5000
},
{
"epoch": 0.7468399335312459,
"eval_loss": 0.30645084381103516,
"eval_runtime": 76.3887,
"eval_samples_per_second": 7.082,
"eval_steps_per_second": 3.548,
"step": 5000
},
{
"epoch": 0.7483336133983084,
"grad_norm": 1.2789969444274902,
"learning_rate": 1.8108716557806545e-06,
"loss": 0.3168,
"step": 5010
},
{
"epoch": 0.7498272932653709,
"grad_norm": 1.0109808444976807,
"learning_rate": 1.7908354802346982e-06,
"loss": 0.2843,
"step": 5020
},
{
"epoch": 0.7513209731324334,
"grad_norm": 1.3991084098815918,
"learning_rate": 1.7708865569399247e-06,
"loss": 0.3324,
"step": 5030
},
{
"epoch": 0.7528146529994959,
"grad_norm": 1.529976725578308,
"learning_rate": 1.751025428276899e-06,
"loss": 0.3152,
"step": 5040
},
{
"epoch": 0.7543083328665584,
"grad_norm": 1.9336539506912231,
"learning_rate": 1.7312526342391862e-06,
"loss": 0.3077,
"step": 5050
},
{
"epoch": 0.7558020127336209,
"grad_norm": 1.4918617010116577,
"learning_rate": 1.7115687124186658e-06,
"loss": 0.3139,
"step": 5060
},
{
"epoch": 0.7572956926006834,
"grad_norm": 2.2446916103363037,
"learning_rate": 1.6919741979909222e-06,
"loss": 0.3278,
"step": 5070
},
{
"epoch": 0.7587893724677458,
"grad_norm": 1.2982836961746216,
"learning_rate": 1.6724696237006848e-06,
"loss": 0.3063,
"step": 5080
},
{
"epoch": 0.7602830523348083,
"grad_norm": 1.0719565153121948,
"learning_rate": 1.653055519847357e-06,
"loss": 0.2921,
"step": 5090
},
{
"epoch": 0.7617767322018708,
"grad_norm": 1.5067294836044312,
"learning_rate": 1.6337324142705836e-06,
"loss": 0.3102,
"step": 5100
},
{
"epoch": 0.7632704120689333,
"grad_norm": 1.2776610851287842,
"learning_rate": 1.6145008323359068e-06,
"loss": 0.2969,
"step": 5110
},
{
"epoch": 0.7647640919359958,
"grad_norm": 1.1912457942962646,
"learning_rate": 1.5953612969204834e-06,
"loss": 0.2682,
"step": 5120
},
{
"epoch": 0.7662577718030583,
"grad_norm": 1.404762625694275,
"learning_rate": 1.5763143283988663e-06,
"loss": 0.2963,
"step": 5130
},
{
"epoch": 0.7677514516701208,
"grad_norm": 1.2275928258895874,
"learning_rate": 1.5573604446288572e-06,
"loss": 0.2801,
"step": 5140
},
{
"epoch": 0.7692451315371833,
"grad_norm": 1.437886118888855,
"learning_rate": 1.538500160937424e-06,
"loss": 0.31,
"step": 5150
},
{
"epoch": 0.7707388114042458,
"grad_norm": 1.3553423881530762,
"learning_rate": 1.519733990106696e-06,
"loss": 0.2946,
"step": 5160
},
{
"epoch": 0.7722324912713083,
"grad_norm": 1.8724462985992432,
"learning_rate": 1.5010624423600161e-06,
"loss": 0.294,
"step": 5170
},
{
"epoch": 0.7737261711383707,
"grad_norm": 1.0624821186065674,
"learning_rate": 1.48248602534807e-06,
"loss": 0.3292,
"step": 5180
},
{
"epoch": 0.7752198510054332,
"grad_norm": 1.6190390586853027,
"learning_rate": 1.4640052441350893e-06,
"loss": 0.3258,
"step": 5190
},
{
"epoch": 0.7767135308724957,
"grad_norm": 1.0761600732803345,
"learning_rate": 1.4456206011851115e-06,
"loss": 0.3226,
"step": 5200
},
{
"epoch": 0.7782072107395582,
"grad_norm": 1.5092500448226929,
"learning_rate": 1.4273325963483226e-06,
"loss": 0.2854,
"step": 5210
},
{
"epoch": 0.7797008906066207,
"grad_norm": 1.7542755603790283,
"learning_rate": 1.4091417268474683e-06,
"loss": 0.3071,
"step": 5220
},
{
"epoch": 0.7811945704736832,
"grad_norm": 1.1649888753890991,
"learning_rate": 1.3910484872643326e-06,
"loss": 0.3309,
"step": 5230
},
{
"epoch": 0.7826882503407457,
"grad_norm": 1.1616461277008057,
"learning_rate": 1.3730533695262927e-06,
"loss": 0.285,
"step": 5240
},
{
"epoch": 0.7841819302078082,
"grad_norm": 1.3221337795257568,
"learning_rate": 1.3551568628929434e-06,
"loss": 0.3119,
"step": 5250
},
{
"epoch": 0.7856756100748707,
"grad_norm": 1.7051467895507812,
"learning_rate": 1.3373594539427941e-06,
"loss": 0.3262,
"step": 5260
},
{
"epoch": 0.7871692899419332,
"grad_norm": 1.3770607709884644,
"learning_rate": 1.3196616265600442e-06,
"loss": 0.2957,
"step": 5270
},
{
"epoch": 0.7886629698089956,
"grad_norm": 2.04876708984375,
"learning_rate": 1.3020638619214199e-06,
"loss": 0.3109,
"step": 5280
},
{
"epoch": 0.7901566496760581,
"grad_norm": 1.0968056917190552,
"learning_rate": 1.2845666384830951e-06,
"loss": 0.325,
"step": 5290
},
{
"epoch": 0.7916503295431206,
"grad_norm": 1.4080263376235962,
"learning_rate": 1.2671704319676847e-06,
"loss": 0.3151,
"step": 5300
},
{
"epoch": 0.7931440094101831,
"grad_norm": 1.1893798112869263,
"learning_rate": 1.2498757153513075e-06,
"loss": 0.3196,
"step": 5310
},
{
"epoch": 0.7946376892772457,
"grad_norm": 1.286380410194397,
"learning_rate": 1.2326829588507282e-06,
"loss": 0.3288,
"step": 5320
},
{
"epoch": 0.7961313691443082,
"grad_norm": 1.841400384902954,
"learning_rate": 1.2155926299105737e-06,
"loss": 0.3035,
"step": 5330
},
{
"epoch": 0.7976250490113707,
"grad_norm": 1.0659278631210327,
"learning_rate": 1.1986051931906207e-06,
"loss": 0.3368,
"step": 5340
},
{
"epoch": 0.7991187288784332,
"grad_norm": 1.0193463563919067,
"learning_rate": 1.1817211105531667e-06,
"loss": 0.3063,
"step": 5350
},
{
"epoch": 0.8006124087454957,
"grad_norm": 1.656656265258789,
"learning_rate": 1.1649408410504686e-06,
"loss": 0.3059,
"step": 5360
},
{
"epoch": 0.8021060886125582,
"grad_norm": 1.4112011194229126,
"learning_rate": 1.148264840912267e-06,
"loss": 0.3059,
"step": 5370
},
{
"epoch": 0.8035997684796206,
"grad_norm": 0.9142074584960938,
"learning_rate": 1.131693563533376e-06,
"loss": 0.3003,
"step": 5380
},
{
"epoch": 0.8050934483466831,
"grad_norm": 1.3341706991195679,
"learning_rate": 1.1152274594613588e-06,
"loss": 0.3185,
"step": 5390
},
{
"epoch": 0.8065871282137456,
"grad_norm": 1.5966401100158691,
"learning_rate": 1.0988669763842786e-06,
"loss": 0.3394,
"step": 5400
},
{
"epoch": 0.8080808080808081,
"grad_norm": 1.6179472208023071,
"learning_rate": 1.0826125591185265e-06,
"loss": 0.3209,
"step": 5410
},
{
"epoch": 0.8095744879478706,
"grad_norm": 1.7330031394958496,
"learning_rate": 1.0664646495967263e-06,
"loss": 0.3303,
"step": 5420
},
{
"epoch": 0.8110681678149331,
"grad_norm": 1.7587456703186035,
"learning_rate": 1.050423686855721e-06,
"loss": 0.3356,
"step": 5430
},
{
"epoch": 0.8125618476819956,
"grad_norm": 1.2351861000061035,
"learning_rate": 1.0344901070246332e-06,
"loss": 0.2924,
"step": 5440
},
{
"epoch": 0.8140555275490581,
"grad_norm": 1.0523043870925903,
"learning_rate": 1.0186643433130128e-06,
"loss": 0.314,
"step": 5450
},
{
"epoch": 0.8155492074161206,
"grad_norm": 1.5923221111297607,
"learning_rate": 1.0029468259990515e-06,
"loss": 0.2991,
"step": 5460
},
{
"epoch": 0.8170428872831831,
"grad_norm": 1.6099388599395752,
"learning_rate": 9.873379824178886e-07,
"loss": 0.3055,
"step": 5470
},
{
"epoch": 0.8185365671502455,
"grad_norm": 1.1426987648010254,
"learning_rate": 9.718382369499936e-07,
"loss": 0.2959,
"step": 5480
},
{
"epoch": 0.820030247017308,
"grad_norm": 2.282841444015503,
"learning_rate": 9.564480110096226e-07,
"loss": 0.3473,
"step": 5490
},
{
"epoch": 0.8215239268843705,
"grad_norm": 1.753849983215332,
"learning_rate": 9.411677230333672e-07,
"loss": 0.2938,
"step": 5500
},
{
"epoch": 0.8215239268843705,
"eval_loss": 0.30588287115097046,
"eval_runtime": 76.2019,
"eval_samples_per_second": 7.1,
"eval_steps_per_second": 3.556,
"step": 5500
},
{
"epoch": 0.823017606751433,
"grad_norm": 1.5381008386611938,
"learning_rate": 9.259977884687726e-07,
"loss": 0.3001,
"step": 5510
},
{
"epoch": 0.8245112866184955,
"grad_norm": 1.2274119853973389,
"learning_rate": 9.10938619763046e-07,
"loss": 0.2968,
"step": 5520
},
{
"epoch": 0.826004966485558,
"grad_norm": 1.2566465139389038,
"learning_rate": 8.959906263518398e-07,
"loss": 0.3135,
"step": 5530
},
{
"epoch": 0.8274986463526205,
"grad_norm": 1.4085761308670044,
"learning_rate": 8.811542146481223e-07,
"loss": 0.3067,
"step": 5540
},
{
"epoch": 0.828992326219683,
"grad_norm": 1.507485032081604,
"learning_rate": 8.664297880311234e-07,
"loss": 0.3254,
"step": 5550
},
{
"epoch": 0.8304860060867455,
"grad_norm": 1.4448457956314087,
"learning_rate": 8.518177468353767e-07,
"loss": 0.3273,
"step": 5560
},
{
"epoch": 0.831979685953808,
"grad_norm": 1.4602904319763184,
"learning_rate": 8.373184883398239e-07,
"loss": 0.2887,
"step": 5570
},
{
"epoch": 0.8334733658208704,
"grad_norm": 2.004305601119995,
"learning_rate": 8.229324067570193e-07,
"loss": 0.3068,
"step": 5580
},
{
"epoch": 0.8349670456879329,
"grad_norm": 1.5558735132217407,
"learning_rate": 8.086598932224116e-07,
"loss": 0.3012,
"step": 5590
},
{
"epoch": 0.8364607255549954,
"grad_norm": 1.6343220472335815,
"learning_rate": 7.945013357837089e-07,
"loss": 0.3052,
"step": 5600
},
{
"epoch": 0.8379544054220579,
"grad_norm": 1.551553726196289,
"learning_rate": 7.804571193903277e-07,
"loss": 0.3024,
"step": 5610
},
{
"epoch": 0.8394480852891204,
"grad_norm": 1.437134027481079,
"learning_rate": 7.665276258829274e-07,
"loss": 0.312,
"step": 5620
},
{
"epoch": 0.8409417651561829,
"grad_norm": 1.3783475160598755,
"learning_rate": 7.527132339830273e-07,
"loss": 0.2973,
"step": 5630
},
{
"epoch": 0.8424354450232454,
"grad_norm": 1.8350893259048462,
"learning_rate": 7.390143192827148e-07,
"loss": 0.3183,
"step": 5640
},
{
"epoch": 0.8439291248903079,
"grad_norm": 1.1564915180206299,
"learning_rate": 7.25431254234425e-07,
"loss": 0.281,
"step": 5650
},
{
"epoch": 0.8454228047573704,
"grad_norm": 2.0109055042266846,
"learning_rate": 7.119644081408216e-07,
"loss": 0.3059,
"step": 5660
},
{
"epoch": 0.8469164846244329,
"grad_norm": 1.2296003103256226,
"learning_rate": 6.986141471447533e-07,
"loss": 0.3149,
"step": 5670
},
{
"epoch": 0.8484101644914953,
"grad_norm": 1.5590568780899048,
"learning_rate": 6.853808342192981e-07,
"loss": 0.31,
"step": 5680
},
{
"epoch": 0.8499038443585578,
"grad_norm": 1.2360633611679077,
"learning_rate": 6.72264829157896e-07,
"loss": 0.306,
"step": 5690
},
{
"epoch": 0.8513975242256203,
"grad_norm": 0.8478025197982788,
"learning_rate": 6.592664885645678e-07,
"loss": 0.2989,
"step": 5700
},
{
"epoch": 0.8528912040926828,
"grad_norm": 2.2121694087982178,
"learning_rate": 6.463861658442166e-07,
"loss": 0.3025,
"step": 5710
},
{
"epoch": 0.8543848839597453,
"grad_norm": 1.8200898170471191,
"learning_rate": 6.336242111930224e-07,
"loss": 0.2983,
"step": 5720
},
{
"epoch": 0.8558785638268078,
"grad_norm": 1.2733746767044067,
"learning_rate": 6.209809715889182e-07,
"loss": 0.3251,
"step": 5730
},
{
"epoch": 0.8573722436938703,
"grad_norm": 1.0714962482452393,
"learning_rate": 6.084567907821559e-07,
"loss": 0.3361,
"step": 5740
},
{
"epoch": 0.8588659235609328,
"grad_norm": 1.6452571153640747,
"learning_rate": 5.960520092859668e-07,
"loss": 0.3235,
"step": 5750
},
{
"epoch": 0.8603596034279953,
"grad_norm": 0.9949471354484558,
"learning_rate": 5.837669643672927e-07,
"loss": 0.3074,
"step": 5760
},
{
"epoch": 0.8618532832950578,
"grad_norm": 1.2611801624298096,
"learning_rate": 5.716019900376257e-07,
"loss": 0.2955,
"step": 5770
},
{
"epoch": 0.8633469631621202,
"grad_norm": 1.4436218738555908,
"learning_rate": 5.595574170439199e-07,
"loss": 0.3071,
"step": 5780
},
{
"epoch": 0.8648406430291827,
"grad_norm": 1.6352945566177368,
"learning_rate": 5.476335728596061e-07,
"loss": 0.327,
"step": 5790
},
{
"epoch": 0.8663343228962452,
"grad_norm": 1.846356749534607,
"learning_rate": 5.358307816756803e-07,
"loss": 0.3174,
"step": 5800
},
{
"epoch": 0.8678280027633077,
"grad_norm": 1.2852689027786255,
"learning_rate": 5.24149364391895e-07,
"loss": 0.3086,
"step": 5810
},
{
"epoch": 0.8693216826303702,
"grad_norm": 1.4159107208251953,
"learning_rate": 5.125896386080348e-07,
"loss": 0.2913,
"step": 5820
},
{
"epoch": 0.8708153624974327,
"grad_norm": 1.575850248336792,
"learning_rate": 5.011519186152775e-07,
"loss": 0.2937,
"step": 5830
},
{
"epoch": 0.8723090423644952,
"grad_norm": 1.3794643878936768,
"learning_rate": 4.898365153876505e-07,
"loss": 0.3049,
"step": 5840
},
{
"epoch": 0.8738027222315577,
"grad_norm": 1.2364630699157715,
"learning_rate": 4.78643736573578e-07,
"loss": 0.3129,
"step": 5850
},
{
"epoch": 0.8752964020986203,
"grad_norm": 0.8901196122169495,
"learning_rate": 4.675738864875134e-07,
"loss": 0.2912,
"step": 5860
},
{
"epoch": 0.8767900819656828,
"grad_norm": 1.1799402236938477,
"learning_rate": 4.566272661016674e-07,
"loss": 0.3204,
"step": 5870
},
{
"epoch": 0.8782837618327451,
"grad_norm": 1.7847167253494263,
"learning_rate": 4.4580417303782487e-07,
"loss": 0.3081,
"step": 5880
},
{
"epoch": 0.8797774416998076,
"grad_norm": 2.1496951580047607,
"learning_rate": 4.3510490155925235e-07,
"loss": 0.3114,
"step": 5890
},
{
"epoch": 0.8812711215668702,
"grad_norm": 1.502504587173462,
"learning_rate": 4.245297425626971e-07,
"loss": 0.2944,
"step": 5900
},
{
"epoch": 0.8827648014339327,
"grad_norm": 1.207311749458313,
"learning_rate": 4.140789835704806e-07,
"loss": 0.3059,
"step": 5910
},
{
"epoch": 0.8842584813009952,
"grad_norm": 1.821098804473877,
"learning_rate": 4.0375290872267825e-07,
"loss": 0.2872,
"step": 5920
},
{
"epoch": 0.8857521611680577,
"grad_norm": 1.6243329048156738,
"learning_rate": 3.935517987693932e-07,
"loss": 0.3064,
"step": 5930
},
{
"epoch": 0.8872458410351202,
"grad_norm": 1.5818045139312744,
"learning_rate": 3.8347593106312974e-07,
"loss": 0.2777,
"step": 5940
},
{
"epoch": 0.8887395209021827,
"grad_norm": 1.2436670064926147,
"learning_rate": 3.7352557955124437e-07,
"loss": 0.3014,
"step": 5950
},
{
"epoch": 0.8902332007692452,
"grad_norm": 1.4755676984786987,
"learning_rate": 3.637010147685016e-07,
"loss": 0.312,
"step": 5960
},
{
"epoch": 0.8917268806363077,
"grad_norm": 1.5071330070495605,
"learning_rate": 3.540025038297196e-07,
"loss": 0.3246,
"step": 5970
},
{
"epoch": 0.8932205605033701,
"grad_norm": 1.5448203086853027,
"learning_rate": 3.44430310422505e-07,
"loss": 0.301,
"step": 5980
},
{
"epoch": 0.8947142403704326,
"grad_norm": 1.1398577690124512,
"learning_rate": 3.3498469480008454e-07,
"loss": 0.2993,
"step": 5990
},
{
"epoch": 0.8962079202374951,
"grad_norm": 1.4388718605041504,
"learning_rate": 3.256659137742313e-07,
"loss": 0.315,
"step": 6000
},
{
"epoch": 0.8962079202374951,
"eval_loss": 0.30459803342819214,
"eval_runtime": 76.2056,
"eval_samples_per_second": 7.099,
"eval_steps_per_second": 3.556,
"step": 6000
},
{
"epoch": 0.8977016001045576,
"grad_norm": 1.604347825050354,
"learning_rate": 3.164742207082788e-07,
"loss": 0.319,
"step": 6010
},
{
"epoch": 0.8991952799716201,
"grad_norm": 1.558813214302063,
"learning_rate": 3.0740986551023535e-07,
"loss": 0.3084,
"step": 6020
},
{
"epoch": 0.9006889598386826,
"grad_norm": 1.7928036451339722,
"learning_rate": 2.9847309462598726e-07,
"loss": 0.3147,
"step": 6030
},
{
"epoch": 0.9021826397057451,
"grad_norm": 1.4524924755096436,
"learning_rate": 2.896641510326009e-07,
"loss": 0.3112,
"step": 6040
},
{
"epoch": 0.9036763195728076,
"grad_norm": 1.2006369829177856,
"learning_rate": 2.809832742317137e-07,
"loss": 0.3284,
"step": 6050
},
{
"epoch": 0.9051699994398701,
"grad_norm": 1.2945834398269653,
"learning_rate": 2.724307002430249e-07,
"loss": 0.3057,
"step": 6060
},
{
"epoch": 0.9066636793069326,
"grad_norm": 0.915762722492218,
"learning_rate": 2.6400666159787646e-07,
"loss": 0.3078,
"step": 6070
},
{
"epoch": 0.908157359173995,
"grad_norm": 1.4049233198165894,
"learning_rate": 2.5571138733293255e-07,
"loss": 0.3251,
"step": 6080
},
{
"epoch": 0.9096510390410575,
"grad_norm": 1.4237291812896729,
"learning_rate": 2.475451029839515e-07,
"loss": 0.3224,
"step": 6090
},
{
"epoch": 0.91114471890812,
"grad_norm": 1.0404157638549805,
"learning_rate": 2.3950803057965435e-07,
"loss": 0.312,
"step": 6100
},
{
"epoch": 0.9126383987751825,
"grad_norm": 1.250205636024475,
"learning_rate": 2.3160038863568768e-07,
"loss": 0.312,
"step": 6110
},
{
"epoch": 0.914132078642245,
"grad_norm": 1.2475612163543701,
"learning_rate": 2.2382239214868152e-07,
"loss": 0.3077,
"step": 6120
},
{
"epoch": 0.9156257585093075,
"grad_norm": 1.5456167459487915,
"learning_rate": 2.161742525904087e-07,
"loss": 0.3301,
"step": 6130
},
{
"epoch": 0.91711943837637,
"grad_norm": 1.449046015739441,
"learning_rate": 2.086561779020285e-07,
"loss": 0.3371,
"step": 6140
},
{
"epoch": 0.9186131182434325,
"grad_norm": 1.6901681423187256,
"learning_rate": 2.012683724884379e-07,
"loss": 0.3178,
"step": 6150
},
{
"epoch": 0.920106798110495,
"grad_norm": 1.598301649093628,
"learning_rate": 1.9401103721271076e-07,
"loss": 0.2795,
"step": 6160
},
{
"epoch": 0.9216004779775575,
"grad_norm": 1.3405296802520752,
"learning_rate": 1.8688436939064025e-07,
"loss": 0.3362,
"step": 6170
},
{
"epoch": 0.9230941578446199,
"grad_norm": 1.2465345859527588,
"learning_rate": 1.798885627853708e-07,
"loss": 0.3009,
"step": 6180
},
{
"epoch": 0.9245878377116824,
"grad_norm": 1.2908686399459839,
"learning_rate": 1.7302380760213345e-07,
"loss": 0.3066,
"step": 6190
},
{
"epoch": 0.9260815175787449,
"grad_norm": 1.4556738138198853,
"learning_rate": 1.6629029048307044e-07,
"loss": 0.3031,
"step": 6200
},
{
"epoch": 0.9275751974458074,
"grad_norm": 0.9532304406166077,
"learning_rate": 1.5968819450216444e-07,
"loss": 0.331,
"step": 6210
},
{
"epoch": 0.9290688773128699,
"grad_norm": 1.7296748161315918,
"learning_rate": 1.5321769916025798e-07,
"loss": 0.3211,
"step": 6220
},
{
"epoch": 0.9305625571799324,
"grad_norm": 1.8877276182174683,
"learning_rate": 1.4687898038017513e-07,
"loss": 0.3241,
"step": 6230
},
{
"epoch": 0.9320562370469949,
"grad_norm": 1.5971440076828003,
"learning_rate": 1.406722105019376e-07,
"loss": 0.3089,
"step": 6240
},
{
"epoch": 0.9335499169140574,
"grad_norm": 1.8514761924743652,
"learning_rate": 1.3459755827807952e-07,
"loss": 0.3199,
"step": 6250
},
{
"epoch": 0.9350435967811199,
"grad_norm": 1.612648367881775,
"learning_rate": 1.2865518886905848e-07,
"loss": 0.3195,
"step": 6260
},
{
"epoch": 0.9365372766481824,
"grad_norm": 1.5449368953704834,
"learning_rate": 1.228452638387656e-07,
"loss": 0.3154,
"step": 6270
},
{
"epoch": 0.9380309565152448,
"grad_norm": 1.3344995975494385,
"learning_rate": 1.1716794115013419e-07,
"loss": 0.3065,
"step": 6280
},
{
"epoch": 0.9395246363823073,
"grad_norm": 1.4628318548202515,
"learning_rate": 1.1162337516084253e-07,
"loss": 0.3333,
"step": 6290
},
{
"epoch": 0.9410183162493698,
"grad_norm": 1.5249940156936646,
"learning_rate": 1.0621171661911844e-07,
"loss": 0.3183,
"step": 6300
},
{
"epoch": 0.9425119961164323,
"grad_norm": 1.3567790985107422,
"learning_rate": 1.0093311265963967e-07,
"loss": 0.2903,
"step": 6310
},
{
"epoch": 0.9440056759834948,
"grad_norm": 2.6283679008483887,
"learning_rate": 9.578770679953664e-08,
"loss": 0.3182,
"step": 6320
},
{
"epoch": 0.9454993558505573,
"grad_norm": 1.6033724546432495,
"learning_rate": 9.07756389344866e-08,
"loss": 0.3061,
"step": 6330
},
{
"epoch": 0.9469930357176198,
"grad_norm": 1.2911169528961182,
"learning_rate": 8.589704533491173e-08,
"loss": 0.3242,
"step": 6340
},
{
"epoch": 0.9484867155846823,
"grad_norm": 1.2943521738052368,
"learning_rate": 8.115205864227316e-08,
"loss": 0.319,
"step": 6350
},
{
"epoch": 0.9499803954517448,
"grad_norm": 1.4396514892578125,
"learning_rate": 7.65408078654678e-08,
"loss": 0.3246,
"step": 6360
},
{
"epoch": 0.9514740753188073,
"grad_norm": 1.282568097114563,
"learning_rate": 7.206341837731667e-08,
"loss": 0.3194,
"step": 6370
},
{
"epoch": 0.9529677551858697,
"grad_norm": 1.448075294494629,
"learning_rate": 6.772001191115928e-08,
"loss": 0.2985,
"step": 6380
},
{
"epoch": 0.9544614350529322,
"grad_norm": 0.9579274654388428,
"learning_rate": 6.351070655754187e-08,
"loss": 0.3208,
"step": 6390
},
{
"epoch": 0.9559551149199947,
"grad_norm": 1.2854158878326416,
"learning_rate": 5.943561676100773e-08,
"loss": 0.2923,
"step": 6400
},
{
"epoch": 0.9574487947870572,
"grad_norm": 1.7156449556350708,
"learning_rate": 5.5494853316985786e-08,
"loss": 0.3132,
"step": 6410
},
{
"epoch": 0.9589424746541197,
"grad_norm": 1.3507882356643677,
"learning_rate": 5.168852336877695e-08,
"loss": 0.335,
"step": 6420
},
{
"epoch": 0.9604361545211823,
"grad_norm": 1.696518063545227,
"learning_rate": 4.801673040464305e-08,
"loss": 0.3196,
"step": 6430
},
{
"epoch": 0.9619298343882448,
"grad_norm": 1.6517608165740967,
"learning_rate": 4.447957425499139e-08,
"loss": 0.3038,
"step": 6440
},
{
"epoch": 0.9634235142553073,
"grad_norm": 1.3434756994247437,
"learning_rate": 4.107715108966237e-08,
"loss": 0.3067,
"step": 6450
},
{
"epoch": 0.9649171941223698,
"grad_norm": 1.260919213294983,
"learning_rate": 3.7809553415311675e-08,
"loss": 0.3052,
"step": 6460
},
{
"epoch": 0.9664108739894323,
"grad_norm": 1.7151823043823242,
"learning_rate": 3.467687007289833e-08,
"loss": 0.2897,
"step": 6470
},
{
"epoch": 0.9679045538564947,
"grad_norm": 1.9328373670578003,
"learning_rate": 3.167918623526833e-08,
"loss": 0.2919,
"step": 6480
},
{
"epoch": 0.9693982337235572,
"grad_norm": 1.4327538013458252,
"learning_rate": 2.8816583404837616e-08,
"loss": 0.2983,
"step": 6490
},
{
"epoch": 0.9708919135906197,
"grad_norm": 1.2370225191116333,
"learning_rate": 2.608913941137825e-08,
"loss": 0.301,
"step": 6500
},
{
"epoch": 0.9708919135906197,
"eval_loss": 0.3039746582508087,
"eval_runtime": 76.1933,
"eval_samples_per_second": 7.1,
"eval_steps_per_second": 3.557,
"step": 6500
},
{
"epoch": 0.9723855934576822,
"grad_norm": 1.433455467224121,
"learning_rate": 2.3496928409900143e-08,
"loss": 0.3035,
"step": 6510
},
{
"epoch": 0.9738792733247447,
"grad_norm": 1.2109386920928955,
"learning_rate": 2.10400208786371e-08,
"loss": 0.3246,
"step": 6520
},
{
"epoch": 0.9753729531918072,
"grad_norm": 1.5277369022369385,
"learning_rate": 1.87184836171278e-08,
"loss": 0.3185,
"step": 6530
},
{
"epoch": 0.9768666330588697,
"grad_norm": 2.307945728302002,
"learning_rate": 1.6532379744403915e-08,
"loss": 0.3209,
"step": 6540
},
{
"epoch": 0.9783603129259322,
"grad_norm": 2.4343667030334473,
"learning_rate": 1.448176869726814e-08,
"loss": 0.3146,
"step": 6550
},
{
"epoch": 0.9798539927929947,
"grad_norm": 2.0399911403656006,
"learning_rate": 1.2566706228685499e-08,
"loss": 0.3042,
"step": 6560
},
{
"epoch": 0.9813476726600572,
"grad_norm": 1.6942330598831177,
"learning_rate": 1.0787244406259556e-08,
"loss": 0.2949,
"step": 6570
},
{
"epoch": 0.9828413525271196,
"grad_norm": 1.0024651288986206,
"learning_rate": 9.143431610822983e-09,
"loss": 0.3046,
"step": 6580
},
{
"epoch": 0.9843350323941821,
"grad_norm": 1.426224946975708,
"learning_rate": 7.635312535119732e-09,
"loss": 0.3148,
"step": 6590
},
{
"epoch": 0.9858287122612446,
"grad_norm": 1.2437832355499268,
"learning_rate": 6.2629281825887785e-09,
"loss": 0.3209,
"step": 6600
},
{
"epoch": 0.9873223921283071,
"grad_norm": 1.3072861433029175,
"learning_rate": 5.026315866252241e-09,
"loss": 0.32,
"step": 6610
},
{
"epoch": 0.9888160719953696,
"grad_norm": 1.787488579750061,
"learning_rate": 3.9255092076984084e-09,
"loss": 0.3269,
"step": 6620
},
{
"epoch": 0.9903097518624321,
"grad_norm": 1.3305130004882812,
"learning_rate": 2.9605381361685893e-09,
"loss": 0.3157,
"step": 6630
},
{
"epoch": 0.9918034317294946,
"grad_norm": 1.319860577583313,
"learning_rate": 2.131428887742204e-09,
"loss": 0.2924,
"step": 6640
},
{
"epoch": 0.9932971115965571,
"grad_norm": 1.1278481483459473,
"learning_rate": 1.4382040046267976e-09,
"loss": 0.3155,
"step": 6650
},
{
"epoch": 0.9947907914636196,
"grad_norm": 1.3013602495193481,
"learning_rate": 8.808823345407558e-10,
"loss": 0.3081,
"step": 6660
},
{
"epoch": 0.9962844713306821,
"grad_norm": 1.4793535470962524,
"learning_rate": 4.594790302037133e-10,
"loss": 0.3225,
"step": 6670
},
{
"epoch": 0.9977781511977445,
"grad_norm": 1.2639180421829224,
"learning_rate": 1.7400554892466058e-10,
"loss": 0.3101,
"step": 6680
},
{
"epoch": 0.999271831064807,
"grad_norm": 1.553964614868164,
"learning_rate": 2.4469652287750777e-11,
"loss": 0.3104,
"step": 6690
},
{
"epoch": 1.0,
"step": 6695,
"total_flos": 1.727502264531714e+18,
"train_loss": 0.35656481113248656,
"train_runtime": 34664.1508,
"train_samples_per_second": 1.545,
"train_steps_per_second": 0.193
}
],
"logging_steps": 10,
"max_steps": 6695,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.727502264531714e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}