{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.298368298368298,
"eval_steps": 500,
"global_step": 3000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014344629729245113,
"grad_norm": 2.1563808263401123,
"learning_rate": 2.1479713603818614e-06,
"loss": 0.9832,
"step": 10
},
{
"epoch": 0.028689259458490227,
"grad_norm": 1.5376642197375787,
"learning_rate": 4.5346062052505965e-06,
"loss": 0.8531,
"step": 20
},
{
"epoch": 0.04303388918773534,
"grad_norm": 0.7108211564362369,
"learning_rate": 6.921241050119331e-06,
"loss": 0.7191,
"step": 30
},
{
"epoch": 0.05737851891698045,
"grad_norm": 0.5064006283904037,
"learning_rate": 9.307875894988068e-06,
"loss": 0.6532,
"step": 40
},
{
"epoch": 0.07172314864622557,
"grad_norm": 0.5016524926299113,
"learning_rate": 1.1694510739856803e-05,
"loss": 0.6245,
"step": 50
},
{
"epoch": 0.08606777837547068,
"grad_norm": 0.5064320977744877,
"learning_rate": 1.4081145584725539e-05,
"loss": 0.6017,
"step": 60
},
{
"epoch": 0.1004124081047158,
"grad_norm": 0.4525315758811991,
"learning_rate": 1.6467780429594274e-05,
"loss": 0.5788,
"step": 70
},
{
"epoch": 0.1147570378339609,
"grad_norm": 0.4891140282424216,
"learning_rate": 1.885441527446301e-05,
"loss": 0.5726,
"step": 80
},
{
"epoch": 0.129101667563206,
"grad_norm": 0.5816465774117703,
"learning_rate": 2.1241050119331742e-05,
"loss": 0.5669,
"step": 90
},
{
"epoch": 0.14344629729245115,
"grad_norm": 0.5416459422496235,
"learning_rate": 2.3627684964200477e-05,
"loss": 0.5621,
"step": 100
},
{
"epoch": 0.15779092702169625,
"grad_norm": 0.5501550374753971,
"learning_rate": 2.6014319809069216e-05,
"loss": 0.5543,
"step": 110
},
{
"epoch": 0.17213555675094136,
"grad_norm": 0.48897325487812016,
"learning_rate": 2.840095465393795e-05,
"loss": 0.5519,
"step": 120
},
{
"epoch": 0.1864801864801865,
"grad_norm": 0.5193769497689257,
"learning_rate": 3.0787589498806684e-05,
"loss": 0.546,
"step": 130
},
{
"epoch": 0.2008248162094316,
"grad_norm": 0.5100064220076413,
"learning_rate": 3.3174224343675416e-05,
"loss": 0.5454,
"step": 140
},
{
"epoch": 0.2151694459386767,
"grad_norm": 0.4992592686432826,
"learning_rate": 3.5560859188544155e-05,
"loss": 0.5437,
"step": 150
},
{
"epoch": 0.2295140756679218,
"grad_norm": 0.4838403686730783,
"learning_rate": 3.794749403341289e-05,
"loss": 0.5412,
"step": 160
},
{
"epoch": 0.24385870539716695,
"grad_norm": 0.45590237103117387,
"learning_rate": 4.0334128878281626e-05,
"loss": 0.541,
"step": 170
},
{
"epoch": 0.258203335126412,
"grad_norm": 0.44189108119445925,
"learning_rate": 4.272076372315036e-05,
"loss": 0.5328,
"step": 180
},
{
"epoch": 0.2725479648556572,
"grad_norm": 0.4596950199469469,
"learning_rate": 4.510739856801909e-05,
"loss": 0.5328,
"step": 190
},
{
"epoch": 0.2868925945849023,
"grad_norm": 0.4232582169854354,
"learning_rate": 4.749403341288783e-05,
"loss": 0.5331,
"step": 200
},
{
"epoch": 0.3012372243141474,
"grad_norm": 0.47183677702470816,
"learning_rate": 4.988066825775656e-05,
"loss": 0.5405,
"step": 210
},
{
"epoch": 0.3155818540433925,
"grad_norm": 0.510543623904468,
"learning_rate": 5.22673031026253e-05,
"loss": 0.5448,
"step": 220
},
{
"epoch": 0.3299264837726376,
"grad_norm": 0.49518213450267645,
"learning_rate": 5.465393794749404e-05,
"loss": 0.5441,
"step": 230
},
{
"epoch": 0.3442711135018827,
"grad_norm": 0.4081838992878798,
"learning_rate": 5.7040572792362765e-05,
"loss": 0.5342,
"step": 240
},
{
"epoch": 0.3586157432311278,
"grad_norm": 0.45201996051478616,
"learning_rate": 5.942720763723151e-05,
"loss": 0.5379,
"step": 250
},
{
"epoch": 0.372960372960373,
"grad_norm": 0.5163583891265898,
"learning_rate": 6.181384248210024e-05,
"loss": 0.5352,
"step": 260
},
{
"epoch": 0.3873050026896181,
"grad_norm": 0.43220360318869133,
"learning_rate": 6.420047732696898e-05,
"loss": 0.5319,
"step": 270
},
{
"epoch": 0.4016496324188632,
"grad_norm": 0.39341356230222857,
"learning_rate": 6.65871121718377e-05,
"loss": 0.531,
"step": 280
},
{
"epoch": 0.4159942621481083,
"grad_norm": 0.4843450935832954,
"learning_rate": 6.897374701670645e-05,
"loss": 0.5247,
"step": 290
},
{
"epoch": 0.4303388918773534,
"grad_norm": 0.43790304963108484,
"learning_rate": 7.136038186157519e-05,
"loss": 0.535,
"step": 300
},
{
"epoch": 0.4446835216065985,
"grad_norm": 0.3679883238347063,
"learning_rate": 7.374701670644391e-05,
"loss": 0.529,
"step": 310
},
{
"epoch": 0.4590281513358436,
"grad_norm": 0.38496528216858267,
"learning_rate": 7.613365155131266e-05,
"loss": 0.5358,
"step": 320
},
{
"epoch": 0.47337278106508873,
"grad_norm": 0.3554015988338894,
"learning_rate": 7.852028639618139e-05,
"loss": 0.5366,
"step": 330
},
{
"epoch": 0.4877174107943339,
"grad_norm": 0.3647316947863913,
"learning_rate": 8.090692124105012e-05,
"loss": 0.5305,
"step": 340
},
{
"epoch": 0.502062040523579,
"grad_norm": 0.40796953260473273,
"learning_rate": 8.329355608591885e-05,
"loss": 0.5361,
"step": 350
},
{
"epoch": 0.516406670252824,
"grad_norm": 0.345466577791704,
"learning_rate": 8.56801909307876e-05,
"loss": 0.5317,
"step": 360
},
{
"epoch": 0.5307512999820692,
"grad_norm": 0.3897824320306703,
"learning_rate": 8.806682577565633e-05,
"loss": 0.5384,
"step": 370
},
{
"epoch": 0.5450959297113144,
"grad_norm": 0.39750641265021325,
"learning_rate": 9.045346062052506e-05,
"loss": 0.5344,
"step": 380
},
{
"epoch": 0.5594405594405595,
"grad_norm": 0.3294501292354428,
"learning_rate": 9.28400954653938e-05,
"loss": 0.531,
"step": 390
},
{
"epoch": 0.5737851891698046,
"grad_norm": 0.3325704984940842,
"learning_rate": 9.522673031026254e-05,
"loss": 0.5282,
"step": 400
},
{
"epoch": 0.5881298188990497,
"grad_norm": 0.37577078172283135,
"learning_rate": 9.761336515513126e-05,
"loss": 0.5289,
"step": 410
},
{
"epoch": 0.6024744486282948,
"grad_norm": 0.32982285395946315,
"learning_rate": 0.0001,
"loss": 0.5326,
"step": 420
},
{
"epoch": 0.6168190783575399,
"grad_norm": 0.315570817514683,
"learning_rate": 9.999826305940802e-05,
"loss": 0.5276,
"step": 430
},
{
"epoch": 0.631163708086785,
"grad_norm": 0.32873479872603517,
"learning_rate": 9.99930523583106e-05,
"loss": 0.5329,
"step": 440
},
{
"epoch": 0.6455083378160301,
"grad_norm": 0.3754610016068392,
"learning_rate": 9.998436825873485e-05,
"loss": 0.5339,
"step": 450
},
{
"epoch": 0.6598529675452752,
"grad_norm": 0.2973537072730961,
"learning_rate": 9.997221136403139e-05,
"loss": 0.5249,
"step": 460
},
{
"epoch": 0.6741975972745203,
"grad_norm": 0.33421087994681,
"learning_rate": 9.995658251883237e-05,
"loss": 0.5196,
"step": 470
},
{
"epoch": 0.6885422270037654,
"grad_norm": 0.3120052809857615,
"learning_rate": 9.993748280899279e-05,
"loss": 0.5236,
"step": 480
},
{
"epoch": 0.7028868567330105,
"grad_norm": 0.28477505127058517,
"learning_rate": 9.991491356151515e-05,
"loss": 0.5166,
"step": 490
},
{
"epoch": 0.7172314864622557,
"grad_norm": 0.3282614273702267,
"learning_rate": 9.988887634445711e-05,
"loss": 0.5191,
"step": 500
},
{
"epoch": 0.7315761161915008,
"grad_norm": 0.3018575266984723,
"learning_rate": 9.985937296682264e-05,
"loss": 0.52,
"step": 510
},
{
"epoch": 0.745920745920746,
"grad_norm": 0.322642800391225,
"learning_rate": 9.982640547843628e-05,
"loss": 0.5193,
"step": 520
},
{
"epoch": 0.7602653756499911,
"grad_norm": 0.29334428049579575,
"learning_rate": 9.978997616980083e-05,
"loss": 0.5173,
"step": 530
},
{
"epoch": 0.7746100053792362,
"grad_norm": 0.30663608653187874,
"learning_rate": 9.975008757193805e-05,
"loss": 0.514,
"step": 540
},
{
"epoch": 0.7889546351084813,
"grad_norm": 0.2902575349921672,
"learning_rate": 9.970674245621296e-05,
"loss": 0.5173,
"step": 550
},
{
"epoch": 0.8032992648377264,
"grad_norm": 0.2750627268123107,
"learning_rate": 9.965994383414116e-05,
"loss": 0.5124,
"step": 560
},
{
"epoch": 0.8176438945669715,
"grad_norm": 0.31846444500772264,
"learning_rate": 9.960969495717975e-05,
"loss": 0.5105,
"step": 570
},
{
"epoch": 0.8319885242962166,
"grad_norm": 0.2850087845127407,
"learning_rate": 9.955599931650127e-05,
"loss": 0.505,
"step": 580
},
{
"epoch": 0.8463331540254617,
"grad_norm": 0.3011934822435399,
"learning_rate": 9.949886064275123e-05,
"loss": 0.4997,
"step": 590
},
{
"epoch": 0.8606777837547068,
"grad_norm": 0.28048546158040877,
"learning_rate": 9.943828290578892e-05,
"loss": 0.5039,
"step": 600
},
{
"epoch": 0.8750224134839519,
"grad_norm": 0.29482418586801606,
"learning_rate": 9.937427031441152e-05,
"loss": 0.5068,
"step": 610
},
{
"epoch": 0.889367043213197,
"grad_norm": 0.2868511054090736,
"learning_rate": 9.93068273160618e-05,
"loss": 0.5041,
"step": 620
},
{
"epoch": 0.9037116729424421,
"grad_norm": 0.26871712641833934,
"learning_rate": 9.9235958596519e-05,
"loss": 0.5031,
"step": 630
},
{
"epoch": 0.9180563026716873,
"grad_norm": 0.27176006324351715,
"learning_rate": 9.916166907957336e-05,
"loss": 0.4998,
"step": 640
},
{
"epoch": 0.9324009324009324,
"grad_norm": 0.2916123334399884,
"learning_rate": 9.908396392668397e-05,
"loss": 0.5045,
"step": 650
},
{
"epoch": 0.9467455621301775,
"grad_norm": 0.28657648009371867,
"learning_rate": 9.90028485366202e-05,
"loss": 0.5005,
"step": 660
},
{
"epoch": 0.9610901918594227,
"grad_norm": 0.2616587253845151,
"learning_rate": 9.891832854508661e-05,
"loss": 0.5017,
"step": 670
},
{
"epoch": 0.9754348215886678,
"grad_norm": 0.2642546302377749,
"learning_rate": 9.883040982433133e-05,
"loss": 0.492,
"step": 680
},
{
"epoch": 0.9897794513179129,
"grad_norm": 0.255936110537566,
"learning_rate": 9.87390984827382e-05,
"loss": 0.4934,
"step": 690
},
{
"epoch": 1.002868925945849,
"grad_norm": 0.3099905060480918,
"learning_rate": 9.864440086440223e-05,
"loss": 0.43,
"step": 700
},
{
"epoch": 1.017213555675094,
"grad_norm": 0.25984643430841164,
"learning_rate": 9.854632354868889e-05,
"loss": 0.3695,
"step": 710
},
{
"epoch": 1.0315581854043392,
"grad_norm": 0.28715953895875523,
"learning_rate": 9.844487334977705e-05,
"loss": 0.3792,
"step": 720
},
{
"epoch": 1.0459028151335843,
"grad_norm": 0.2649853690140653,
"learning_rate": 9.834005731618543e-05,
"loss": 0.3737,
"step": 730
},
{
"epoch": 1.0602474448628294,
"grad_norm": 0.26573177260306496,
"learning_rate": 9.823188273028297e-05,
"loss": 0.3771,
"step": 740
},
{
"epoch": 1.0745920745920745,
"grad_norm": 0.26574383245031147,
"learning_rate": 9.812035710778283e-05,
"loss": 0.3741,
"step": 750
},
{
"epoch": 1.0889367043213196,
"grad_norm": 0.28123208245100456,
"learning_rate": 9.800548819722026e-05,
"loss": 0.3731,
"step": 760
},
{
"epoch": 1.1032813340505647,
"grad_norm": 0.28349060376734364,
"learning_rate": 9.78872839794142e-05,
"loss": 0.3778,
"step": 770
},
{
"epoch": 1.11762596377981,
"grad_norm": 0.28893208285124705,
"learning_rate": 9.776575266691279e-05,
"loss": 0.3806,
"step": 780
},
{
"epoch": 1.1319705935090552,
"grad_norm": 0.2860386213893016,
"learning_rate": 9.764090270342286e-05,
"loss": 0.3799,
"step": 790
},
{
"epoch": 1.1463152232383003,
"grad_norm": 0.2460375162339276,
"learning_rate": 9.751274276322316e-05,
"loss": 0.3898,
"step": 800
},
{
"epoch": 1.1606598529675454,
"grad_norm": 0.251720247858439,
"learning_rate": 9.738128175056179e-05,
"loss": 0.3821,
"step": 810
},
{
"epoch": 1.1750044826967905,
"grad_norm": 0.25278081761302046,
"learning_rate": 9.724652879903751e-05,
"loss": 0.3798,
"step": 820
},
{
"epoch": 1.1893491124260356,
"grad_norm": 0.2599296471651565,
"learning_rate": 9.71084932709652e-05,
"loss": 0.3828,
"step": 830
},
{
"epoch": 1.2036937421552807,
"grad_norm": 0.2455565423628766,
"learning_rate": 9.696718475672532e-05,
"loss": 0.3743,
"step": 840
},
{
"epoch": 1.2180383718845258,
"grad_norm": 0.2888176539405626,
"learning_rate": 9.682261307409766e-05,
"loss": 0.381,
"step": 850
},
{
"epoch": 1.232383001613771,
"grad_norm": 0.25471629336116824,
"learning_rate": 9.667478826757916e-05,
"loss": 0.3832,
"step": 860
},
{
"epoch": 1.246727631343016,
"grad_norm": 0.2614351769400213,
"learning_rate": 9.652372060768608e-05,
"loss": 0.3848,
"step": 870
},
{
"epoch": 1.2610722610722611,
"grad_norm": 0.26412023986782596,
"learning_rate": 9.63694205902405e-05,
"loss": 0.3855,
"step": 880
},
{
"epoch": 1.2754168908015062,
"grad_norm": 0.2429603634039349,
"learning_rate": 9.621189893564092e-05,
"loss": 0.3819,
"step": 890
},
{
"epoch": 1.2897615205307513,
"grad_norm": 0.26829134565306034,
"learning_rate": 9.605116658811759e-05,
"loss": 0.3906,
"step": 900
},
{
"epoch": 1.3041061502599964,
"grad_norm": 0.27364921782590684,
"learning_rate": 9.588723471497208e-05,
"loss": 0.3848,
"step": 910
},
{
"epoch": 1.3184507799892415,
"grad_norm": 0.2515405826777948,
"learning_rate": 9.572011470580136e-05,
"loss": 0.3899,
"step": 920
},
{
"epoch": 1.3327954097184866,
"grad_norm": 0.2513235427794581,
"learning_rate": 9.554981817170655e-05,
"loss": 0.3912,
"step": 930
},
{
"epoch": 1.3471400394477318,
"grad_norm": 0.24955670786044107,
"learning_rate": 9.537635694448615e-05,
"loss": 0.3849,
"step": 940
},
{
"epoch": 1.3614846691769769,
"grad_norm": 0.24538389153554718,
"learning_rate": 9.519974307581404e-05,
"loss": 0.3867,
"step": 950
},
{
"epoch": 1.375829298906222,
"grad_norm": 0.25856641290644117,
"learning_rate": 9.50199888364021e-05,
"loss": 0.3899,
"step": 960
},
{
"epoch": 1.390173928635467,
"grad_norm": 0.26645029754281746,
"learning_rate": 9.483710671514777e-05,
"loss": 0.386,
"step": 970
},
{
"epoch": 1.4045185583647122,
"grad_norm": 0.25589965865278824,
"learning_rate": 9.465110941826622e-05,
"loss": 0.3856,
"step": 980
},
{
"epoch": 1.4188631880939573,
"grad_norm": 0.27815251555263987,
"learning_rate": 9.446200986840765e-05,
"loss": 0.3881,
"step": 990
},
{
"epoch": 1.4332078178232024,
"grad_norm": 0.26197920796578195,
"learning_rate": 9.426982120375943e-05,
"loss": 0.3878,
"step": 1000
},
{
"epoch": 1.4475524475524475,
"grad_norm": 0.2606524388643148,
"learning_rate": 9.407455677713328e-05,
"loss": 0.3883,
"step": 1010
},
{
"epoch": 1.4618970772816926,
"grad_norm": 0.2331343629588404,
"learning_rate": 9.387623015503753e-05,
"loss": 0.3848,
"step": 1020
},
{
"epoch": 1.4762417070109377,
"grad_norm": 0.25873984889230295,
"learning_rate": 9.367485511673462e-05,
"loss": 0.3895,
"step": 1030
},
{
"epoch": 1.4905863367401828,
"grad_norm": 0.2531065339256471,
"learning_rate": 9.347044565328367e-05,
"loss": 0.3937,
"step": 1040
},
{
"epoch": 1.504930966469428,
"grad_norm": 0.26767841128977615,
"learning_rate": 9.326301596656846e-05,
"loss": 0.3894,
"step": 1050
},
{
"epoch": 1.519275596198673,
"grad_norm": 0.2472066267101206,
"learning_rate": 9.30525804683107e-05,
"loss": 0.3889,
"step": 1060
},
{
"epoch": 1.5336202259279181,
"grad_norm": 0.24916114849718174,
"learning_rate": 9.283915377906875e-05,
"loss": 0.3874,
"step": 1070
},
{
"epoch": 1.5479648556571632,
"grad_norm": 0.24586564647507672,
"learning_rate": 9.262275072722181e-05,
"loss": 0.3899,
"step": 1080
},
{
"epoch": 1.5623094853864083,
"grad_norm": 0.24194199620674006,
"learning_rate": 9.240338634793969e-05,
"loss": 0.3867,
"step": 1090
},
{
"epoch": 1.5766541151156535,
"grad_norm": 0.24712089444412166,
"learning_rate": 9.218107588213813e-05,
"loss": 0.3902,
"step": 1100
},
{
"epoch": 1.5909987448448986,
"grad_norm": 0.24987071841082312,
"learning_rate": 9.195583477542009e-05,
"loss": 0.3851,
"step": 1110
},
{
"epoch": 1.6053433745741437,
"grad_norm": 0.2453499914136973,
"learning_rate": 9.172767867700236e-05,
"loss": 0.3906,
"step": 1120
},
{
"epoch": 1.6196880043033888,
"grad_norm": 0.2453362805106032,
"learning_rate": 9.149662343862851e-05,
"loss": 0.3905,
"step": 1130
},
{
"epoch": 1.6340326340326339,
"grad_norm": 0.23102523441076062,
"learning_rate": 9.126268511346744e-05,
"loss": 0.3903,
"step": 1140
},
{
"epoch": 1.648377263761879,
"grad_norm": 0.2542501707506408,
"learning_rate": 9.102587995499807e-05,
"loss": 0.3953,
"step": 1150
},
{
"epoch": 1.6627218934911243,
"grad_norm": 0.23285474972918488,
"learning_rate": 9.078622441588009e-05,
"loss": 0.391,
"step": 1160
},
{
"epoch": 1.6770665232203694,
"grad_norm": 0.24565957619352327,
"learning_rate": 9.054373514681085e-05,
"loss": 0.3923,
"step": 1170
},
{
"epoch": 1.6914111529496145,
"grad_norm": 0.2506739557436684,
"learning_rate": 9.029842899536853e-05,
"loss": 0.3909,
"step": 1180
},
{
"epoch": 1.7057557826788596,
"grad_norm": 0.2441587365940751,
"learning_rate": 9.005032300484162e-05,
"loss": 0.3915,
"step": 1190
},
{
"epoch": 1.7201004124081047,
"grad_norm": 0.26421110183322566,
"learning_rate": 8.979943441304473e-05,
"loss": 0.3904,
"step": 1200
},
{
"epoch": 1.7344450421373498,
"grad_norm": 0.24194171269463752,
"learning_rate": 8.954578065112107e-05,
"loss": 0.3892,
"step": 1210
},
{
"epoch": 1.748789671866595,
"grad_norm": 0.23054663152441476,
"learning_rate": 8.928937934233123e-05,
"loss": 0.3907,
"step": 1220
},
{
"epoch": 1.76313430159584,
"grad_norm": 0.2369813966398001,
"learning_rate": 8.903024830082887e-05,
"loss": 0.3849,
"step": 1230
},
{
"epoch": 1.7774789313250852,
"grad_norm": 0.24008220076352446,
"learning_rate": 8.876840553042296e-05,
"loss": 0.3904,
"step": 1240
},
{
"epoch": 1.7918235610543303,
"grad_norm": 0.23428608617416669,
"learning_rate": 8.850386922332696e-05,
"loss": 0.387,
"step": 1250
},
{
"epoch": 1.8061681907835754,
"grad_norm": 0.23331291847215246,
"learning_rate": 8.823665775889486e-05,
"loss": 0.3909,
"step": 1260
},
{
"epoch": 1.8205128205128205,
"grad_norm": 0.23850242149763548,
"learning_rate": 8.796678970234427e-05,
"loss": 0.3833,
"step": 1270
},
{
"epoch": 1.8348574502420656,
"grad_norm": 0.2219682422982644,
"learning_rate": 8.769428380346642e-05,
"loss": 0.3845,
"step": 1280
},
{
"epoch": 1.8492020799713107,
"grad_norm": 0.22216238994388604,
"learning_rate": 8.741915899532362e-05,
"loss": 0.3865,
"step": 1290
},
{
"epoch": 1.8635467097005558,
"grad_norm": 0.22824929990219722,
"learning_rate": 8.714143439293376e-05,
"loss": 0.3852,
"step": 1300
},
{
"epoch": 1.8778913394298011,
"grad_norm": 0.24280880504252028,
"learning_rate": 8.686112929194226e-05,
"loss": 0.3861,
"step": 1310
},
{
"epoch": 1.8922359691590462,
"grad_norm": 0.24001704017165865,
"learning_rate": 8.657826316728142e-05,
"loss": 0.3908,
"step": 1320
},
{
"epoch": 1.9065805988882913,
"grad_norm": 0.22100828767921185,
"learning_rate": 8.62928556718174e-05,
"loss": 0.3871,
"step": 1330
},
{
"epoch": 1.9209252286175365,
"grad_norm": 0.22202012109824457,
"learning_rate": 8.600492663498477e-05,
"loss": 0.3834,
"step": 1340
},
{
"epoch": 1.9352698583467816,
"grad_norm": 0.21529127705519435,
"learning_rate": 8.571449606140883e-05,
"loss": 0.388,
"step": 1350
},
{
"epoch": 1.9496144880760267,
"grad_norm": 0.23391440077905082,
"learning_rate": 8.542158412951563e-05,
"loss": 0.3844,
"step": 1360
},
{
"epoch": 1.9639591178052718,
"grad_norm": 0.2331711540562185,
"learning_rate": 8.512621119013013e-05,
"loss": 0.393,
"step": 1370
},
{
"epoch": 1.9783037475345169,
"grad_norm": 0.23305451446876246,
"learning_rate": 8.482839776506232e-05,
"loss": 0.3837,
"step": 1380
},
{
"epoch": 1.992648377263762,
"grad_norm": 0.24517430064973736,
"learning_rate": 8.452816454568124e-05,
"loss": 0.3852,
"step": 1390
},
{
"epoch": 2.005737851891698,
"grad_norm": 0.27916951560917247,
"learning_rate": 8.422553239147754e-05,
"loss": 0.2799,
"step": 1400
},
{
"epoch": 2.020082481620943,
"grad_norm": 0.23593724272097047,
"learning_rate": 8.392052232861411e-05,
"loss": 0.201,
"step": 1410
},
{
"epoch": 2.034427111350188,
"grad_norm": 0.23512809512185134,
"learning_rate": 8.361315554846534e-05,
"loss": 0.1983,
"step": 1420
},
{
"epoch": 2.0487717410794333,
"grad_norm": 0.2286549447255431,
"learning_rate": 8.330345340614471e-05,
"loss": 0.1942,
"step": 1430
},
{
"epoch": 2.0631163708086784,
"grad_norm": 0.24844506458021867,
"learning_rate": 8.299143741902111e-05,
"loss": 0.1943,
"step": 1440
},
{
"epoch": 2.0774610005379235,
"grad_norm": 0.25585326293058985,
"learning_rate": 8.267712926522389e-05,
"loss": 0.1993,
"step": 1450
},
{
"epoch": 2.0918056302671686,
"grad_norm": 0.2421387690266048,
"learning_rate": 8.236055078213666e-05,
"loss": 0.1965,
"step": 1460
},
{
"epoch": 2.1061502599964137,
"grad_norm": 0.24707973152402415,
"learning_rate": 8.204172396488013e-05,
"loss": 0.1992,
"step": 1470
},
{
"epoch": 2.120494889725659,
"grad_norm": 0.23509099311770917,
"learning_rate": 8.172067096478395e-05,
"loss": 0.2008,
"step": 1480
},
{
"epoch": 2.134839519454904,
"grad_norm": 0.23605150518346998,
"learning_rate": 8.139741408784764e-05,
"loss": 0.2019,
"step": 1490
},
{
"epoch": 2.149184149184149,
"grad_norm": 0.2546177191590111,
"learning_rate": 8.107197579319082e-05,
"loss": 0.2053,
"step": 1500
},
{
"epoch": 2.163528778913394,
"grad_norm": 0.2353676242323245,
"learning_rate": 8.074437869149288e-05,
"loss": 0.204,
"step": 1510
},
{
"epoch": 2.1778734086426392,
"grad_norm": 0.23401952893152606,
"learning_rate": 8.041464554342197e-05,
"loss": 0.2036,
"step": 1520
},
{
"epoch": 2.1922180383718843,
"grad_norm": 0.23141975512545726,
"learning_rate": 8.008279925805366e-05,
"loss": 0.2033,
"step": 1530
},
{
"epoch": 2.2065626681011294,
"grad_norm": 0.23587920943899052,
"learning_rate": 7.974886289127927e-05,
"loss": 0.2068,
"step": 1540
},
{
"epoch": 2.2209072978303745,
"grad_norm": 0.2394814609218661,
"learning_rate": 7.941285964420407e-05,
"loss": 0.2049,
"step": 1550
},
{
"epoch": 2.23525192755962,
"grad_norm": 0.2389110148718096,
"learning_rate": 7.907481286153516e-05,
"loss": 0.2116,
"step": 1560
},
{
"epoch": 2.249596557288865,
"grad_norm": 0.2282395291006806,
"learning_rate": 7.873474602995973e-05,
"loss": 0.2088,
"step": 1570
},
{
"epoch": 2.2639411870181103,
"grad_norm": 0.23275397540700887,
"learning_rate": 7.839268277651311e-05,
"loss": 0.2092,
"step": 1580
},
{
"epoch": 2.2782858167473554,
"grad_norm": 0.22624466416184327,
"learning_rate": 7.80486468669373e-05,
"loss": 0.2088,
"step": 1590
},
{
"epoch": 2.2926304464766005,
"grad_norm": 0.23126585599149074,
"learning_rate": 7.770266220402977e-05,
"loss": 0.2117,
"step": 1600
},
{
"epoch": 2.3069750762058456,
"grad_norm": 0.226948134461606,
"learning_rate": 7.735475282598271e-05,
"loss": 0.2097,
"step": 1610
},
{
"epoch": 2.3213197059350907,
"grad_norm": 0.22673465714008167,
"learning_rate": 7.700494290471296e-05,
"loss": 0.2104,
"step": 1620
},
{
"epoch": 2.335664335664336,
"grad_norm": 0.2556824784968339,
"learning_rate": 7.665325674418264e-05,
"loss": 0.2136,
"step": 1630
},
{
"epoch": 2.350008965393581,
"grad_norm": 0.25025658975825976,
"learning_rate": 7.629971877871039e-05,
"loss": 0.2084,
"step": 1640
},
{
"epoch": 2.364353595122826,
"grad_norm": 0.22536490579422702,
"learning_rate": 7.594435357127399e-05,
"loss": 0.2089,
"step": 1650
},
{
"epoch": 2.378698224852071,
"grad_norm": 0.2258065984765025,
"learning_rate": 7.558718581180355e-05,
"loss": 0.2067,
"step": 1660
},
{
"epoch": 2.3930428545813163,
"grad_norm": 0.2464593742203822,
"learning_rate": 7.522824031546629e-05,
"loss": 0.2137,
"step": 1670
},
{
"epoch": 2.4073874843105614,
"grad_norm": 0.24123071412945177,
"learning_rate": 7.486754202094229e-05,
"loss": 0.2115,
"step": 1680
},
{
"epoch": 2.4217321140398065,
"grad_norm": 0.23105649429700748,
"learning_rate": 7.450511598869194e-05,
"loss": 0.2138,
"step": 1690
},
{
"epoch": 2.4360767437690516,
"grad_norm": 0.22955721039077792,
"learning_rate": 7.414098739921471e-05,
"loss": 0.2125,
"step": 1700
},
{
"epoch": 2.4504213734982967,
"grad_norm": 0.23154193335740872,
"learning_rate": 7.377518155129973e-05,
"loss": 0.2183,
"step": 1710
},
{
"epoch": 2.464766003227542,
"grad_norm": 0.2340236121998045,
"learning_rate": 7.340772386026801e-05,
"loss": 0.2157,
"step": 1720
},
{
"epoch": 2.479110632956787,
"grad_norm": 0.2250255353665983,
"learning_rate": 7.303863985620676e-05,
"loss": 0.2123,
"step": 1730
},
{
"epoch": 2.493455262686032,
"grad_norm": 0.2283114308365594,
"learning_rate": 7.266795518219548e-05,
"loss": 0.2135,
"step": 1740
},
{
"epoch": 2.507799892415277,
"grad_norm": 0.23546636465212323,
"learning_rate": 7.22956955925245e-05,
"loss": 0.214,
"step": 1750
},
{
"epoch": 2.5221445221445222,
"grad_norm": 0.23275268765839288,
"learning_rate": 7.192188695090545e-05,
"loss": 0.2156,
"step": 1760
},
{
"epoch": 2.5364891518737673,
"grad_norm": 0.2457436947556184,
"learning_rate": 7.154655522867452e-05,
"loss": 0.2189,
"step": 1770
},
{
"epoch": 2.5508337816030124,
"grad_norm": 0.2385729628030818,
"learning_rate": 7.116972650298782e-05,
"loss": 0.2148,
"step": 1780
},
{
"epoch": 2.5651784113322575,
"grad_norm": 0.2382827317725779,
"learning_rate": 7.079142695500975e-05,
"loss": 0.2127,
"step": 1790
},
{
"epoch": 2.5795230410615027,
"grad_norm": 0.22496477508883403,
"learning_rate": 7.041168286809397e-05,
"loss": 0.2156,
"step": 1800
},
{
"epoch": 2.5938676707907478,
"grad_norm": 0.2337756123669142,
"learning_rate": 7.00305206259572e-05,
"loss": 0.2163,
"step": 1810
},
{
"epoch": 2.608212300519993,
"grad_norm": 0.23547675501490803,
"learning_rate": 6.964796671084631e-05,
"loss": 0.213,
"step": 1820
},
{
"epoch": 2.622556930249238,
"grad_norm": 0.236949625863052,
"learning_rate": 6.926404770169819e-05,
"loss": 0.2108,
"step": 1830
},
{
"epoch": 2.636901559978483,
"grad_norm": 0.22775808389184637,
"learning_rate": 6.887879027229332e-05,
"loss": 0.2131,
"step": 1840
},
{
"epoch": 2.651246189707728,
"grad_norm": 0.25558095929144115,
"learning_rate": 6.84922211894024e-05,
"loss": 0.2146,
"step": 1850
},
{
"epoch": 2.6655908194369733,
"grad_norm": 0.23865636643565702,
"learning_rate": 6.810436731092671e-05,
"loss": 0.2154,
"step": 1860
},
{
"epoch": 2.6799354491662184,
"grad_norm": 0.23347390914436725,
"learning_rate": 6.771525558403203e-05,
"loss": 0.2145,
"step": 1870
},
{
"epoch": 2.6942800788954635,
"grad_norm": 0.2311770851119529,
"learning_rate": 6.73249130432765e-05,
"loss": 0.2112,
"step": 1880
},
{
"epoch": 2.7086247086247086,
"grad_norm": 0.2326246785839781,
"learning_rate": 6.69333668087323e-05,
"loss": 0.2133,
"step": 1890
},
{
"epoch": 2.7229693383539537,
"grad_norm": 0.23563376415545254,
"learning_rate": 6.654064408410132e-05,
"loss": 0.2141,
"step": 1900
},
{
"epoch": 2.737313968083199,
"grad_norm": 0.2298522950109398,
"learning_rate": 6.614677215482527e-05,
"loss": 0.2142,
"step": 1910
},
{
"epoch": 2.751658597812444,
"grad_norm": 0.2364865163676101,
"learning_rate": 6.57517783861898e-05,
"loss": 0.2127,
"step": 1920
},
{
"epoch": 2.766003227541689,
"grad_norm": 0.22837021217881728,
"learning_rate": 6.535569022142335e-05,
"loss": 0.2145,
"step": 1930
},
{
"epoch": 2.780347857270934,
"grad_norm": 0.22749769763881308,
"learning_rate": 6.495853517979035e-05,
"loss": 0.2106,
"step": 1940
},
{
"epoch": 2.7946924870001792,
"grad_norm": 0.21764981978938533,
"learning_rate": 6.456034085467935e-05,
"loss": 0.2125,
"step": 1950
},
{
"epoch": 2.8090371167294244,
"grad_norm": 0.22774012921821585,
"learning_rate": 6.416113491168581e-05,
"loss": 0.213,
"step": 1960
},
{
"epoch": 2.8233817464586695,
"grad_norm": 0.22793686074861258,
"learning_rate": 6.376094508668999e-05,
"loss": 0.2116,
"step": 1970
},
{
"epoch": 2.8377263761879146,
"grad_norm": 0.24345345462191187,
"learning_rate": 6.335979918392999e-05,
"loss": 0.213,
"step": 1980
},
{
"epoch": 2.8520710059171597,
"grad_norm": 0.230566718186529,
"learning_rate": 6.295772507406982e-05,
"loss": 0.2123,
"step": 1990
},
{
"epoch": 2.866415635646405,
"grad_norm": 0.23922165240449358,
"learning_rate": 6.255475069226326e-05,
"loss": 0.211,
"step": 2000
},
{
"epoch": 2.88076026537565,
"grad_norm": 0.22058336484670613,
"learning_rate": 6.21509040362127e-05,
"loss": 0.2122,
"step": 2010
},
{
"epoch": 2.895104895104895,
"grad_norm": 0.2272702011851071,
"learning_rate": 6.174621316422417e-05,
"loss": 0.2147,
"step": 2020
},
{
"epoch": 2.90944952483414,
"grad_norm": 0.23799805104509125,
"learning_rate": 6.134070619325774e-05,
"loss": 0.212,
"step": 2030
},
{
"epoch": 2.923794154563385,
"grad_norm": 0.24608349983752625,
"learning_rate": 6.0934411296974184e-05,
"loss": 0.2122,
"step": 2040
},
{
"epoch": 2.9381387842926303,
"grad_norm": 0.23079480496683127,
"learning_rate": 6.052735670377736e-05,
"loss": 0.211,
"step": 2050
},
{
"epoch": 2.9524834140218754,
"grad_norm": 0.22680559271715478,
"learning_rate": 6.0119570694853155e-05,
"loss": 0.2102,
"step": 2060
},
{
"epoch": 2.9668280437511205,
"grad_norm": 0.22760761484882805,
"learning_rate": 5.97110816022044e-05,
"loss": 0.2113,
"step": 2070
},
{
"epoch": 2.9811726734803656,
"grad_norm": 0.23303799910976278,
"learning_rate": 5.930191780668258e-05,
"loss": 0.2088,
"step": 2080
},
{
"epoch": 2.9955173032096107,
"grad_norm": 0.22946738031807773,
"learning_rate": 5.88921077360159e-05,
"loss": 0.2097,
"step": 2090
},
{
"epoch": 3.008606777837547,
"grad_norm": 0.2697620900381124,
"learning_rate": 5.848167986283421e-05,
"loss": 0.1134,
"step": 2100
},
{
"epoch": 3.0229514075667923,
"grad_norm": 0.1885938841096422,
"learning_rate": 5.807066270269084e-05,
"loss": 0.0763,
"step": 2110
},
{
"epoch": 3.0372960372960374,
"grad_norm": 0.214693696805492,
"learning_rate": 5.765908481208139e-05,
"loss": 0.0756,
"step": 2120
},
{
"epoch": 3.0516406670252825,
"grad_norm": 0.2339101871402584,
"learning_rate": 5.724697478645963e-05,
"loss": 0.0744,
"step": 2130
},
{
"epoch": 3.0659852967545276,
"grad_norm": 0.1971755620952271,
"learning_rate": 5.6834361258250844e-05,
"loss": 0.072,
"step": 2140
},
{
"epoch": 3.0803299264837727,
"grad_norm": 0.1981153430750115,
"learning_rate": 5.642127289486246e-05,
"loss": 0.0748,
"step": 2150
},
{
"epoch": 3.094674556213018,
"grad_norm": 0.2172902671287561,
"learning_rate": 5.600773839669237e-05,
"loss": 0.0726,
"step": 2160
},
{
"epoch": 3.109019185942263,
"grad_norm": 0.19669334061877888,
"learning_rate": 5.559378649513478e-05,
"loss": 0.0733,
"step": 2170
},
{
"epoch": 3.123363815671508,
"grad_norm": 0.21027113699329436,
"learning_rate": 5.517944595058413e-05,
"loss": 0.0746,
"step": 2180
},
{
"epoch": 3.137708445400753,
"grad_norm": 0.20204087893287273,
"learning_rate": 5.476474555043688e-05,
"loss": 0.0748,
"step": 2190
},
{
"epoch": 3.152053075129998,
"grad_norm": 0.20638588917150788,
"learning_rate": 5.4349714107091335e-05,
"loss": 0.0744,
"step": 2200
},
{
"epoch": 3.1663977048592433,
"grad_norm": 0.20367882761147596,
"learning_rate": 5.393438045594595e-05,
"loss": 0.0755,
"step": 2210
},
{
"epoch": 3.1807423345884884,
"grad_norm": 0.20836979312681028,
"learning_rate": 5.351877345339583e-05,
"loss": 0.076,
"step": 2220
},
{
"epoch": 3.1950869643177335,
"grad_norm": 0.19643695987807314,
"learning_rate": 5.310292197482791e-05,
"loss": 0.0733,
"step": 2230
},
{
"epoch": 3.2094315940469786,
"grad_norm": 0.20621763947145422,
"learning_rate": 5.268685491261472e-05,
"loss": 0.075,
"step": 2240
},
{
"epoch": 3.2237762237762237,
"grad_norm": 0.20777873086593704,
"learning_rate": 5.227060117410702e-05,
"loss": 0.0746,
"step": 2250
},
{
"epoch": 3.238120853505469,
"grad_norm": 0.2021910407099938,
"learning_rate": 5.185418967962543e-05,
"loss": 0.0747,
"step": 2260
},
{
"epoch": 3.252465483234714,
"grad_norm": 0.2016612434414281,
"learning_rate": 5.143764936045106e-05,
"loss": 0.0743,
"step": 2270
},
{
"epoch": 3.266810112963959,
"grad_norm": 0.2180992659409795,
"learning_rate": 5.1021009156815414e-05,
"loss": 0.0744,
"step": 2280
},
{
"epoch": 3.281154742693204,
"grad_norm": 0.2056058145565962,
"learning_rate": 5.060429801588983e-05,
"loss": 0.0744,
"step": 2290
},
{
"epoch": 3.2954993724224493,
"grad_norm": 0.20164051829762908,
"learning_rate": 5.018754488977409e-05,
"loss": 0.0745,
"step": 2300
},
{
"epoch": 3.3098440021516944,
"grad_norm": 0.2026538165933443,
"learning_rate": 4.9770778733485065e-05,
"loss": 0.074,
"step": 2310
},
{
"epoch": 3.3241886318809395,
"grad_norm": 0.20427324673762595,
"learning_rate": 4.935402850294494e-05,
"loss": 0.0739,
"step": 2320
},
{
"epoch": 3.3385332616101846,
"grad_norm": 0.20831211218540635,
"learning_rate": 4.893732315296942e-05,
"loss": 0.0748,
"step": 2330
},
{
"epoch": 3.3528778913394297,
"grad_norm": 0.20740018500070947,
"learning_rate": 4.852069163525595e-05,
"loss": 0.0737,
"step": 2340
},
{
"epoch": 3.367222521068675,
"grad_norm": 0.20060155886370676,
"learning_rate": 4.810416289637234e-05,
"loss": 0.0729,
"step": 2350
},
{
"epoch": 3.38156715079792,
"grad_norm": 0.199826847154071,
"learning_rate": 4.7687765875745574e-05,
"loss": 0.0739,
"step": 2360
},
{
"epoch": 3.395911780527165,
"grad_norm": 0.20063705204581495,
"learning_rate": 4.727152950365117e-05,
"loss": 0.0737,
"step": 2370
},
{
"epoch": 3.41025641025641,
"grad_norm": 0.20947972363514977,
"learning_rate": 4.685548269920312e-05,
"loss": 0.0736,
"step": 2380
},
{
"epoch": 3.4246010399856552,
"grad_norm": 0.2006701925989043,
"learning_rate": 4.643965436834474e-05,
"loss": 0.075,
"step": 2390
},
{
"epoch": 3.4389456697149003,
"grad_norm": 0.20335025504735554,
"learning_rate": 4.6024073401840336e-05,
"loss": 0.0745,
"step": 2400
},
{
"epoch": 3.4532902994441455,
"grad_norm": 0.2192162565442083,
"learning_rate": 4.560876867326791e-05,
"loss": 0.0738,
"step": 2410
},
{
"epoch": 3.4676349291733906,
"grad_norm": 0.19858055523329815,
"learning_rate": 4.5193769037013066e-05,
"loss": 0.0732,
"step": 2420
},
{
"epoch": 3.4819795589026357,
"grad_norm": 0.20485303414115183,
"learning_rate": 4.477910332626438e-05,
"loss": 0.0728,
"step": 2430
},
{
"epoch": 3.4963241886318808,
"grad_norm": 0.19011594248287844,
"learning_rate": 4.4364800351010066e-05,
"loss": 0.0726,
"step": 2440
},
{
"epoch": 3.5106688183611263,
"grad_norm": 0.20410603253979742,
"learning_rate": 4.395088889603633e-05,
"loss": 0.0736,
"step": 2450
},
{
"epoch": 3.5250134480903714,
"grad_norm": 0.1994983957599032,
"learning_rate": 4.353739771892746e-05,
"loss": 0.073,
"step": 2460
},
{
"epoch": 3.5393580778196165,
"grad_norm": 0.20349060401414618,
"learning_rate": 4.312435554806787e-05,
"loss": 0.0736,
"step": 2470
},
{
"epoch": 3.5537027075488616,
"grad_norm": 0.20221336765718947,
"learning_rate": 4.271179108064605e-05,
"loss": 0.0713,
"step": 2480
},
{
"epoch": 3.5680473372781067,
"grad_norm": 0.1920539935100462,
"learning_rate": 4.229973298066083e-05,
"loss": 0.0714,
"step": 2490
},
{
"epoch": 3.582391967007352,
"grad_norm": 0.18514594535819984,
"learning_rate": 4.188820987692981e-05,
"loss": 0.0716,
"step": 2500
},
{
"epoch": 3.596736596736597,
"grad_norm": 0.19390555703637974,
"learning_rate": 4.1477250361100317e-05,
"loss": 0.072,
"step": 2510
},
{
"epoch": 3.611081226465842,
"grad_norm": 0.19881724163942532,
"learning_rate": 4.106688298566295e-05,
"loss": 0.0722,
"step": 2520
},
{
"epoch": 3.625425856195087,
"grad_norm": 0.19864848134388496,
"learning_rate": 4.065713626196778e-05,
"loss": 0.0697,
"step": 2530
},
{
"epoch": 3.6397704859243323,
"grad_norm": 0.20964033399772472,
"learning_rate": 4.0248038658243515e-05,
"loss": 0.0703,
"step": 2540
},
{
"epoch": 3.6541151156535774,
"grad_norm": 0.1887224816930325,
"learning_rate": 3.983961859761946e-05,
"loss": 0.071,
"step": 2550
},
{
"epoch": 3.6684597453828225,
"grad_norm": 0.1939910437911645,
"learning_rate": 3.9431904456150914e-05,
"loss": 0.0685,
"step": 2560
},
{
"epoch": 3.6828043751120676,
"grad_norm": 0.1905566250106664,
"learning_rate": 3.902492456084757e-05,
"loss": 0.0709,
"step": 2570
},
{
"epoch": 3.6971490048413127,
"grad_norm": 0.1954219594857734,
"learning_rate": 3.861870718770545e-05,
"loss": 0.0691,
"step": 2580
},
{
"epoch": 3.711493634570558,
"grad_norm": 0.20129771340548336,
"learning_rate": 3.821328055974231e-05,
"loss": 0.0688,
"step": 2590
},
{
"epoch": 3.725838264299803,
"grad_norm": 0.19424451985885532,
"learning_rate": 3.780867284503685e-05,
"loss": 0.0705,
"step": 2600
},
{
"epoch": 3.740182894029048,
"grad_norm": 0.19307288848206286,
"learning_rate": 3.7404912154771626e-05,
"loss": 0.069,
"step": 2610
},
{
"epoch": 3.754527523758293,
"grad_norm": 0.20224114631498458,
"learning_rate": 3.7002026541279905e-05,
"loss": 0.069,
"step": 2620
},
{
"epoch": 3.7688721534875382,
"grad_norm": 0.19645086260070405,
"learning_rate": 3.660004399609675e-05,
"loss": 0.0693,
"step": 2630
},
{
"epoch": 3.7832167832167833,
"grad_norm": 0.2009057118393354,
"learning_rate": 3.619899244801414e-05,
"loss": 0.0695,
"step": 2640
},
{
"epoch": 3.7975614129460284,
"grad_norm": 0.20154345565922616,
"learning_rate": 3.5798899761140626e-05,
"loss": 0.0688,
"step": 2650
},
{
"epoch": 3.8119060426752736,
"grad_norm": 0.19819908788727933,
"learning_rate": 3.5399793732965324e-05,
"loss": 0.0703,
"step": 2660
},
{
"epoch": 3.8262506724045187,
"grad_norm": 0.19579772630914064,
"learning_rate": 3.500170209242671e-05,
"loss": 0.0673,
"step": 2670
},
{
"epoch": 3.8405953021337638,
"grad_norm": 0.1930709905078437,
"learning_rate": 3.460465249798592e-05,
"loss": 0.068,
"step": 2680
},
{
"epoch": 3.854939931863009,
"grad_norm": 0.19375769656837338,
"learning_rate": 3.420867253570529e-05,
"loss": 0.0668,
"step": 2690
},
{
"epoch": 3.869284561592254,
"grad_norm": 0.19590535607906298,
"learning_rate": 3.381378971733161e-05,
"loss": 0.0658,
"step": 2700
},
{
"epoch": 3.883629191321499,
"grad_norm": 0.19485732453673113,
"learning_rate": 3.342003147838475e-05,
"loss": 0.0671,
"step": 2710
},
{
"epoch": 3.897973821050744,
"grad_norm": 0.19120048275674587,
"learning_rate": 3.302742517625144e-05,
"loss": 0.0665,
"step": 2720
},
{
"epoch": 3.9123184507799893,
"grad_norm": 0.19464302345990753,
"learning_rate": 3.2635998088284596e-05,
"loss": 0.0662,
"step": 2730
},
{
"epoch": 3.9266630805092344,
"grad_norm": 0.20017821890333443,
"learning_rate": 3.224577740990814e-05,
"loss": 0.0655,
"step": 2740
},
{
"epoch": 3.9410077102384795,
"grad_norm": 0.18866754533216776,
"learning_rate": 3.185679025272753e-05,
"loss": 0.0663,
"step": 2750
},
{
"epoch": 3.9553523399677246,
"grad_norm": 0.19243520237850759,
"learning_rate": 3.146906364264606e-05,
"loss": 0.0657,
"step": 2760
},
{
"epoch": 3.9696969696969697,
"grad_norm": 0.1924186499329835,
"learning_rate": 3.108262451798724e-05,
"loss": 0.0651,
"step": 2770
},
{
"epoch": 3.984041599426215,
"grad_norm": 0.21148279808210507,
"learning_rate": 3.069749972762316e-05,
"loss": 0.0648,
"step": 2780
},
{
"epoch": 3.99838622915546,
"grad_norm": 0.19991640585361364,
"learning_rate": 3.0313716029109064e-05,
"loss": 0.0645,
"step": 2790
},
{
"epoch": 4.011475703783396,
"grad_norm": 0.13064534629220334,
"learning_rate": 2.993130008682436e-05,
"loss": 0.0228,
"step": 2800
},
{
"epoch": 4.025820333512641,
"grad_norm": 0.14874535281957305,
"learning_rate": 2.955027847011993e-05,
"loss": 0.0176,
"step": 2810
},
{
"epoch": 4.040164963241886,
"grad_norm": 0.14336180228683498,
"learning_rate": 2.917067765147229e-05,
"loss": 0.0176,
"step": 2820
},
{
"epoch": 4.054509592971131,
"grad_norm": 0.12559441494646076,
"learning_rate": 2.8792524004644283e-05,
"loss": 0.0167,
"step": 2830
},
{
"epoch": 4.068854222700376,
"grad_norm": 0.12484448147694403,
"learning_rate": 2.8415843802852672e-05,
"loss": 0.0167,
"step": 2840
},
{
"epoch": 4.083198852429621,
"grad_norm": 0.1337296091314726,
"learning_rate": 2.8040663216942752e-05,
"loss": 0.0169,
"step": 2850
},
{
"epoch": 4.0975434821588665,
"grad_norm": 0.12242456577697475,
"learning_rate": 2.7667008313570076e-05,
"loss": 0.0161,
"step": 2860
},
{
"epoch": 4.111888111888112,
"grad_norm": 0.13243095768870966,
"learning_rate": 2.729490505338943e-05,
"loss": 0.0161,
"step": 2870
},
{
"epoch": 4.126232741617357,
"grad_norm": 0.12277718816926177,
"learning_rate": 2.692437928925109e-05,
"loss": 0.0157,
"step": 2880
},
{
"epoch": 4.140577371346602,
"grad_norm": 0.137540991678628,
"learning_rate": 2.655545676440464e-05,
"loss": 0.0159,
"step": 2890
},
{
"epoch": 4.154922001075847,
"grad_norm": 0.13131712471544715,
"learning_rate": 2.6188163110710435e-05,
"loss": 0.0161,
"step": 2900
},
{
"epoch": 4.169266630805092,
"grad_norm": 0.13640835620647865,
"learning_rate": 2.582252384685874e-05,
"loss": 0.0164,
"step": 2910
},
{
"epoch": 4.183611260534337,
"grad_norm": 0.12543818653508698,
"learning_rate": 2.5458564376596732e-05,
"loss": 0.0157,
"step": 2920
},
{
"epoch": 4.197955890263582,
"grad_norm": 0.11736167137678152,
"learning_rate": 2.509630998696349e-05,
"loss": 0.0154,
"step": 2930
},
{
"epoch": 4.212300519992827,
"grad_norm": 0.1245551329001544,
"learning_rate": 2.473578584653321e-05,
"loss": 0.0152,
"step": 2940
},
{
"epoch": 4.2266451497220725,
"grad_norm": 0.12763631881000323,
"learning_rate": 2.4377017003666413e-05,
"loss": 0.0155,
"step": 2950
},
{
"epoch": 4.240989779451318,
"grad_norm": 0.13313595943206588,
"learning_rate": 2.4020028384769795e-05,
"loss": 0.015,
"step": 2960
},
{
"epoch": 4.255334409180563,
"grad_norm": 0.125069284406997,
"learning_rate": 2.366484479256425e-05,
"loss": 0.015,
"step": 2970
},
{
"epoch": 4.269679038909808,
"grad_norm": 0.13167131954772826,
"learning_rate": 2.3311490904361738e-05,
"loss": 0.0159,
"step": 2980
},
{
"epoch": 4.284023668639053,
"grad_norm": 0.11344149792986571,
"learning_rate": 2.295999127035071e-05,
"loss": 0.0147,
"step": 2990
},
{
"epoch": 4.298368298368298,
"grad_norm": 0.1299095136285245,
"learning_rate": 2.26103703118905e-05,
"loss": 0.015,
"step": 3000
}
],
"logging_steps": 10,
"max_steps": 4188,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 940152769216512.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}