{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.298368298368298, "eval_steps": 500, "global_step": 3000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014344629729245113, "grad_norm": 2.1563808263401123, "learning_rate": 2.1479713603818614e-06, "loss": 0.9832, "step": 10 }, { "epoch": 0.028689259458490227, "grad_norm": 1.5376642197375787, "learning_rate": 4.5346062052505965e-06, "loss": 0.8531, "step": 20 }, { "epoch": 0.04303388918773534, "grad_norm": 0.7108211564362369, "learning_rate": 6.921241050119331e-06, "loss": 0.7191, "step": 30 }, { "epoch": 0.05737851891698045, "grad_norm": 0.5064006283904037, "learning_rate": 9.307875894988068e-06, "loss": 0.6532, "step": 40 }, { "epoch": 0.07172314864622557, "grad_norm": 0.5016524926299113, "learning_rate": 1.1694510739856803e-05, "loss": 0.6245, "step": 50 }, { "epoch": 0.08606777837547068, "grad_norm": 0.5064320977744877, "learning_rate": 1.4081145584725539e-05, "loss": 0.6017, "step": 60 }, { "epoch": 0.1004124081047158, "grad_norm": 0.4525315758811991, "learning_rate": 1.6467780429594274e-05, "loss": 0.5788, "step": 70 }, { "epoch": 0.1147570378339609, "grad_norm": 0.4891140282424216, "learning_rate": 1.885441527446301e-05, "loss": 0.5726, "step": 80 }, { "epoch": 0.129101667563206, "grad_norm": 0.5816465774117703, "learning_rate": 2.1241050119331742e-05, "loss": 0.5669, "step": 90 }, { "epoch": 0.14344629729245115, "grad_norm": 0.5416459422496235, "learning_rate": 2.3627684964200477e-05, "loss": 0.5621, "step": 100 }, { "epoch": 0.15779092702169625, "grad_norm": 0.5501550374753971, "learning_rate": 2.6014319809069216e-05, "loss": 0.5543, "step": 110 }, { "epoch": 0.17213555675094136, "grad_norm": 0.48897325487812016, "learning_rate": 2.840095465393795e-05, "loss": 0.5519, "step": 120 }, { "epoch": 0.1864801864801865, "grad_norm": 0.5193769497689257, "learning_rate": 3.0787589498806684e-05, "loss": 0.546, "step": 130 }, { "epoch": 0.2008248162094316, "grad_norm": 0.5100064220076413, "learning_rate": 3.3174224343675416e-05, "loss": 0.5454, "step": 140 }, { "epoch": 0.2151694459386767, "grad_norm": 0.4992592686432826, "learning_rate": 3.5560859188544155e-05, "loss": 0.5437, "step": 150 }, { "epoch": 0.2295140756679218, "grad_norm": 0.4838403686730783, "learning_rate": 3.794749403341289e-05, "loss": 0.5412, "step": 160 }, { "epoch": 0.24385870539716695, "grad_norm": 0.45590237103117387, "learning_rate": 4.0334128878281626e-05, "loss": 0.541, "step": 170 }, { "epoch": 0.258203335126412, "grad_norm": 0.44189108119445925, "learning_rate": 4.272076372315036e-05, "loss": 0.5328, "step": 180 }, { "epoch": 0.2725479648556572, "grad_norm": 0.4596950199469469, "learning_rate": 4.510739856801909e-05, "loss": 0.5328, "step": 190 }, { "epoch": 0.2868925945849023, "grad_norm": 0.4232582169854354, "learning_rate": 4.749403341288783e-05, "loss": 0.5331, "step": 200 }, { "epoch": 0.3012372243141474, "grad_norm": 0.47183677702470816, "learning_rate": 4.988066825775656e-05, "loss": 0.5405, "step": 210 }, { "epoch": 0.3155818540433925, "grad_norm": 0.510543623904468, "learning_rate": 5.22673031026253e-05, "loss": 0.5448, "step": 220 }, { "epoch": 0.3299264837726376, "grad_norm": 0.49518213450267645, "learning_rate": 5.465393794749404e-05, "loss": 0.5441, "step": 230 }, { "epoch": 0.3442711135018827, "grad_norm": 0.4081838992878798, "learning_rate": 5.7040572792362765e-05, "loss": 0.5342, "step": 240 }, { "epoch": 0.3586157432311278, "grad_norm": 0.45201996051478616, "learning_rate": 5.942720763723151e-05, "loss": 0.5379, "step": 250 }, { "epoch": 0.372960372960373, "grad_norm": 0.5163583891265898, "learning_rate": 6.181384248210024e-05, "loss": 0.5352, "step": 260 }, { "epoch": 0.3873050026896181, "grad_norm": 0.43220360318869133, "learning_rate": 6.420047732696898e-05, "loss": 0.5319, "step": 270 }, { "epoch": 0.4016496324188632, "grad_norm": 0.39341356230222857, "learning_rate": 6.65871121718377e-05, "loss": 0.531, "step": 280 }, { "epoch": 0.4159942621481083, "grad_norm": 0.4843450935832954, "learning_rate": 6.897374701670645e-05, "loss": 0.5247, "step": 290 }, { "epoch": 0.4303388918773534, "grad_norm": 0.43790304963108484, "learning_rate": 7.136038186157519e-05, "loss": 0.535, "step": 300 }, { "epoch": 0.4446835216065985, "grad_norm": 0.3679883238347063, "learning_rate": 7.374701670644391e-05, "loss": 0.529, "step": 310 }, { "epoch": 0.4590281513358436, "grad_norm": 0.38496528216858267, "learning_rate": 7.613365155131266e-05, "loss": 0.5358, "step": 320 }, { "epoch": 0.47337278106508873, "grad_norm": 0.3554015988338894, "learning_rate": 7.852028639618139e-05, "loss": 0.5366, "step": 330 }, { "epoch": 0.4877174107943339, "grad_norm": 0.3647316947863913, "learning_rate": 8.090692124105012e-05, "loss": 0.5305, "step": 340 }, { "epoch": 0.502062040523579, "grad_norm": 0.40796953260473273, "learning_rate": 8.329355608591885e-05, "loss": 0.5361, "step": 350 }, { "epoch": 0.516406670252824, "grad_norm": 0.345466577791704, "learning_rate": 8.56801909307876e-05, "loss": 0.5317, "step": 360 }, { "epoch": 0.5307512999820692, "grad_norm": 0.3897824320306703, "learning_rate": 8.806682577565633e-05, "loss": 0.5384, "step": 370 }, { "epoch": 0.5450959297113144, "grad_norm": 0.39750641265021325, "learning_rate": 9.045346062052506e-05, "loss": 0.5344, "step": 380 }, { "epoch": 0.5594405594405595, "grad_norm": 0.3294501292354428, "learning_rate": 9.28400954653938e-05, "loss": 0.531, "step": 390 }, { "epoch": 0.5737851891698046, "grad_norm": 0.3325704984940842, "learning_rate": 9.522673031026254e-05, "loss": 0.5282, "step": 400 }, { "epoch": 0.5881298188990497, "grad_norm": 0.37577078172283135, "learning_rate": 9.761336515513126e-05, "loss": 0.5289, "step": 410 }, { "epoch": 0.6024744486282948, "grad_norm": 0.32982285395946315, "learning_rate": 0.0001, "loss": 0.5326, "step": 420 }, { "epoch": 0.6168190783575399, "grad_norm": 0.315570817514683, "learning_rate": 9.999826305940802e-05, "loss": 0.5276, "step": 430 }, { "epoch": 0.631163708086785, "grad_norm": 0.32873479872603517, "learning_rate": 9.99930523583106e-05, "loss": 0.5329, "step": 440 }, { "epoch": 0.6455083378160301, "grad_norm": 0.3754610016068392, "learning_rate": 9.998436825873485e-05, "loss": 0.5339, "step": 450 }, { "epoch": 0.6598529675452752, "grad_norm": 0.2973537072730961, "learning_rate": 9.997221136403139e-05, "loss": 0.5249, "step": 460 }, { "epoch": 0.6741975972745203, "grad_norm": 0.33421087994681, "learning_rate": 9.995658251883237e-05, "loss": 0.5196, "step": 470 }, { "epoch": 0.6885422270037654, "grad_norm": 0.3120052809857615, "learning_rate": 9.993748280899279e-05, "loss": 0.5236, "step": 480 }, { "epoch": 0.7028868567330105, "grad_norm": 0.28477505127058517, "learning_rate": 9.991491356151515e-05, "loss": 0.5166, "step": 490 }, { "epoch": 0.7172314864622557, "grad_norm": 0.3282614273702267, "learning_rate": 9.988887634445711e-05, "loss": 0.5191, "step": 500 }, { "epoch": 0.7315761161915008, "grad_norm": 0.3018575266984723, "learning_rate": 9.985937296682264e-05, "loss": 0.52, "step": 510 }, { "epoch": 0.745920745920746, "grad_norm": 0.322642800391225, "learning_rate": 9.982640547843628e-05, "loss": 0.5193, "step": 520 }, { "epoch": 0.7602653756499911, "grad_norm": 0.29334428049579575, "learning_rate": 9.978997616980083e-05, "loss": 0.5173, "step": 530 }, { "epoch": 0.7746100053792362, "grad_norm": 0.30663608653187874, "learning_rate": 9.975008757193805e-05, "loss": 0.514, "step": 540 }, { "epoch": 0.7889546351084813, "grad_norm": 0.2902575349921672, "learning_rate": 9.970674245621296e-05, "loss": 0.5173, "step": 550 }, { "epoch": 0.8032992648377264, "grad_norm": 0.2750627268123107, "learning_rate": 9.965994383414116e-05, "loss": 0.5124, "step": 560 }, { "epoch": 0.8176438945669715, "grad_norm": 0.31846444500772264, "learning_rate": 9.960969495717975e-05, "loss": 0.5105, "step": 570 }, { "epoch": 0.8319885242962166, "grad_norm": 0.2850087845127407, "learning_rate": 9.955599931650127e-05, "loss": 0.505, "step": 580 }, { "epoch": 0.8463331540254617, "grad_norm": 0.3011934822435399, "learning_rate": 9.949886064275123e-05, "loss": 0.4997, "step": 590 }, { "epoch": 0.8606777837547068, "grad_norm": 0.28048546158040877, "learning_rate": 9.943828290578892e-05, "loss": 0.5039, "step": 600 }, { "epoch": 0.8750224134839519, "grad_norm": 0.29482418586801606, "learning_rate": 9.937427031441152e-05, "loss": 0.5068, "step": 610 }, { "epoch": 0.889367043213197, "grad_norm": 0.2868511054090736, "learning_rate": 9.93068273160618e-05, "loss": 0.5041, "step": 620 }, { "epoch": 0.9037116729424421, "grad_norm": 0.26871712641833934, "learning_rate": 9.9235958596519e-05, "loss": 0.5031, "step": 630 }, { "epoch": 0.9180563026716873, "grad_norm": 0.27176006324351715, "learning_rate": 9.916166907957336e-05, "loss": 0.4998, "step": 640 }, { "epoch": 0.9324009324009324, "grad_norm": 0.2916123334399884, "learning_rate": 9.908396392668397e-05, "loss": 0.5045, "step": 650 }, { "epoch": 0.9467455621301775, "grad_norm": 0.28657648009371867, "learning_rate": 9.90028485366202e-05, "loss": 0.5005, "step": 660 }, { "epoch": 0.9610901918594227, "grad_norm": 0.2616587253845151, "learning_rate": 9.891832854508661e-05, "loss": 0.5017, "step": 670 }, { "epoch": 0.9754348215886678, "grad_norm": 0.2642546302377749, "learning_rate": 9.883040982433133e-05, "loss": 0.492, "step": 680 }, { "epoch": 0.9897794513179129, "grad_norm": 0.255936110537566, "learning_rate": 9.87390984827382e-05, "loss": 0.4934, "step": 690 }, { "epoch": 1.002868925945849, "grad_norm": 0.3099905060480918, "learning_rate": 9.864440086440223e-05, "loss": 0.43, "step": 700 }, { "epoch": 1.017213555675094, "grad_norm": 0.25984643430841164, "learning_rate": 9.854632354868889e-05, "loss": 0.3695, "step": 710 }, { "epoch": 1.0315581854043392, "grad_norm": 0.28715953895875523, "learning_rate": 9.844487334977705e-05, "loss": 0.3792, "step": 720 }, { "epoch": 1.0459028151335843, "grad_norm": 0.2649853690140653, "learning_rate": 9.834005731618543e-05, "loss": 0.3737, "step": 730 }, { "epoch": 1.0602474448628294, "grad_norm": 0.26573177260306496, "learning_rate": 9.823188273028297e-05, "loss": 0.3771, "step": 740 }, { "epoch": 1.0745920745920745, "grad_norm": 0.26574383245031147, "learning_rate": 9.812035710778283e-05, "loss": 0.3741, "step": 750 }, { "epoch": 1.0889367043213196, "grad_norm": 0.28123208245100456, "learning_rate": 9.800548819722026e-05, "loss": 0.3731, "step": 760 }, { "epoch": 1.1032813340505647, "grad_norm": 0.28349060376734364, "learning_rate": 9.78872839794142e-05, "loss": 0.3778, "step": 770 }, { "epoch": 1.11762596377981, "grad_norm": 0.28893208285124705, "learning_rate": 9.776575266691279e-05, "loss": 0.3806, "step": 780 }, { "epoch": 1.1319705935090552, "grad_norm": 0.2860386213893016, "learning_rate": 9.764090270342286e-05, "loss": 0.3799, "step": 790 }, { "epoch": 1.1463152232383003, "grad_norm": 0.2460375162339276, "learning_rate": 9.751274276322316e-05, "loss": 0.3898, "step": 800 }, { "epoch": 1.1606598529675454, "grad_norm": 0.251720247858439, "learning_rate": 9.738128175056179e-05, "loss": 0.3821, "step": 810 }, { "epoch": 1.1750044826967905, "grad_norm": 0.25278081761302046, "learning_rate": 9.724652879903751e-05, "loss": 0.3798, "step": 820 }, { "epoch": 1.1893491124260356, "grad_norm": 0.2599296471651565, "learning_rate": 9.71084932709652e-05, "loss": 0.3828, "step": 830 }, { "epoch": 1.2036937421552807, "grad_norm": 0.2455565423628766, "learning_rate": 9.696718475672532e-05, "loss": 0.3743, "step": 840 }, { "epoch": 1.2180383718845258, "grad_norm": 0.2888176539405626, "learning_rate": 9.682261307409766e-05, "loss": 0.381, "step": 850 }, { "epoch": 1.232383001613771, "grad_norm": 0.25471629336116824, "learning_rate": 9.667478826757916e-05, "loss": 0.3832, "step": 860 }, { "epoch": 1.246727631343016, "grad_norm": 0.2614351769400213, "learning_rate": 9.652372060768608e-05, "loss": 0.3848, "step": 870 }, { "epoch": 1.2610722610722611, "grad_norm": 0.26412023986782596, "learning_rate": 9.63694205902405e-05, "loss": 0.3855, "step": 880 }, { "epoch": 1.2754168908015062, "grad_norm": 0.2429603634039349, "learning_rate": 9.621189893564092e-05, "loss": 0.3819, "step": 890 }, { "epoch": 1.2897615205307513, "grad_norm": 0.26829134565306034, "learning_rate": 9.605116658811759e-05, "loss": 0.3906, "step": 900 }, { "epoch": 1.3041061502599964, "grad_norm": 0.27364921782590684, "learning_rate": 9.588723471497208e-05, "loss": 0.3848, "step": 910 }, { "epoch": 1.3184507799892415, "grad_norm": 0.2515405826777948, "learning_rate": 9.572011470580136e-05, "loss": 0.3899, "step": 920 }, { "epoch": 1.3327954097184866, "grad_norm": 0.2513235427794581, "learning_rate": 9.554981817170655e-05, "loss": 0.3912, "step": 930 }, { "epoch": 1.3471400394477318, "grad_norm": 0.24955670786044107, "learning_rate": 9.537635694448615e-05, "loss": 0.3849, "step": 940 }, { "epoch": 1.3614846691769769, "grad_norm": 0.24538389153554718, "learning_rate": 9.519974307581404e-05, "loss": 0.3867, "step": 950 }, { "epoch": 1.375829298906222, "grad_norm": 0.25856641290644117, "learning_rate": 9.50199888364021e-05, "loss": 0.3899, "step": 960 }, { "epoch": 1.390173928635467, "grad_norm": 0.26645029754281746, "learning_rate": 9.483710671514777e-05, "loss": 0.386, "step": 970 }, { "epoch": 1.4045185583647122, "grad_norm": 0.25589965865278824, "learning_rate": 9.465110941826622e-05, "loss": 0.3856, "step": 980 }, { "epoch": 1.4188631880939573, "grad_norm": 0.27815251555263987, "learning_rate": 9.446200986840765e-05, "loss": 0.3881, "step": 990 }, { "epoch": 1.4332078178232024, "grad_norm": 0.26197920796578195, "learning_rate": 9.426982120375943e-05, "loss": 0.3878, "step": 1000 }, { "epoch": 1.4475524475524475, "grad_norm": 0.2606524388643148, "learning_rate": 9.407455677713328e-05, "loss": 0.3883, "step": 1010 }, { "epoch": 1.4618970772816926, "grad_norm": 0.2331343629588404, "learning_rate": 9.387623015503753e-05, "loss": 0.3848, "step": 1020 }, { "epoch": 1.4762417070109377, "grad_norm": 0.25873984889230295, "learning_rate": 9.367485511673462e-05, "loss": 0.3895, "step": 1030 }, { "epoch": 1.4905863367401828, "grad_norm": 0.2531065339256471, "learning_rate": 9.347044565328367e-05, "loss": 0.3937, "step": 1040 }, { "epoch": 1.504930966469428, "grad_norm": 0.26767841128977615, "learning_rate": 9.326301596656846e-05, "loss": 0.3894, "step": 1050 }, { "epoch": 1.519275596198673, "grad_norm": 0.2472066267101206, "learning_rate": 9.30525804683107e-05, "loss": 0.3889, "step": 1060 }, { "epoch": 1.5336202259279181, "grad_norm": 0.24916114849718174, "learning_rate": 9.283915377906875e-05, "loss": 0.3874, "step": 1070 }, { "epoch": 1.5479648556571632, "grad_norm": 0.24586564647507672, "learning_rate": 9.262275072722181e-05, "loss": 0.3899, "step": 1080 }, { "epoch": 1.5623094853864083, "grad_norm": 0.24194199620674006, "learning_rate": 9.240338634793969e-05, "loss": 0.3867, "step": 1090 }, { "epoch": 1.5766541151156535, "grad_norm": 0.24712089444412166, "learning_rate": 9.218107588213813e-05, "loss": 0.3902, "step": 1100 }, { "epoch": 1.5909987448448986, "grad_norm": 0.24987071841082312, "learning_rate": 9.195583477542009e-05, "loss": 0.3851, "step": 1110 }, { "epoch": 1.6053433745741437, "grad_norm": 0.2453499914136973, "learning_rate": 9.172767867700236e-05, "loss": 0.3906, "step": 1120 }, { "epoch": 1.6196880043033888, "grad_norm": 0.2453362805106032, "learning_rate": 9.149662343862851e-05, "loss": 0.3905, "step": 1130 }, { "epoch": 1.6340326340326339, "grad_norm": 0.23102523441076062, "learning_rate": 9.126268511346744e-05, "loss": 0.3903, "step": 1140 }, { "epoch": 1.648377263761879, "grad_norm": 0.2542501707506408, "learning_rate": 9.102587995499807e-05, "loss": 0.3953, "step": 1150 }, { "epoch": 1.6627218934911243, "grad_norm": 0.23285474972918488, "learning_rate": 9.078622441588009e-05, "loss": 0.391, "step": 1160 }, { "epoch": 1.6770665232203694, "grad_norm": 0.24565957619352327, "learning_rate": 9.054373514681085e-05, "loss": 0.3923, "step": 1170 }, { "epoch": 1.6914111529496145, "grad_norm": 0.2506739557436684, "learning_rate": 9.029842899536853e-05, "loss": 0.3909, "step": 1180 }, { "epoch": 1.7057557826788596, "grad_norm": 0.2441587365940751, "learning_rate": 9.005032300484162e-05, "loss": 0.3915, "step": 1190 }, { "epoch": 1.7201004124081047, "grad_norm": 0.26421110183322566, "learning_rate": 8.979943441304473e-05, "loss": 0.3904, "step": 1200 }, { "epoch": 1.7344450421373498, "grad_norm": 0.24194171269463752, "learning_rate": 8.954578065112107e-05, "loss": 0.3892, "step": 1210 }, { "epoch": 1.748789671866595, "grad_norm": 0.23054663152441476, "learning_rate": 8.928937934233123e-05, "loss": 0.3907, "step": 1220 }, { "epoch": 1.76313430159584, "grad_norm": 0.2369813966398001, "learning_rate": 8.903024830082887e-05, "loss": 0.3849, "step": 1230 }, { "epoch": 1.7774789313250852, "grad_norm": 0.24008220076352446, "learning_rate": 8.876840553042296e-05, "loss": 0.3904, "step": 1240 }, { "epoch": 1.7918235610543303, "grad_norm": 0.23428608617416669, "learning_rate": 8.850386922332696e-05, "loss": 0.387, "step": 1250 }, { "epoch": 1.8061681907835754, "grad_norm": 0.23331291847215246, "learning_rate": 8.823665775889486e-05, "loss": 0.3909, "step": 1260 }, { "epoch": 1.8205128205128205, "grad_norm": 0.23850242149763548, "learning_rate": 8.796678970234427e-05, "loss": 0.3833, "step": 1270 }, { "epoch": 1.8348574502420656, "grad_norm": 0.2219682422982644, "learning_rate": 8.769428380346642e-05, "loss": 0.3845, "step": 1280 }, { "epoch": 1.8492020799713107, "grad_norm": 0.22216238994388604, "learning_rate": 8.741915899532362e-05, "loss": 0.3865, "step": 1290 }, { "epoch": 1.8635467097005558, "grad_norm": 0.22824929990219722, "learning_rate": 8.714143439293376e-05, "loss": 0.3852, "step": 1300 }, { "epoch": 1.8778913394298011, "grad_norm": 0.24280880504252028, "learning_rate": 8.686112929194226e-05, "loss": 0.3861, "step": 1310 }, { "epoch": 1.8922359691590462, "grad_norm": 0.24001704017165865, "learning_rate": 8.657826316728142e-05, "loss": 0.3908, "step": 1320 }, { "epoch": 1.9065805988882913, "grad_norm": 0.22100828767921185, "learning_rate": 8.62928556718174e-05, "loss": 0.3871, "step": 1330 }, { "epoch": 1.9209252286175365, "grad_norm": 0.22202012109824457, "learning_rate": 8.600492663498477e-05, "loss": 0.3834, "step": 1340 }, { "epoch": 1.9352698583467816, "grad_norm": 0.21529127705519435, "learning_rate": 8.571449606140883e-05, "loss": 0.388, "step": 1350 }, { "epoch": 1.9496144880760267, "grad_norm": 0.23391440077905082, "learning_rate": 8.542158412951563e-05, "loss": 0.3844, "step": 1360 }, { "epoch": 1.9639591178052718, "grad_norm": 0.2331711540562185, "learning_rate": 8.512621119013013e-05, "loss": 0.393, "step": 1370 }, { "epoch": 1.9783037475345169, "grad_norm": 0.23305451446876246, "learning_rate": 8.482839776506232e-05, "loss": 0.3837, "step": 1380 }, { "epoch": 1.992648377263762, "grad_norm": 0.24517430064973736, "learning_rate": 8.452816454568124e-05, "loss": 0.3852, "step": 1390 }, { "epoch": 2.005737851891698, "grad_norm": 0.27916951560917247, "learning_rate": 8.422553239147754e-05, "loss": 0.2799, "step": 1400 }, { "epoch": 2.020082481620943, "grad_norm": 0.23593724272097047, "learning_rate": 8.392052232861411e-05, "loss": 0.201, "step": 1410 }, { "epoch": 2.034427111350188, "grad_norm": 0.23512809512185134, "learning_rate": 8.361315554846534e-05, "loss": 0.1983, "step": 1420 }, { "epoch": 2.0487717410794333, "grad_norm": 0.2286549447255431, "learning_rate": 8.330345340614471e-05, "loss": 0.1942, "step": 1430 }, { "epoch": 2.0631163708086784, "grad_norm": 0.24844506458021867, "learning_rate": 8.299143741902111e-05, "loss": 0.1943, "step": 1440 }, { "epoch": 2.0774610005379235, "grad_norm": 0.25585326293058985, "learning_rate": 8.267712926522389e-05, "loss": 0.1993, "step": 1450 }, { "epoch": 2.0918056302671686, "grad_norm": 0.2421387690266048, "learning_rate": 8.236055078213666e-05, "loss": 0.1965, "step": 1460 }, { "epoch": 2.1061502599964137, "grad_norm": 0.24707973152402415, "learning_rate": 8.204172396488013e-05, "loss": 0.1992, "step": 1470 }, { "epoch": 2.120494889725659, "grad_norm": 0.23509099311770917, "learning_rate": 8.172067096478395e-05, "loss": 0.2008, "step": 1480 }, { "epoch": 2.134839519454904, "grad_norm": 0.23605150518346998, "learning_rate": 8.139741408784764e-05, "loss": 0.2019, "step": 1490 }, { "epoch": 2.149184149184149, "grad_norm": 0.2546177191590111, "learning_rate": 8.107197579319082e-05, "loss": 0.2053, "step": 1500 }, { "epoch": 2.163528778913394, "grad_norm": 0.2353676242323245, "learning_rate": 8.074437869149288e-05, "loss": 0.204, "step": 1510 }, { "epoch": 2.1778734086426392, "grad_norm": 0.23401952893152606, "learning_rate": 8.041464554342197e-05, "loss": 0.2036, "step": 1520 }, { "epoch": 2.1922180383718843, "grad_norm": 0.23141975512545726, "learning_rate": 8.008279925805366e-05, "loss": 0.2033, "step": 1530 }, { "epoch": 2.2065626681011294, "grad_norm": 0.23587920943899052, "learning_rate": 7.974886289127927e-05, "loss": 0.2068, "step": 1540 }, { "epoch": 2.2209072978303745, "grad_norm": 0.2394814609218661, "learning_rate": 7.941285964420407e-05, "loss": 0.2049, "step": 1550 }, { "epoch": 2.23525192755962, "grad_norm": 0.2389110148718096, "learning_rate": 7.907481286153516e-05, "loss": 0.2116, "step": 1560 }, { "epoch": 2.249596557288865, "grad_norm": 0.2282395291006806, "learning_rate": 7.873474602995973e-05, "loss": 0.2088, "step": 1570 }, { "epoch": 2.2639411870181103, "grad_norm": 0.23275397540700887, "learning_rate": 7.839268277651311e-05, "loss": 0.2092, "step": 1580 }, { "epoch": 2.2782858167473554, "grad_norm": 0.22624466416184327, "learning_rate": 7.80486468669373e-05, "loss": 0.2088, "step": 1590 }, { "epoch": 2.2926304464766005, "grad_norm": 0.23126585599149074, "learning_rate": 7.770266220402977e-05, "loss": 0.2117, "step": 1600 }, { "epoch": 2.3069750762058456, "grad_norm": 0.226948134461606, "learning_rate": 7.735475282598271e-05, "loss": 0.2097, "step": 1610 }, { "epoch": 2.3213197059350907, "grad_norm": 0.22673465714008167, "learning_rate": 7.700494290471296e-05, "loss": 0.2104, "step": 1620 }, { "epoch": 2.335664335664336, "grad_norm": 0.2556824784968339, "learning_rate": 7.665325674418264e-05, "loss": 0.2136, "step": 1630 }, { "epoch": 2.350008965393581, "grad_norm": 0.25025658975825976, "learning_rate": 7.629971877871039e-05, "loss": 0.2084, "step": 1640 }, { "epoch": 2.364353595122826, "grad_norm": 0.22536490579422702, "learning_rate": 7.594435357127399e-05, "loss": 0.2089, "step": 1650 }, { "epoch": 2.378698224852071, "grad_norm": 0.2258065984765025, "learning_rate": 7.558718581180355e-05, "loss": 0.2067, "step": 1660 }, { "epoch": 2.3930428545813163, "grad_norm": 0.2464593742203822, "learning_rate": 7.522824031546629e-05, "loss": 0.2137, "step": 1670 }, { "epoch": 2.4073874843105614, "grad_norm": 0.24123071412945177, "learning_rate": 7.486754202094229e-05, "loss": 0.2115, "step": 1680 }, { "epoch": 2.4217321140398065, "grad_norm": 0.23105649429700748, "learning_rate": 7.450511598869194e-05, "loss": 0.2138, "step": 1690 }, { "epoch": 2.4360767437690516, "grad_norm": 0.22955721039077792, "learning_rate": 7.414098739921471e-05, "loss": 0.2125, "step": 1700 }, { "epoch": 2.4504213734982967, "grad_norm": 0.23154193335740872, "learning_rate": 7.377518155129973e-05, "loss": 0.2183, "step": 1710 }, { "epoch": 2.464766003227542, "grad_norm": 0.2340236121998045, "learning_rate": 7.340772386026801e-05, "loss": 0.2157, "step": 1720 }, { "epoch": 2.479110632956787, "grad_norm": 0.2250255353665983, "learning_rate": 7.303863985620676e-05, "loss": 0.2123, "step": 1730 }, { "epoch": 2.493455262686032, "grad_norm": 0.2283114308365594, "learning_rate": 7.266795518219548e-05, "loss": 0.2135, "step": 1740 }, { "epoch": 2.507799892415277, "grad_norm": 0.23546636465212323, "learning_rate": 7.22956955925245e-05, "loss": 0.214, "step": 1750 }, { "epoch": 2.5221445221445222, "grad_norm": 0.23275268765839288, "learning_rate": 7.192188695090545e-05, "loss": 0.2156, "step": 1760 }, { "epoch": 2.5364891518737673, "grad_norm": 0.2457436947556184, "learning_rate": 7.154655522867452e-05, "loss": 0.2189, "step": 1770 }, { "epoch": 2.5508337816030124, "grad_norm": 0.2385729628030818, "learning_rate": 7.116972650298782e-05, "loss": 0.2148, "step": 1780 }, { "epoch": 2.5651784113322575, "grad_norm": 0.2382827317725779, "learning_rate": 7.079142695500975e-05, "loss": 0.2127, "step": 1790 }, { "epoch": 2.5795230410615027, "grad_norm": 0.22496477508883403, "learning_rate": 7.041168286809397e-05, "loss": 0.2156, "step": 1800 }, { "epoch": 2.5938676707907478, "grad_norm": 0.2337756123669142, "learning_rate": 7.00305206259572e-05, "loss": 0.2163, "step": 1810 }, { "epoch": 2.608212300519993, "grad_norm": 0.23547675501490803, "learning_rate": 6.964796671084631e-05, "loss": 0.213, "step": 1820 }, { "epoch": 2.622556930249238, "grad_norm": 0.236949625863052, "learning_rate": 6.926404770169819e-05, "loss": 0.2108, "step": 1830 }, { "epoch": 2.636901559978483, "grad_norm": 0.22775808389184637, "learning_rate": 6.887879027229332e-05, "loss": 0.2131, "step": 1840 }, { "epoch": 2.651246189707728, "grad_norm": 0.25558095929144115, "learning_rate": 6.84922211894024e-05, "loss": 0.2146, "step": 1850 }, { "epoch": 2.6655908194369733, "grad_norm": 0.23865636643565702, "learning_rate": 6.810436731092671e-05, "loss": 0.2154, "step": 1860 }, { "epoch": 2.6799354491662184, "grad_norm": 0.23347390914436725, "learning_rate": 6.771525558403203e-05, "loss": 0.2145, "step": 1870 }, { "epoch": 2.6942800788954635, "grad_norm": 0.2311770851119529, "learning_rate": 6.73249130432765e-05, "loss": 0.2112, "step": 1880 }, { "epoch": 2.7086247086247086, "grad_norm": 0.2326246785839781, "learning_rate": 6.69333668087323e-05, "loss": 0.2133, "step": 1890 }, { "epoch": 2.7229693383539537, "grad_norm": 0.23563376415545254, "learning_rate": 6.654064408410132e-05, "loss": 0.2141, "step": 1900 }, { "epoch": 2.737313968083199, "grad_norm": 0.2298522950109398, "learning_rate": 6.614677215482527e-05, "loss": 0.2142, "step": 1910 }, { "epoch": 2.751658597812444, "grad_norm": 0.2364865163676101, "learning_rate": 6.57517783861898e-05, "loss": 0.2127, "step": 1920 }, { "epoch": 2.766003227541689, "grad_norm": 0.22837021217881728, "learning_rate": 6.535569022142335e-05, "loss": 0.2145, "step": 1930 }, { "epoch": 2.780347857270934, "grad_norm": 0.22749769763881308, "learning_rate": 6.495853517979035e-05, "loss": 0.2106, "step": 1940 }, { "epoch": 2.7946924870001792, "grad_norm": 0.21764981978938533, "learning_rate": 6.456034085467935e-05, "loss": 0.2125, "step": 1950 }, { "epoch": 2.8090371167294244, "grad_norm": 0.22774012921821585, "learning_rate": 6.416113491168581e-05, "loss": 0.213, "step": 1960 }, { "epoch": 2.8233817464586695, "grad_norm": 0.22793686074861258, "learning_rate": 6.376094508668999e-05, "loss": 0.2116, "step": 1970 }, { "epoch": 2.8377263761879146, "grad_norm": 0.24345345462191187, "learning_rate": 6.335979918392999e-05, "loss": 0.213, "step": 1980 }, { "epoch": 2.8520710059171597, "grad_norm": 0.230566718186529, "learning_rate": 6.295772507406982e-05, "loss": 0.2123, "step": 1990 }, { "epoch": 2.866415635646405, "grad_norm": 0.23922165240449358, "learning_rate": 6.255475069226326e-05, "loss": 0.211, "step": 2000 }, { "epoch": 2.88076026537565, "grad_norm": 0.22058336484670613, "learning_rate": 6.21509040362127e-05, "loss": 0.2122, "step": 2010 }, { "epoch": 2.895104895104895, "grad_norm": 0.2272702011851071, "learning_rate": 6.174621316422417e-05, "loss": 0.2147, "step": 2020 }, { "epoch": 2.90944952483414, "grad_norm": 0.23799805104509125, "learning_rate": 6.134070619325774e-05, "loss": 0.212, "step": 2030 }, { "epoch": 2.923794154563385, "grad_norm": 0.24608349983752625, "learning_rate": 6.0934411296974184e-05, "loss": 0.2122, "step": 2040 }, { "epoch": 2.9381387842926303, "grad_norm": 0.23079480496683127, "learning_rate": 6.052735670377736e-05, "loss": 0.211, "step": 2050 }, { "epoch": 2.9524834140218754, "grad_norm": 0.22680559271715478, "learning_rate": 6.0119570694853155e-05, "loss": 0.2102, "step": 2060 }, { "epoch": 2.9668280437511205, "grad_norm": 0.22760761484882805, "learning_rate": 5.97110816022044e-05, "loss": 0.2113, "step": 2070 }, { "epoch": 2.9811726734803656, "grad_norm": 0.23303799910976278, "learning_rate": 5.930191780668258e-05, "loss": 0.2088, "step": 2080 }, { "epoch": 2.9955173032096107, "grad_norm": 0.22946738031807773, "learning_rate": 5.88921077360159e-05, "loss": 0.2097, "step": 2090 }, { "epoch": 3.008606777837547, "grad_norm": 0.2697620900381124, "learning_rate": 5.848167986283421e-05, "loss": 0.1134, "step": 2100 }, { "epoch": 3.0229514075667923, "grad_norm": 0.1885938841096422, "learning_rate": 5.807066270269084e-05, "loss": 0.0763, "step": 2110 }, { "epoch": 3.0372960372960374, "grad_norm": 0.214693696805492, "learning_rate": 5.765908481208139e-05, "loss": 0.0756, "step": 2120 }, { "epoch": 3.0516406670252825, "grad_norm": 0.2339101871402584, "learning_rate": 5.724697478645963e-05, "loss": 0.0744, "step": 2130 }, { "epoch": 3.0659852967545276, "grad_norm": 0.1971755620952271, "learning_rate": 5.6834361258250844e-05, "loss": 0.072, "step": 2140 }, { "epoch": 3.0803299264837727, "grad_norm": 0.1981153430750115, "learning_rate": 5.642127289486246e-05, "loss": 0.0748, "step": 2150 }, { "epoch": 3.094674556213018, "grad_norm": 0.2172902671287561, "learning_rate": 5.600773839669237e-05, "loss": 0.0726, "step": 2160 }, { "epoch": 3.109019185942263, "grad_norm": 0.19669334061877888, "learning_rate": 5.559378649513478e-05, "loss": 0.0733, "step": 2170 }, { "epoch": 3.123363815671508, "grad_norm": 0.21027113699329436, "learning_rate": 5.517944595058413e-05, "loss": 0.0746, "step": 2180 }, { "epoch": 3.137708445400753, "grad_norm": 0.20204087893287273, "learning_rate": 5.476474555043688e-05, "loss": 0.0748, "step": 2190 }, { "epoch": 3.152053075129998, "grad_norm": 0.20638588917150788, "learning_rate": 5.4349714107091335e-05, "loss": 0.0744, "step": 2200 }, { "epoch": 3.1663977048592433, "grad_norm": 0.20367882761147596, "learning_rate": 5.393438045594595e-05, "loss": 0.0755, "step": 2210 }, { "epoch": 3.1807423345884884, "grad_norm": 0.20836979312681028, "learning_rate": 5.351877345339583e-05, "loss": 0.076, "step": 2220 }, { "epoch": 3.1950869643177335, "grad_norm": 0.19643695987807314, "learning_rate": 5.310292197482791e-05, "loss": 0.0733, "step": 2230 }, { "epoch": 3.2094315940469786, "grad_norm": 0.20621763947145422, "learning_rate": 5.268685491261472e-05, "loss": 0.075, "step": 2240 }, { "epoch": 3.2237762237762237, "grad_norm": 0.20777873086593704, "learning_rate": 5.227060117410702e-05, "loss": 0.0746, "step": 2250 }, { "epoch": 3.238120853505469, "grad_norm": 0.2021910407099938, "learning_rate": 5.185418967962543e-05, "loss": 0.0747, "step": 2260 }, { "epoch": 3.252465483234714, "grad_norm": 0.2016612434414281, "learning_rate": 5.143764936045106e-05, "loss": 0.0743, "step": 2270 }, { "epoch": 3.266810112963959, "grad_norm": 0.2180992659409795, "learning_rate": 5.1021009156815414e-05, "loss": 0.0744, "step": 2280 }, { "epoch": 3.281154742693204, "grad_norm": 0.2056058145565962, "learning_rate": 5.060429801588983e-05, "loss": 0.0744, "step": 2290 }, { "epoch": 3.2954993724224493, "grad_norm": 0.20164051829762908, "learning_rate": 5.018754488977409e-05, "loss": 0.0745, "step": 2300 }, { "epoch": 3.3098440021516944, "grad_norm": 0.2026538165933443, "learning_rate": 4.9770778733485065e-05, "loss": 0.074, "step": 2310 }, { "epoch": 3.3241886318809395, "grad_norm": 0.20427324673762595, "learning_rate": 4.935402850294494e-05, "loss": 0.0739, "step": 2320 }, { "epoch": 3.3385332616101846, "grad_norm": 0.20831211218540635, "learning_rate": 4.893732315296942e-05, "loss": 0.0748, "step": 2330 }, { "epoch": 3.3528778913394297, "grad_norm": 0.20740018500070947, "learning_rate": 4.852069163525595e-05, "loss": 0.0737, "step": 2340 }, { "epoch": 3.367222521068675, "grad_norm": 0.20060155886370676, "learning_rate": 4.810416289637234e-05, "loss": 0.0729, "step": 2350 }, { "epoch": 3.38156715079792, "grad_norm": 0.199826847154071, "learning_rate": 4.7687765875745574e-05, "loss": 0.0739, "step": 2360 }, { "epoch": 3.395911780527165, "grad_norm": 0.20063705204581495, "learning_rate": 4.727152950365117e-05, "loss": 0.0737, "step": 2370 }, { "epoch": 3.41025641025641, "grad_norm": 0.20947972363514977, "learning_rate": 4.685548269920312e-05, "loss": 0.0736, "step": 2380 }, { "epoch": 3.4246010399856552, "grad_norm": 0.2006701925989043, "learning_rate": 4.643965436834474e-05, "loss": 0.075, "step": 2390 }, { "epoch": 3.4389456697149003, "grad_norm": 0.20335025504735554, "learning_rate": 4.6024073401840336e-05, "loss": 0.0745, "step": 2400 }, { "epoch": 3.4532902994441455, "grad_norm": 0.2192162565442083, "learning_rate": 4.560876867326791e-05, "loss": 0.0738, "step": 2410 }, { "epoch": 3.4676349291733906, "grad_norm": 0.19858055523329815, "learning_rate": 4.5193769037013066e-05, "loss": 0.0732, "step": 2420 }, { "epoch": 3.4819795589026357, "grad_norm": 0.20485303414115183, "learning_rate": 4.477910332626438e-05, "loss": 0.0728, "step": 2430 }, { "epoch": 3.4963241886318808, "grad_norm": 0.19011594248287844, "learning_rate": 4.4364800351010066e-05, "loss": 0.0726, "step": 2440 }, { "epoch": 3.5106688183611263, "grad_norm": 0.20410603253979742, "learning_rate": 4.395088889603633e-05, "loss": 0.0736, "step": 2450 }, { "epoch": 3.5250134480903714, "grad_norm": 0.1994983957599032, "learning_rate": 4.353739771892746e-05, "loss": 0.073, "step": 2460 }, { "epoch": 3.5393580778196165, "grad_norm": 0.20349060401414618, "learning_rate": 4.312435554806787e-05, "loss": 0.0736, "step": 2470 }, { "epoch": 3.5537027075488616, "grad_norm": 0.20221336765718947, "learning_rate": 4.271179108064605e-05, "loss": 0.0713, "step": 2480 }, { "epoch": 3.5680473372781067, "grad_norm": 0.1920539935100462, "learning_rate": 4.229973298066083e-05, "loss": 0.0714, "step": 2490 }, { "epoch": 3.582391967007352, "grad_norm": 0.18514594535819984, "learning_rate": 4.188820987692981e-05, "loss": 0.0716, "step": 2500 }, { "epoch": 3.596736596736597, "grad_norm": 0.19390555703637974, "learning_rate": 4.1477250361100317e-05, "loss": 0.072, "step": 2510 }, { "epoch": 3.611081226465842, "grad_norm": 0.19881724163942532, "learning_rate": 4.106688298566295e-05, "loss": 0.0722, "step": 2520 }, { "epoch": 3.625425856195087, "grad_norm": 0.19864848134388496, "learning_rate": 4.065713626196778e-05, "loss": 0.0697, "step": 2530 }, { "epoch": 3.6397704859243323, "grad_norm": 0.20964033399772472, "learning_rate": 4.0248038658243515e-05, "loss": 0.0703, "step": 2540 }, { "epoch": 3.6541151156535774, "grad_norm": 0.1887224816930325, "learning_rate": 3.983961859761946e-05, "loss": 0.071, "step": 2550 }, { "epoch": 3.6684597453828225, "grad_norm": 0.1939910437911645, "learning_rate": 3.9431904456150914e-05, "loss": 0.0685, "step": 2560 }, { "epoch": 3.6828043751120676, "grad_norm": 0.1905566250106664, "learning_rate": 3.902492456084757e-05, "loss": 0.0709, "step": 2570 }, { "epoch": 3.6971490048413127, "grad_norm": 0.1954219594857734, "learning_rate": 3.861870718770545e-05, "loss": 0.0691, "step": 2580 }, { "epoch": 3.711493634570558, "grad_norm": 0.20129771340548336, "learning_rate": 3.821328055974231e-05, "loss": 0.0688, "step": 2590 }, { "epoch": 3.725838264299803, "grad_norm": 0.19424451985885532, "learning_rate": 3.780867284503685e-05, "loss": 0.0705, "step": 2600 }, { "epoch": 3.740182894029048, "grad_norm": 0.19307288848206286, "learning_rate": 3.7404912154771626e-05, "loss": 0.069, "step": 2610 }, { "epoch": 3.754527523758293, "grad_norm": 0.20224114631498458, "learning_rate": 3.7002026541279905e-05, "loss": 0.069, "step": 2620 }, { "epoch": 3.7688721534875382, "grad_norm": 0.19645086260070405, "learning_rate": 3.660004399609675e-05, "loss": 0.0693, "step": 2630 }, { "epoch": 3.7832167832167833, "grad_norm": 0.2009057118393354, "learning_rate": 3.619899244801414e-05, "loss": 0.0695, "step": 2640 }, { "epoch": 3.7975614129460284, "grad_norm": 0.20154345565922616, "learning_rate": 3.5798899761140626e-05, "loss": 0.0688, "step": 2650 }, { "epoch": 3.8119060426752736, "grad_norm": 0.19819908788727933, "learning_rate": 3.5399793732965324e-05, "loss": 0.0703, "step": 2660 }, { "epoch": 3.8262506724045187, "grad_norm": 0.19579772630914064, "learning_rate": 3.500170209242671e-05, "loss": 0.0673, "step": 2670 }, { "epoch": 3.8405953021337638, "grad_norm": 0.1930709905078437, "learning_rate": 3.460465249798592e-05, "loss": 0.068, "step": 2680 }, { "epoch": 3.854939931863009, "grad_norm": 0.19375769656837338, "learning_rate": 3.420867253570529e-05, "loss": 0.0668, "step": 2690 }, { "epoch": 3.869284561592254, "grad_norm": 0.19590535607906298, "learning_rate": 3.381378971733161e-05, "loss": 0.0658, "step": 2700 }, { "epoch": 3.883629191321499, "grad_norm": 0.19485732453673113, "learning_rate": 3.342003147838475e-05, "loss": 0.0671, "step": 2710 }, { "epoch": 3.897973821050744, "grad_norm": 0.19120048275674587, "learning_rate": 3.302742517625144e-05, "loss": 0.0665, "step": 2720 }, { "epoch": 3.9123184507799893, "grad_norm": 0.19464302345990753, "learning_rate": 3.2635998088284596e-05, "loss": 0.0662, "step": 2730 }, { "epoch": 3.9266630805092344, "grad_norm": 0.20017821890333443, "learning_rate": 3.224577740990814e-05, "loss": 0.0655, "step": 2740 }, { "epoch": 3.9410077102384795, "grad_norm": 0.18866754533216776, "learning_rate": 3.185679025272753e-05, "loss": 0.0663, "step": 2750 }, { "epoch": 3.9553523399677246, "grad_norm": 0.19243520237850759, "learning_rate": 3.146906364264606e-05, "loss": 0.0657, "step": 2760 }, { "epoch": 3.9696969696969697, "grad_norm": 0.1924186499329835, "learning_rate": 3.108262451798724e-05, "loss": 0.0651, "step": 2770 }, { "epoch": 3.984041599426215, "grad_norm": 0.21148279808210507, "learning_rate": 3.069749972762316e-05, "loss": 0.0648, "step": 2780 }, { "epoch": 3.99838622915546, "grad_norm": 0.19991640585361364, "learning_rate": 3.0313716029109064e-05, "loss": 0.0645, "step": 2790 }, { "epoch": 4.011475703783396, "grad_norm": 0.13064534629220334, "learning_rate": 2.993130008682436e-05, "loss": 0.0228, "step": 2800 }, { "epoch": 4.025820333512641, "grad_norm": 0.14874535281957305, "learning_rate": 2.955027847011993e-05, "loss": 0.0176, "step": 2810 }, { "epoch": 4.040164963241886, "grad_norm": 0.14336180228683498, "learning_rate": 2.917067765147229e-05, "loss": 0.0176, "step": 2820 }, { "epoch": 4.054509592971131, "grad_norm": 0.12559441494646076, "learning_rate": 2.8792524004644283e-05, "loss": 0.0167, "step": 2830 }, { "epoch": 4.068854222700376, "grad_norm": 0.12484448147694403, "learning_rate": 2.8415843802852672e-05, "loss": 0.0167, "step": 2840 }, { "epoch": 4.083198852429621, "grad_norm": 0.1337296091314726, "learning_rate": 2.8040663216942752e-05, "loss": 0.0169, "step": 2850 }, { "epoch": 4.0975434821588665, "grad_norm": 0.12242456577697475, "learning_rate": 2.7667008313570076e-05, "loss": 0.0161, "step": 2860 }, { "epoch": 4.111888111888112, "grad_norm": 0.13243095768870966, "learning_rate": 2.729490505338943e-05, "loss": 0.0161, "step": 2870 }, { "epoch": 4.126232741617357, "grad_norm": 0.12277718816926177, "learning_rate": 2.692437928925109e-05, "loss": 0.0157, "step": 2880 }, { "epoch": 4.140577371346602, "grad_norm": 0.137540991678628, "learning_rate": 2.655545676440464e-05, "loss": 0.0159, "step": 2890 }, { "epoch": 4.154922001075847, "grad_norm": 0.13131712471544715, "learning_rate": 2.6188163110710435e-05, "loss": 0.0161, "step": 2900 }, { "epoch": 4.169266630805092, "grad_norm": 0.13640835620647865, "learning_rate": 2.582252384685874e-05, "loss": 0.0164, "step": 2910 }, { "epoch": 4.183611260534337, "grad_norm": 0.12543818653508698, "learning_rate": 2.5458564376596732e-05, "loss": 0.0157, "step": 2920 }, { "epoch": 4.197955890263582, "grad_norm": 0.11736167137678152, "learning_rate": 2.509630998696349e-05, "loss": 0.0154, "step": 2930 }, { "epoch": 4.212300519992827, "grad_norm": 0.1245551329001544, "learning_rate": 2.473578584653321e-05, "loss": 0.0152, "step": 2940 }, { "epoch": 4.2266451497220725, "grad_norm": 0.12763631881000323, "learning_rate": 2.4377017003666413e-05, "loss": 0.0155, "step": 2950 }, { "epoch": 4.240989779451318, "grad_norm": 0.13313595943206588, "learning_rate": 2.4020028384769795e-05, "loss": 0.015, "step": 2960 }, { "epoch": 4.255334409180563, "grad_norm": 0.125069284406997, "learning_rate": 2.366484479256425e-05, "loss": 0.015, "step": 2970 }, { "epoch": 4.269679038909808, "grad_norm": 0.13167131954772826, "learning_rate": 2.3311490904361738e-05, "loss": 0.0159, "step": 2980 }, { "epoch": 4.284023668639053, "grad_norm": 0.11344149792986571, "learning_rate": 2.295999127035071e-05, "loss": 0.0147, "step": 2990 }, { "epoch": 4.298368298368298, "grad_norm": 0.1299095136285245, "learning_rate": 2.26103703118905e-05, "loss": 0.015, "step": 3000 } ], "logging_steps": 10, "max_steps": 4188, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 940152769216512.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }