{
"best_global_step": 2754,
"best_metric": 1.0,
"best_model_checkpoint": "./results/wallet_deberta_v10/checkpoint-2754",
"epoch": 0.4001452960406829,
"eval_steps": 1377,
"global_step": 2754,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000726480203414457,
"grad_norm": 1.474841833114624,
"learning_rate": 5.809731299927378e-07,
"loss": 0.6929,
"step": 5
},
{
"epoch": 0.001452960406828914,
"grad_norm": 0.7840715646743774,
"learning_rate": 1.3071895424836602e-06,
"loss": 0.6982,
"step": 10
},
{
"epoch": 0.002179440610243371,
"grad_norm": 0.6173088550567627,
"learning_rate": 2.0334059549745822e-06,
"loss": 0.6906,
"step": 15
},
{
"epoch": 0.002905920813657828,
"grad_norm": 1.7541619539260864,
"learning_rate": 2.759622367465505e-06,
"loss": 0.6917,
"step": 20
},
{
"epoch": 0.003632401017072285,
"grad_norm": 1.0843828916549683,
"learning_rate": 3.4858387799564276e-06,
"loss": 0.6885,
"step": 25
},
{
"epoch": 0.004358881220486742,
"grad_norm": 0.6747203469276428,
"learning_rate": 4.212055192447349e-06,
"loss": 0.6854,
"step": 30
},
{
"epoch": 0.005085361423901199,
"grad_norm": 2.215040445327759,
"learning_rate": 4.938271604938272e-06,
"loss": 0.6895,
"step": 35
},
{
"epoch": 0.005811841627315656,
"grad_norm": 1.2605243921279907,
"learning_rate": 5.664488017429194e-06,
"loss": 0.6914,
"step": 40
},
{
"epoch": 0.006538321830730113,
"grad_norm": 1.5572800636291504,
"learning_rate": 6.390704429920116e-06,
"loss": 0.6913,
"step": 45
},
{
"epoch": 0.00726480203414457,
"grad_norm": 0.5582659244537354,
"learning_rate": 7.116920842411038e-06,
"loss": 0.7015,
"step": 50
},
{
"epoch": 0.007991282237559027,
"grad_norm": 2.64322829246521,
"learning_rate": 7.84313725490196e-06,
"loss": 0.6883,
"step": 55
},
{
"epoch": 0.008717762440973484,
"grad_norm": 0.4942118525505066,
"learning_rate": 8.569353667392883e-06,
"loss": 0.6863,
"step": 60
},
{
"epoch": 0.00944424264438794,
"grad_norm": 1.1258032321929932,
"learning_rate": 9.295570079883805e-06,
"loss": 0.6999,
"step": 65
},
{
"epoch": 0.010170722847802398,
"grad_norm": 1.727752923965454,
"learning_rate": 1.0021786492374727e-05,
"loss": 0.6932,
"step": 70
},
{
"epoch": 0.010897203051216855,
"grad_norm": 1.4421207904815674,
"learning_rate": 1.0748002904865651e-05,
"loss": 0.6898,
"step": 75
},
{
"epoch": 0.011623683254631312,
"grad_norm": 1.5667537450790405,
"learning_rate": 1.1474219317356574e-05,
"loss": 0.6965,
"step": 80
},
{
"epoch": 0.012350163458045769,
"grad_norm": 0.44926533102989197,
"learning_rate": 1.2200435729847496e-05,
"loss": 0.6929,
"step": 85
},
{
"epoch": 0.013076643661460226,
"grad_norm": 0.425881028175354,
"learning_rate": 1.2926652142338416e-05,
"loss": 0.6907,
"step": 90
},
{
"epoch": 0.013803123864874683,
"grad_norm": 0.4951478838920593,
"learning_rate": 1.365286855482934e-05,
"loss": 0.6993,
"step": 95
},
{
"epoch": 0.01452960406828914,
"grad_norm": 0.3694448471069336,
"learning_rate": 1.4379084967320261e-05,
"loss": 0.6831,
"step": 100
},
{
"epoch": 0.015256084271703597,
"grad_norm": 1.17753005027771,
"learning_rate": 1.5105301379811185e-05,
"loss": 0.68,
"step": 105
},
{
"epoch": 0.015982564475118054,
"grad_norm": 0.611096978187561,
"learning_rate": 1.5831517792302105e-05,
"loss": 0.6921,
"step": 110
},
{
"epoch": 0.01670904467853251,
"grad_norm": 1.563934087753296,
"learning_rate": 1.655773420479303e-05,
"loss": 0.6865,
"step": 115
},
{
"epoch": 0.017435524881946968,
"grad_norm": 0.928711473941803,
"learning_rate": 1.728395061728395e-05,
"loss": 0.6861,
"step": 120
},
{
"epoch": 0.018162005085361425,
"grad_norm": 1.1964377164840698,
"learning_rate": 1.8010167029774874e-05,
"loss": 0.7021,
"step": 125
},
{
"epoch": 0.01888848528877588,
"grad_norm": 0.3896273970603943,
"learning_rate": 1.8736383442265794e-05,
"loss": 0.6918,
"step": 130
},
{
"epoch": 0.01961496549219034,
"grad_norm": 0.4799005091190338,
"learning_rate": 1.946259985475672e-05,
"loss": 0.6954,
"step": 135
},
{
"epoch": 0.020341445695604796,
"grad_norm": 0.6113623380661011,
"learning_rate": 2.018881626724764e-05,
"loss": 0.6886,
"step": 140
},
{
"epoch": 0.021067925899019253,
"grad_norm": 0.6965861320495605,
"learning_rate": 2.0915032679738563e-05,
"loss": 0.6814,
"step": 145
},
{
"epoch": 0.02179440610243371,
"grad_norm": 0.46387240290641785,
"learning_rate": 2.1641249092229483e-05,
"loss": 0.681,
"step": 150
},
{
"epoch": 0.022520886305848167,
"grad_norm": 0.4296594560146332,
"learning_rate": 2.2367465504720407e-05,
"loss": 0.6853,
"step": 155
},
{
"epoch": 0.023247366509262624,
"grad_norm": 0.8854900002479553,
"learning_rate": 2.3093681917211328e-05,
"loss": 0.6784,
"step": 160
},
{
"epoch": 0.02397384671267708,
"grad_norm": 0.7150282263755798,
"learning_rate": 2.3819898329702252e-05,
"loss": 0.6774,
"step": 165
},
{
"epoch": 0.024700326916091538,
"grad_norm": 0.8739128708839417,
"learning_rate": 2.4546114742193176e-05,
"loss": 0.6662,
"step": 170
},
{
"epoch": 0.025426807119505995,
"grad_norm": 0.7919905781745911,
"learning_rate": 2.5272331154684096e-05,
"loss": 0.6601,
"step": 175
},
{
"epoch": 0.02615328732292045,
"grad_norm": 0.6220109462738037,
"learning_rate": 2.599854756717502e-05,
"loss": 0.649,
"step": 180
},
{
"epoch": 0.02687976752633491,
"grad_norm": 0.8708673715591431,
"learning_rate": 2.672476397966594e-05,
"loss": 0.6318,
"step": 185
},
{
"epoch": 0.027606247729749366,
"grad_norm": 1.0253198146820068,
"learning_rate": 2.7450980392156865e-05,
"loss": 0.6025,
"step": 190
},
{
"epoch": 0.028332727933163823,
"grad_norm": 1.1449552774429321,
"learning_rate": 2.817719680464779e-05,
"loss": 0.5564,
"step": 195
},
{
"epoch": 0.02905920813657828,
"grad_norm": 3.2159643173217773,
"learning_rate": 2.890341321713871e-05,
"loss": 0.515,
"step": 200
},
{
"epoch": 0.029785688339992736,
"grad_norm": 1.912434458732605,
"learning_rate": 2.962962962962963e-05,
"loss": 0.4635,
"step": 205
},
{
"epoch": 0.030512168543407193,
"grad_norm": 1.9808599948883057,
"learning_rate": 3.0355846042120557e-05,
"loss": 0.4233,
"step": 210
},
{
"epoch": 0.03123864874682165,
"grad_norm": 1.929961085319519,
"learning_rate": 3.1082062454611474e-05,
"loss": 0.3505,
"step": 215
},
{
"epoch": 0.03196512895023611,
"grad_norm": 2.4213836193084717,
"learning_rate": 3.1808278867102395e-05,
"loss": 0.3079,
"step": 220
},
{
"epoch": 0.03269160915365056,
"grad_norm": 8.767487525939941,
"learning_rate": 3.2534495279593316e-05,
"loss": 0.2805,
"step": 225
},
{
"epoch": 0.03341808935706502,
"grad_norm": 6.868554592132568,
"learning_rate": 3.326071169208424e-05,
"loss": 0.2501,
"step": 230
},
{
"epoch": 0.034144569560479475,
"grad_norm": 10.502647399902344,
"learning_rate": 3.3986928104575163e-05,
"loss": 0.2505,
"step": 235
},
{
"epoch": 0.034871049763893935,
"grad_norm": 2.8313727378845215,
"learning_rate": 3.471314451706609e-05,
"loss": 0.2265,
"step": 240
},
{
"epoch": 0.03559752996730839,
"grad_norm": 4.173934459686279,
"learning_rate": 3.543936092955701e-05,
"loss": 0.2045,
"step": 245
},
{
"epoch": 0.03632401017072285,
"grad_norm": 1.775830626487732,
"learning_rate": 3.616557734204793e-05,
"loss": 0.1584,
"step": 250
},
{
"epoch": 0.0370504903741373,
"grad_norm": 3.129055976867676,
"learning_rate": 3.689179375453886e-05,
"loss": 0.131,
"step": 255
},
{
"epoch": 0.03777697057755176,
"grad_norm": 4.053362846374512,
"learning_rate": 3.761801016702978e-05,
"loss": 0.1333,
"step": 260
},
{
"epoch": 0.03850345078096622,
"grad_norm": 3.1923694610595703,
"learning_rate": 3.83442265795207e-05,
"loss": 0.1734,
"step": 265
},
{
"epoch": 0.03922993098438068,
"grad_norm": 4.751387119293213,
"learning_rate": 3.907044299201162e-05,
"loss": 0.1296,
"step": 270
},
{
"epoch": 0.03995641118779513,
"grad_norm": 1.9991039037704468,
"learning_rate": 3.979665940450255e-05,
"loss": 0.1297,
"step": 275
},
{
"epoch": 0.04068289139120959,
"grad_norm": 5.077785015106201,
"learning_rate": 4.052287581699347e-05,
"loss": 0.1127,
"step": 280
},
{
"epoch": 0.041409371594624045,
"grad_norm": 3.6270077228546143,
"learning_rate": 4.124909222948439e-05,
"loss": 0.1125,
"step": 285
},
{
"epoch": 0.042135851798038505,
"grad_norm": 1.017204999923706,
"learning_rate": 4.197530864197531e-05,
"loss": 0.0802,
"step": 290
},
{
"epoch": 0.04286233200145296,
"grad_norm": 1.492018222808838,
"learning_rate": 4.270152505446624e-05,
"loss": 0.1008,
"step": 295
},
{
"epoch": 0.04358881220486742,
"grad_norm": 1.7372925281524658,
"learning_rate": 4.342774146695716e-05,
"loss": 0.1172,
"step": 300
},
{
"epoch": 0.04431529240828187,
"grad_norm": 2.817929983139038,
"learning_rate": 4.415395787944808e-05,
"loss": 0.1096,
"step": 305
},
{
"epoch": 0.04504177261169633,
"grad_norm": 0.9688124656677246,
"learning_rate": 4.4880174291939e-05,
"loss": 0.0663,
"step": 310
},
{
"epoch": 0.04576825281511079,
"grad_norm": 3.9759960174560547,
"learning_rate": 4.5606390704429926e-05,
"loss": 0.0706,
"step": 315
},
{
"epoch": 0.04649473301852525,
"grad_norm": 5.014834880828857,
"learning_rate": 4.633260711692085e-05,
"loss": 0.0681,
"step": 320
},
{
"epoch": 0.0472212132219397,
"grad_norm": 3.1871182918548584,
"learning_rate": 4.705882352941177e-05,
"loss": 0.0934,
"step": 325
},
{
"epoch": 0.04794769342535416,
"grad_norm": 5.145167350769043,
"learning_rate": 4.778503994190269e-05,
"loss": 0.0777,
"step": 330
},
{
"epoch": 0.048674173628768615,
"grad_norm": 6.0013275146484375,
"learning_rate": 4.8511256354393615e-05,
"loss": 0.0825,
"step": 335
},
{
"epoch": 0.049400653832183075,
"grad_norm": 0.9712790250778198,
"learning_rate": 4.9237472766884536e-05,
"loss": 0.0417,
"step": 340
},
{
"epoch": 0.05012713403559753,
"grad_norm": 1.4921551942825317,
"learning_rate": 4.9963689179375456e-05,
"loss": 0.0316,
"step": 345
},
{
"epoch": 0.05085361423901199,
"grad_norm": 4.556818008422852,
"learning_rate": 5.0689905591866384e-05,
"loss": 0.0592,
"step": 350
},
{
"epoch": 0.05158009444242644,
"grad_norm": 2.8250820636749268,
"learning_rate": 5.1416122004357304e-05,
"loss": 0.0591,
"step": 355
},
{
"epoch": 0.0523065746458409,
"grad_norm": 2.345460891723633,
"learning_rate": 5.2142338416848225e-05,
"loss": 0.073,
"step": 360
},
{
"epoch": 0.05303305484925536,
"grad_norm": 0.44890037178993225,
"learning_rate": 5.2868554829339145e-05,
"loss": 0.057,
"step": 365
},
{
"epoch": 0.05375953505266982,
"grad_norm": 1.1340672969818115,
"learning_rate": 5.3594771241830066e-05,
"loss": 0.0466,
"step": 370
},
{
"epoch": 0.05448601525608427,
"grad_norm": 0.7642996311187744,
"learning_rate": 5.4320987654320986e-05,
"loss": 0.0437,
"step": 375
},
{
"epoch": 0.05521249545949873,
"grad_norm": 4.866988658905029,
"learning_rate": 5.504720406681191e-05,
"loss": 0.0654,
"step": 380
},
{
"epoch": 0.055938975662913185,
"grad_norm": 0.9396504163742065,
"learning_rate": 5.577342047930284e-05,
"loss": 0.0184,
"step": 385
},
{
"epoch": 0.056665455866327645,
"grad_norm": 0.5176196098327637,
"learning_rate": 5.649963689179376e-05,
"loss": 0.0208,
"step": 390
},
{
"epoch": 0.0573919360697421,
"grad_norm": 4.328392028808594,
"learning_rate": 5.722585330428468e-05,
"loss": 0.0186,
"step": 395
},
{
"epoch": 0.05811841627315656,
"grad_norm": 6.72576379776001,
"learning_rate": 5.79520697167756e-05,
"loss": 0.0311,
"step": 400
},
{
"epoch": 0.05884489647657101,
"grad_norm": 10.39561653137207,
"learning_rate": 5.8678286129266523e-05,
"loss": 0.0453,
"step": 405
},
{
"epoch": 0.05957137667998547,
"grad_norm": 8.84882926940918,
"learning_rate": 5.9404502541757444e-05,
"loss": 0.0821,
"step": 410
},
{
"epoch": 0.060297856883399926,
"grad_norm": 0.5676841735839844,
"learning_rate": 6.0130718954248365e-05,
"loss": 0.1021,
"step": 415
},
{
"epoch": 0.06102433708681439,
"grad_norm": 3.4484715461730957,
"learning_rate": 6.0856935366739285e-05,
"loss": 0.0397,
"step": 420
},
{
"epoch": 0.06175081729022884,
"grad_norm": 8.414154052734375,
"learning_rate": 6.158315177923021e-05,
"loss": 0.0864,
"step": 425
},
{
"epoch": 0.0624772974936433,
"grad_norm": 5.502734184265137,
"learning_rate": 6.230936819172115e-05,
"loss": 0.0313,
"step": 430
},
{
"epoch": 0.06320377769705776,
"grad_norm": 6.950675964355469,
"learning_rate": 6.303558460421207e-05,
"loss": 0.0271,
"step": 435
},
{
"epoch": 0.06393025790047221,
"grad_norm": 2.7828145027160645,
"learning_rate": 6.376180101670299e-05,
"loss": 0.016,
"step": 440
},
{
"epoch": 0.06465673810388667,
"grad_norm": 4.585832118988037,
"learning_rate": 6.448801742919391e-05,
"loss": 0.0402,
"step": 445
},
{
"epoch": 0.06538321830730112,
"grad_norm": 5.096743106842041,
"learning_rate": 6.521423384168483e-05,
"loss": 0.0719,
"step": 450
},
{
"epoch": 0.06610969851071559,
"grad_norm": 5.883689880371094,
"learning_rate": 6.594045025417575e-05,
"loss": 0.0691,
"step": 455
},
{
"epoch": 0.06683617871413004,
"grad_norm": 1.7454990148544312,
"learning_rate": 6.666666666666667e-05,
"loss": 0.0504,
"step": 460
},
{
"epoch": 0.0675626589175445,
"grad_norm": 2.231943368911743,
"learning_rate": 6.739288307915759e-05,
"loss": 0.0184,
"step": 465
},
{
"epoch": 0.06828913912095895,
"grad_norm": 4.1820268630981445,
"learning_rate": 6.811909949164852e-05,
"loss": 0.025,
"step": 470
},
{
"epoch": 0.06901561932437342,
"grad_norm": 0.06752662360668182,
"learning_rate": 6.884531590413945e-05,
"loss": 0.0061,
"step": 475
},
{
"epoch": 0.06974209952778787,
"grad_norm": 0.034968651831150055,
"learning_rate": 6.957153231663037e-05,
"loss": 0.0246,
"step": 480
},
{
"epoch": 0.07046857973120232,
"grad_norm": 4.133062839508057,
"learning_rate": 7.029774872912129e-05,
"loss": 0.0483,
"step": 485
},
{
"epoch": 0.07119505993461678,
"grad_norm": 0.14520829916000366,
"learning_rate": 7.10239651416122e-05,
"loss": 0.0242,
"step": 490
},
{
"epoch": 0.07192154013803125,
"grad_norm": 0.08248770982027054,
"learning_rate": 7.175018155410313e-05,
"loss": 0.0389,
"step": 495
},
{
"epoch": 0.0726480203414457,
"grad_norm": 0.09677606076002121,
"learning_rate": 7.247639796659405e-05,
"loss": 0.0813,
"step": 500
},
{
"epoch": 0.07337450054486015,
"grad_norm": 2.2317094802856445,
"learning_rate": 7.320261437908497e-05,
"loss": 0.0425,
"step": 505
},
{
"epoch": 0.0741009807482746,
"grad_norm": 0.9524332284927368,
"learning_rate": 7.39288307915759e-05,
"loss": 0.0165,
"step": 510
},
{
"epoch": 0.07482746095168907,
"grad_norm": 1.2688440084457397,
"learning_rate": 7.465504720406682e-05,
"loss": 0.0376,
"step": 515
},
{
"epoch": 0.07555394115510353,
"grad_norm": 0.5410459637641907,
"learning_rate": 7.538126361655774e-05,
"loss": 0.0132,
"step": 520
},
{
"epoch": 0.07628042135851798,
"grad_norm": 1.0646350383758545,
"learning_rate": 7.610748002904866e-05,
"loss": 0.0357,
"step": 525
},
{
"epoch": 0.07700690156193243,
"grad_norm": 0.05422890931367874,
"learning_rate": 7.683369644153958e-05,
"loss": 0.0024,
"step": 530
},
{
"epoch": 0.0777333817653469,
"grad_norm": 1.7686655521392822,
"learning_rate": 7.75599128540305e-05,
"loss": 0.011,
"step": 535
},
{
"epoch": 0.07845986196876135,
"grad_norm": 1.7055928707122803,
"learning_rate": 7.828612926652143e-05,
"loss": 0.0283,
"step": 540
},
{
"epoch": 0.07918634217217581,
"grad_norm": 7.1870245933532715,
"learning_rate": 7.901234567901235e-05,
"loss": 0.0255,
"step": 545
},
{
"epoch": 0.07991282237559026,
"grad_norm": 4.143937110900879,
"learning_rate": 7.973856209150328e-05,
"loss": 0.0163,
"step": 550
},
{
"epoch": 0.08063930257900472,
"grad_norm": 2.7253036499023438,
"learning_rate": 8.04647785039942e-05,
"loss": 0.0356,
"step": 555
},
{
"epoch": 0.08136578278241918,
"grad_norm": 0.1260932832956314,
"learning_rate": 8.119099491648512e-05,
"loss": 0.0897,
"step": 560
},
{
"epoch": 0.08209226298583364,
"grad_norm": 0.8739075064659119,
"learning_rate": 8.191721132897604e-05,
"loss": 0.0212,
"step": 565
},
{
"epoch": 0.08281874318924809,
"grad_norm": 0.07976645231246948,
"learning_rate": 8.264342774146696e-05,
"loss": 0.0202,
"step": 570
},
{
"epoch": 0.08354522339266254,
"grad_norm": 3.089498996734619,
"learning_rate": 8.336964415395788e-05,
"loss": 0.0288,
"step": 575
},
{
"epoch": 0.08427170359607701,
"grad_norm": 1.1282787322998047,
"learning_rate": 8.40958605664488e-05,
"loss": 0.0236,
"step": 580
},
{
"epoch": 0.08499818379949146,
"grad_norm": 2.50753116607666,
"learning_rate": 8.482207697893972e-05,
"loss": 0.0491,
"step": 585
},
{
"epoch": 0.08572466400290592,
"grad_norm": 15.398341178894043,
"learning_rate": 8.554829339143065e-05,
"loss": 0.0541,
"step": 590
},
{
"epoch": 0.08645114420632037,
"grad_norm": 0.3026963174343109,
"learning_rate": 8.627450980392158e-05,
"loss": 0.0053,
"step": 595
},
{
"epoch": 0.08717762440973484,
"grad_norm": 0.12404945492744446,
"learning_rate": 8.70007262164125e-05,
"loss": 0.0404,
"step": 600
},
{
"epoch": 0.08790410461314929,
"grad_norm": 0.9239891767501831,
"learning_rate": 8.772694262890342e-05,
"loss": 0.0221,
"step": 605
},
{
"epoch": 0.08863058481656375,
"grad_norm": 1.404173493385315,
"learning_rate": 8.845315904139434e-05,
"loss": 0.0122,
"step": 610
},
{
"epoch": 0.0893570650199782,
"grad_norm": 3.049877405166626,
"learning_rate": 8.917937545388526e-05,
"loss": 0.02,
"step": 615
},
{
"epoch": 0.09008354522339267,
"grad_norm": 0.6122508645057678,
"learning_rate": 8.990559186637618e-05,
"loss": 0.0191,
"step": 620
},
{
"epoch": 0.09081002542680712,
"grad_norm": 0.021131640300154686,
"learning_rate": 9.06318082788671e-05,
"loss": 0.0257,
"step": 625
},
{
"epoch": 0.09153650563022157,
"grad_norm": 1.1997209787368774,
"learning_rate": 9.135802469135802e-05,
"loss": 0.0067,
"step": 630
},
{
"epoch": 0.09226298583363603,
"grad_norm": 2.1676833629608154,
"learning_rate": 9.208424110384896e-05,
"loss": 0.0078,
"step": 635
},
{
"epoch": 0.0929894660370505,
"grad_norm": 0.29366588592529297,
"learning_rate": 9.281045751633988e-05,
"loss": 0.0052,
"step": 640
},
{
"epoch": 0.09371594624046495,
"grad_norm": 0.6021141409873962,
"learning_rate": 9.35366739288308e-05,
"loss": 0.0147,
"step": 645
},
{
"epoch": 0.0944424264438794,
"grad_norm": 0.05590349808335304,
"learning_rate": 9.426289034132172e-05,
"loss": 0.0041,
"step": 650
},
{
"epoch": 0.09516890664729385,
"grad_norm": 0.010648532770574093,
"learning_rate": 9.498910675381264e-05,
"loss": 0.0004,
"step": 655
},
{
"epoch": 0.09589538685070832,
"grad_norm": 0.6512329578399658,
"learning_rate": 9.571532316630356e-05,
"loss": 0.0057,
"step": 660
},
{
"epoch": 0.09662186705412278,
"grad_norm": 0.040556080639362335,
"learning_rate": 9.644153957879448e-05,
"loss": 0.0006,
"step": 665
},
{
"epoch": 0.09734834725753723,
"grad_norm": 0.03349559009075165,
"learning_rate": 9.71677559912854e-05,
"loss": 0.0025,
"step": 670
},
{
"epoch": 0.09807482746095168,
"grad_norm": 0.22619083523750305,
"learning_rate": 9.789397240377634e-05,
"loss": 0.0008,
"step": 675
},
{
"epoch": 0.09880130766436615,
"grad_norm": 0.005620414856821299,
"learning_rate": 9.862018881626726e-05,
"loss": 0.0004,
"step": 680
},
{
"epoch": 0.0995277878677806,
"grad_norm": 0.05560583993792534,
"learning_rate": 9.934640522875818e-05,
"loss": 0.0115,
"step": 685
},
{
"epoch": 0.10025426807119506,
"grad_norm": 0.003737515537068248,
"learning_rate": 0.00010007262164124908,
"loss": 0.006,
"step": 690
},
{
"epoch": 0.10098074827460951,
"grad_norm": 4.636546611785889,
"learning_rate": 0.00010079883805374002,
"loss": 0.0039,
"step": 695
},
{
"epoch": 0.10170722847802398,
"grad_norm": 0.0036786955315619707,
"learning_rate": 0.00010152505446623095,
"loss": 0.0193,
"step": 700
},
{
"epoch": 0.10243370868143843,
"grad_norm": 1.0555495023727417,
"learning_rate": 0.00010225127087872186,
"loss": 0.024,
"step": 705
},
{
"epoch": 0.10316018888485289,
"grad_norm": 0.32569730281829834,
"learning_rate": 0.00010297748729121279,
"loss": 0.0168,
"step": 710
},
{
"epoch": 0.10388666908826734,
"grad_norm": 0.6908342242240906,
"learning_rate": 0.0001037037037037037,
"loss": 0.0359,
"step": 715
},
{
"epoch": 0.1046131492916818,
"grad_norm": 0.044849053025245667,
"learning_rate": 0.00010442992011619463,
"loss": 0.0009,
"step": 720
},
{
"epoch": 0.10533962949509626,
"grad_norm": 0.011313475668430328,
"learning_rate": 0.00010515613652868554,
"loss": 0.0003,
"step": 725
},
{
"epoch": 0.10606610969851071,
"grad_norm": 0.011058060452342033,
"learning_rate": 0.00010588235294117647,
"loss": 0.0047,
"step": 730
},
{
"epoch": 0.10679258990192517,
"grad_norm": 0.013103635981678963,
"learning_rate": 0.00010660856935366741,
"loss": 0.0005,
"step": 735
},
{
"epoch": 0.10751907010533963,
"grad_norm": 0.0076889158226549625,
"learning_rate": 0.00010733478576615832,
"loss": 0.0122,
"step": 740
},
{
"epoch": 0.10824555030875409,
"grad_norm": 0.6621626019477844,
"learning_rate": 0.00010806100217864925,
"loss": 0.0027,
"step": 745
},
{
"epoch": 0.10897203051216854,
"grad_norm": 1.037239670753479,
"learning_rate": 0.00010878721859114016,
"loss": 0.0385,
"step": 750
},
{
"epoch": 0.109698510715583,
"grad_norm": 0.11858850717544556,
"learning_rate": 0.00010951343500363109,
"loss": 0.0522,
"step": 755
},
{
"epoch": 0.11042499091899746,
"grad_norm": 3.902498245239258,
"learning_rate": 0.000110239651416122,
"loss": 0.0037,
"step": 760
},
{
"epoch": 0.11115147112241192,
"grad_norm": 0.03190886229276657,
"learning_rate": 0.00011096586782861293,
"loss": 0.0092,
"step": 765
},
{
"epoch": 0.11187795132582637,
"grad_norm": 0.028368664905428886,
"learning_rate": 0.00011169208424110384,
"loss": 0.0011,
"step": 770
},
{
"epoch": 0.11260443152924082,
"grad_norm": 1.0788954496383667,
"learning_rate": 0.00011241830065359477,
"loss": 0.0205,
"step": 775
},
{
"epoch": 0.11333091173265529,
"grad_norm": 0.10793304443359375,
"learning_rate": 0.00011314451706608571,
"loss": 0.0128,
"step": 780
},
{
"epoch": 0.11405739193606974,
"grad_norm": 1.4622502326965332,
"learning_rate": 0.00011387073347857661,
"loss": 0.0525,
"step": 785
},
{
"epoch": 0.1147838721394842,
"grad_norm": 0.43396472930908203,
"learning_rate": 0.00011459694989106755,
"loss": 0.0016,
"step": 790
},
{
"epoch": 0.11551035234289865,
"grad_norm": 2.9861035346984863,
"learning_rate": 0.00011532316630355846,
"loss": 0.0138,
"step": 795
},
{
"epoch": 0.11623683254631312,
"grad_norm": 14.586094856262207,
"learning_rate": 0.00011604938271604939,
"loss": 0.1007,
"step": 800
},
{
"epoch": 0.11696331274972757,
"grad_norm": 0.014536268077790737,
"learning_rate": 0.0001167755991285403,
"loss": 0.0147,
"step": 805
},
{
"epoch": 0.11768979295314203,
"grad_norm": 0.05354047194123268,
"learning_rate": 0.00011750181554103123,
"loss": 0.0176,
"step": 810
},
{
"epoch": 0.11841627315655648,
"grad_norm": 0.3078368902206421,
"learning_rate": 0.00011822803195352215,
"loss": 0.0049,
"step": 815
},
{
"epoch": 0.11914275335997095,
"grad_norm": 0.011014469899237156,
"learning_rate": 0.00011895424836601307,
"loss": 0.0035,
"step": 820
},
{
"epoch": 0.1198692335633854,
"grad_norm": 0.5486952662467957,
"learning_rate": 0.000119680464778504,
"loss": 0.0168,
"step": 825
},
{
"epoch": 0.12059571376679985,
"grad_norm": 0.02629510499536991,
"learning_rate": 0.00012040668119099491,
"loss": 0.001,
"step": 830
},
{
"epoch": 0.1213221939702143,
"grad_norm": 0.019840385764837265,
"learning_rate": 0.00012113289760348585,
"loss": 0.0037,
"step": 835
},
{
"epoch": 0.12204867417362877,
"grad_norm": 1.3649095296859741,
"learning_rate": 0.00012185911401597675,
"loss": 0.0133,
"step": 840
},
{
"epoch": 0.12277515437704323,
"grad_norm": 0.025183813646435738,
"learning_rate": 0.0001225853304284677,
"loss": 0.0006,
"step": 845
},
{
"epoch": 0.12350163458045768,
"grad_norm": 0.07554338127374649,
"learning_rate": 0.0001233115468409586,
"loss": 0.0041,
"step": 850
},
{
"epoch": 0.12422811478387213,
"grad_norm": 0.04600398242473602,
"learning_rate": 0.00012403776325344953,
"loss": 0.0022,
"step": 855
},
{
"epoch": 0.1249545949872866,
"grad_norm": 4.709814548492432,
"learning_rate": 0.00012476397966594048,
"loss": 0.0037,
"step": 860
},
{
"epoch": 0.12568107519070104,
"grad_norm": 0.020981954410672188,
"learning_rate": 0.00012549019607843137,
"loss": 0.0121,
"step": 865
},
{
"epoch": 0.12640755539411552,
"grad_norm": 0.3170248866081238,
"learning_rate": 0.00012621641249092232,
"loss": 0.0027,
"step": 870
},
{
"epoch": 0.12713403559752998,
"grad_norm": 0.01148161105811596,
"learning_rate": 0.0001269426289034132,
"loss": 0.0005,
"step": 875
},
{
"epoch": 0.12786051580094443,
"grad_norm": 0.005348953418433666,
"learning_rate": 0.00012766884531590416,
"loss": 0.0002,
"step": 880
},
{
"epoch": 0.12858699600435888,
"grad_norm": 3.101860761642456,
"learning_rate": 0.00012839506172839505,
"loss": 0.0038,
"step": 885
},
{
"epoch": 0.12931347620777334,
"grad_norm": 2.680506706237793,
"learning_rate": 0.000129121278140886,
"loss": 0.0472,
"step": 890
},
{
"epoch": 0.1300399564111878,
"grad_norm": 22.780397415161133,
"learning_rate": 0.0001298474945533769,
"loss": 0.0232,
"step": 895
},
{
"epoch": 0.13076643661460224,
"grad_norm": 0.08615617454051971,
"learning_rate": 0.00013057371096586784,
"loss": 0.0278,
"step": 900
},
{
"epoch": 0.1314929168180167,
"grad_norm": 0.4959210455417633,
"learning_rate": 0.00013129992737835876,
"loss": 0.0007,
"step": 905
},
{
"epoch": 0.13221939702143118,
"grad_norm": 0.0067051006481051445,
"learning_rate": 0.00013202614379084968,
"loss": 0.0167,
"step": 910
},
{
"epoch": 0.13294587722484563,
"grad_norm": 10.88768482208252,
"learning_rate": 0.0001327523602033406,
"loss": 0.0682,
"step": 915
},
{
"epoch": 0.13367235742826009,
"grad_norm": 0.007390011567622423,
"learning_rate": 0.00013347857661583152,
"loss": 0.0003,
"step": 920
},
{
"epoch": 0.13439883763167454,
"grad_norm": 0.12825822830200195,
"learning_rate": 0.00013420479302832244,
"loss": 0.0244,
"step": 925
},
{
"epoch": 0.135125317835089,
"grad_norm": 0.8949776291847229,
"learning_rate": 0.00013493100944081336,
"loss": 0.0214,
"step": 930
},
{
"epoch": 0.13585179803850345,
"grad_norm": 0.007870912551879883,
"learning_rate": 0.00013565722585330429,
"loss": 0.0004,
"step": 935
},
{
"epoch": 0.1365782782419179,
"grad_norm": 0.013382726348936558,
"learning_rate": 0.0001363834422657952,
"loss": 0.0004,
"step": 940
},
{
"epoch": 0.13730475844533235,
"grad_norm": 0.037289395928382874,
"learning_rate": 0.00013710965867828613,
"loss": 0.0012,
"step": 945
},
{
"epoch": 0.13803123864874683,
"grad_norm": 0.9494091272354126,
"learning_rate": 0.00013783587509077707,
"loss": 0.0249,
"step": 950
},
{
"epoch": 0.1387577188521613,
"grad_norm": 2.1269211769104004,
"learning_rate": 0.00013856209150326797,
"loss": 0.0041,
"step": 955
},
{
"epoch": 0.13948419905557574,
"grad_norm": 0.03475005179643631,
"learning_rate": 0.00013928830791575892,
"loss": 0.0039,
"step": 960
},
{
"epoch": 0.1402106792589902,
"grad_norm": 0.778325080871582,
"learning_rate": 0.0001400145243282498,
"loss": 0.0044,
"step": 965
},
{
"epoch": 0.14093715946240465,
"grad_norm": 0.06391960382461548,
"learning_rate": 0.00014074074074074076,
"loss": 0.0011,
"step": 970
},
{
"epoch": 0.1416636396658191,
"grad_norm": 0.015311076305806637,
"learning_rate": 0.00014146695715323165,
"loss": 0.0156,
"step": 975
},
{
"epoch": 0.14239011986923356,
"grad_norm": 0.005620781797915697,
"learning_rate": 0.0001421931735657226,
"loss": 0.0005,
"step": 980
},
{
"epoch": 0.143116600072648,
"grad_norm": 0.006361651234328747,
"learning_rate": 0.00014291938997821352,
"loss": 0.0173,
"step": 985
},
{
"epoch": 0.1438430802760625,
"grad_norm": 0.6632714867591858,
"learning_rate": 0.00014364560639070444,
"loss": 0.0008,
"step": 990
},
{
"epoch": 0.14456956047947694,
"grad_norm": 3.7890255451202393,
"learning_rate": 0.00014437182280319536,
"loss": 0.0591,
"step": 995
},
{
"epoch": 0.1452960406828914,
"grad_norm": 4.573298454284668,
"learning_rate": 0.00014509803921568628,
"loss": 0.0182,
"step": 1000
},
{
"epoch": 0.14602252088630585,
"grad_norm": 0.18653298914432526,
"learning_rate": 0.0001458242556281772,
"loss": 0.011,
"step": 1005
},
{
"epoch": 0.1467490010897203,
"grad_norm": 0.0030135358683764935,
"learning_rate": 0.00014655047204066812,
"loss": 0.017,
"step": 1010
},
{
"epoch": 0.14747548129313476,
"grad_norm": 13.294329643249512,
"learning_rate": 0.00014727668845315904,
"loss": 0.0359,
"step": 1015
},
{
"epoch": 0.1482019614965492,
"grad_norm": 1.0047153234481812,
"learning_rate": 0.00014800290486564996,
"loss": 0.0014,
"step": 1020
},
{
"epoch": 0.14892844169996367,
"grad_norm": 0.0042244489304721355,
"learning_rate": 0.00014872912127814088,
"loss": 0.0008,
"step": 1025
},
{
"epoch": 0.14965492190337815,
"grad_norm": 0.005744027905166149,
"learning_rate": 0.00014945533769063183,
"loss": 0.0005,
"step": 1030
},
{
"epoch": 0.1503814021067926,
"grad_norm": 0.0027218873146921396,
"learning_rate": 0.00015018155410312272,
"loss": 0.0009,
"step": 1035
},
{
"epoch": 0.15110788231020705,
"grad_norm": 1.5683510303497314,
"learning_rate": 0.00015090777051561367,
"loss": 0.0009,
"step": 1040
},
{
"epoch": 0.1518343625136215,
"grad_norm": 0.0024358402006328106,
"learning_rate": 0.00015163398692810456,
"loss": 0.0008,
"step": 1045
},
{
"epoch": 0.15256084271703596,
"grad_norm": 0.0035784540232270956,
"learning_rate": 0.0001523602033405955,
"loss": 0.0217,
"step": 1050
},
{
"epoch": 0.15328732292045041,
"grad_norm": 0.017342494800686836,
"learning_rate": 0.0001530864197530864,
"loss": 0.0002,
"step": 1055
},
{
"epoch": 0.15401380312386487,
"grad_norm": 0.0023592431098222733,
"learning_rate": 0.00015381263616557735,
"loss": 0.0001,
"step": 1060
},
{
"epoch": 0.15474028332727932,
"grad_norm": 0.0029132033232599497,
"learning_rate": 0.00015453885257806827,
"loss": 0.0002,
"step": 1065
},
{
"epoch": 0.1554667635306938,
"grad_norm": 5.089969158172607,
"learning_rate": 0.0001552650689905592,
"loss": 0.0124,
"step": 1070
},
{
"epoch": 0.15619324373410826,
"grad_norm": 0.0020955149084329605,
"learning_rate": 0.00015599128540305012,
"loss": 0.0002,
"step": 1075
},
{
"epoch": 0.1569197239375227,
"grad_norm": 0.001827805070206523,
"learning_rate": 0.00015671750181554104,
"loss": 0.0026,
"step": 1080
},
{
"epoch": 0.15764620414093716,
"grad_norm": 0.0018593213753774762,
"learning_rate": 0.00015744371822803196,
"loss": 0.0001,
"step": 1085
},
{
"epoch": 0.15837268434435162,
"grad_norm": 8.548373222351074,
"learning_rate": 0.00015816993464052288,
"loss": 0.0116,
"step": 1090
},
{
"epoch": 0.15909916454776607,
"grad_norm": 0.003052167361602187,
"learning_rate": 0.0001588961510530138,
"loss": 0.023,
"step": 1095
},
{
"epoch": 0.15982564475118052,
"grad_norm": 0.01510961726307869,
"learning_rate": 0.00015962236746550472,
"loss": 0.0003,
"step": 1100
},
{
"epoch": 0.16055212495459498,
"grad_norm": 0.006872969213873148,
"learning_rate": 0.00016034858387799564,
"loss": 0.0008,
"step": 1105
},
{
"epoch": 0.16127860515800943,
"grad_norm": 0.0075376201421022415,
"learning_rate": 0.00016107480029048659,
"loss": 0.001,
"step": 1110
},
{
"epoch": 0.1620050853614239,
"grad_norm": 1.308592438697815,
"learning_rate": 0.00016180101670297748,
"loss": 0.0206,
"step": 1115
},
{
"epoch": 0.16273156556483837,
"grad_norm": 0.01441910769790411,
"learning_rate": 0.00016252723311546843,
"loss": 0.0003,
"step": 1120
},
{
"epoch": 0.16345804576825282,
"grad_norm": 4.73635721206665,
"learning_rate": 0.00016325344952795935,
"loss": 0.0048,
"step": 1125
},
{
"epoch": 0.16418452597166727,
"grad_norm": 0.07317811995744705,
"learning_rate": 0.00016397966594045027,
"loss": 0.0079,
"step": 1130
},
{
"epoch": 0.16491100617508173,
"grad_norm": 3.066941976547241,
"learning_rate": 0.0001647058823529412,
"loss": 0.0245,
"step": 1135
},
{
"epoch": 0.16563748637849618,
"grad_norm": 0.20101045072078705,
"learning_rate": 0.0001654320987654321,
"loss": 0.0538,
"step": 1140
},
{
"epoch": 0.16636396658191063,
"grad_norm": 0.03498254343867302,
"learning_rate": 0.00016615831517792303,
"loss": 0.0009,
"step": 1145
},
{
"epoch": 0.1670904467853251,
"grad_norm": 0.044696319848299026,
"learning_rate": 0.00016688453159041395,
"loss": 0.001,
"step": 1150
},
{
"epoch": 0.16781692698873957,
"grad_norm": 0.005176996346563101,
"learning_rate": 0.00016761074800290487,
"loss": 0.001,
"step": 1155
},
{
"epoch": 0.16854340719215402,
"grad_norm": 0.0034458874724805355,
"learning_rate": 0.0001683369644153958,
"loss": 0.0295,
"step": 1160
},
{
"epoch": 0.16926988739556847,
"grad_norm": 0.01240626908838749,
"learning_rate": 0.0001690631808278867,
"loss": 0.0055,
"step": 1165
},
{
"epoch": 0.16999636759898293,
"grad_norm": 0.0073911272920668125,
"learning_rate": 0.00016978939724037763,
"loss": 0.0002,
"step": 1170
},
{
"epoch": 0.17072284780239738,
"grad_norm": 0.010020995512604713,
"learning_rate": 0.00017051561365286855,
"loss": 0.0002,
"step": 1175
},
{
"epoch": 0.17144932800581184,
"grad_norm": 0.0028329354245215654,
"learning_rate": 0.00017124183006535947,
"loss": 0.0002,
"step": 1180
},
{
"epoch": 0.1721758082092263,
"grad_norm": 0.009768263436853886,
"learning_rate": 0.00017196804647785042,
"loss": 0.0001,
"step": 1185
},
{
"epoch": 0.17290228841264074,
"grad_norm": 0.006985844578593969,
"learning_rate": 0.00017269426289034134,
"loss": 0.0001,
"step": 1190
},
{
"epoch": 0.17362876861605522,
"grad_norm": 0.003910423722118139,
"learning_rate": 0.00017342047930283226,
"loss": 0.0001,
"step": 1195
},
{
"epoch": 0.17435524881946968,
"grad_norm": 0.0018550670938566327,
"learning_rate": 0.00017414669571532318,
"loss": 0.0001,
"step": 1200
},
{
"epoch": 0.17508172902288413,
"grad_norm": 0.003561209188774228,
"learning_rate": 0.0001748729121278141,
"loss": 0.0001,
"step": 1205
},
{
"epoch": 0.17580820922629858,
"grad_norm": 0.0017712870612740517,
"learning_rate": 0.00017559912854030502,
"loss": 0.0001,
"step": 1210
},
{
"epoch": 0.17653468942971304,
"grad_norm": 0.002323460765182972,
"learning_rate": 0.00017632534495279595,
"loss": 0.0001,
"step": 1215
},
{
"epoch": 0.1772611696331275,
"grad_norm": 0.0017775703454390168,
"learning_rate": 0.00017705156136528687,
"loss": 0.0004,
"step": 1220
},
{
"epoch": 0.17798764983654194,
"grad_norm": 0.003454179735854268,
"learning_rate": 0.00017777777777777779,
"loss": 0.0001,
"step": 1225
},
{
"epoch": 0.1787141300399564,
"grad_norm": 0.003128621494397521,
"learning_rate": 0.0001785039941902687,
"loss": 0.0001,
"step": 1230
},
{
"epoch": 0.17944061024337088,
"grad_norm": 0.013285885564982891,
"learning_rate": 0.00017923021060275963,
"loss": 0.0001,
"step": 1235
},
{
"epoch": 0.18016709044678533,
"grad_norm": 0.0012834910303354263,
"learning_rate": 0.00017995642701525055,
"loss": 0.0004,
"step": 1240
},
{
"epoch": 0.1808935706501998,
"grad_norm": 0.0010866275988519192,
"learning_rate": 0.0001806826434277415,
"loss": 0.0001,
"step": 1245
},
{
"epoch": 0.18162005085361424,
"grad_norm": 0.0010630824835970998,
"learning_rate": 0.0001814088598402324,
"loss": 0.0,
"step": 1250
},
{
"epoch": 0.1823465310570287,
"grad_norm": 0.0011757917236536741,
"learning_rate": 0.00018213507625272334,
"loss": 0.0,
"step": 1255
},
{
"epoch": 0.18307301126044315,
"grad_norm": 0.0009444226743653417,
"learning_rate": 0.00018286129266521423,
"loss": 0.008,
"step": 1260
},
{
"epoch": 0.1837994914638576,
"grad_norm": 0.0011839661747217178,
"learning_rate": 0.00018358750907770518,
"loss": 0.0,
"step": 1265
},
{
"epoch": 0.18452597166727205,
"grad_norm": 0.0008903779671527445,
"learning_rate": 0.00018431372549019607,
"loss": 0.0,
"step": 1270
},
{
"epoch": 0.18525245187068654,
"grad_norm": 0.0010285211028531194,
"learning_rate": 0.00018503994190268702,
"loss": 0.0001,
"step": 1275
},
{
"epoch": 0.185978932074101,
"grad_norm": 0.0016522291116416454,
"learning_rate": 0.00018576615831517794,
"loss": 0.0338,
"step": 1280
},
{
"epoch": 0.18670541227751544,
"grad_norm": 0.001982170157134533,
"learning_rate": 0.00018649237472766886,
"loss": 0.0009,
"step": 1285
},
{
"epoch": 0.1874318924809299,
"grad_norm": 0.002856120467185974,
"learning_rate": 0.00018721859114015978,
"loss": 0.0001,
"step": 1290
},
{
"epoch": 0.18815837268434435,
"grad_norm": 6.8815484046936035,
"learning_rate": 0.0001879448075526507,
"loss": 0.0064,
"step": 1295
},
{
"epoch": 0.1888848528877588,
"grad_norm": 0.002711124252527952,
"learning_rate": 0.00018867102396514162,
"loss": 0.0001,
"step": 1300
},
{
"epoch": 0.18961133309117326,
"grad_norm": 0.01453580055385828,
"learning_rate": 0.00018939724037763254,
"loss": 0.0002,
"step": 1305
},
{
"epoch": 0.1903378132945877,
"grad_norm": 0.004619908984750509,
"learning_rate": 0.00019012345679012346,
"loss": 0.0108,
"step": 1310
},
{
"epoch": 0.1910642934980022,
"grad_norm": 0.002147579798474908,
"learning_rate": 0.0001908496732026144,
"loss": 0.0002,
"step": 1315
},
{
"epoch": 0.19179077370141664,
"grad_norm": 0.006444690283387899,
"learning_rate": 0.0001915758896151053,
"loss": 0.0013,
"step": 1320
},
{
"epoch": 0.1925172539048311,
"grad_norm": 0.0015877482946962118,
"learning_rate": 0.00019230210602759625,
"loss": 0.0011,
"step": 1325
},
{
"epoch": 0.19324373410824555,
"grad_norm": 2.8192436695098877,
"learning_rate": 0.00019302832244008715,
"loss": 0.0018,
"step": 1330
},
{
"epoch": 0.19397021431166,
"grad_norm": 6.506179332733154,
"learning_rate": 0.0001937545388525781,
"loss": 0.0067,
"step": 1335
},
{
"epoch": 0.19469669451507446,
"grad_norm": 0.0016660846304148436,
"learning_rate": 0.00019448075526506899,
"loss": 0.0001,
"step": 1340
},
{
"epoch": 0.1954231747184889,
"grad_norm": 0.0011433791369199753,
"learning_rate": 0.00019520697167755993,
"loss": 0.0141,
"step": 1345
},
{
"epoch": 0.19614965492190337,
"grad_norm": 0.001556798000819981,
"learning_rate": 0.00019593318809005083,
"loss": 0.0,
"step": 1350
},
{
"epoch": 0.19687613512531785,
"grad_norm": 0.0035784265492111444,
"learning_rate": 0.00019665940450254178,
"loss": 0.0001,
"step": 1355
},
{
"epoch": 0.1976026153287323,
"grad_norm": 4.246982097625732,
"learning_rate": 0.0001973856209150327,
"loss": 0.0068,
"step": 1360
},
{
"epoch": 0.19832909553214675,
"grad_norm": 0.001589273801073432,
"learning_rate": 0.00019811183732752362,
"loss": 0.012,
"step": 1365
},
{
"epoch": 0.1990555757355612,
"grad_norm": 0.0008454394992440939,
"learning_rate": 0.00019883805374001454,
"loss": 0.0001,
"step": 1370
},
{
"epoch": 0.19978205593897566,
"grad_norm": 0.0013743549352511764,
"learning_rate": 0.00019956427015250546,
"loss": 0.0001,
"step": 1375
},
{
"epoch": 0.20007264802034144,
"eval_accuracy": 0.9996397787212145,
"eval_f1": 0.9995310825294748,
"eval_loss": 0.0019488565158098936,
"eval_precision": 0.9990626046200201,
"eval_recall": 1.0,
"eval_runtime": 124.3971,
"eval_samples_per_second": 312.427,
"eval_steps_per_second": 2.444,
"step": 1377
},
{
"epoch": 0.20050853614239011,
"grad_norm": 0.002287400420755148,
"learning_rate": 0.00019996771329405116,
"loss": 0.0093,
"step": 1380
},
{
"epoch": 0.20123501634580457,
"grad_norm": 0.004998628981411457,
"learning_rate": 0.00019988699652917914,
"loss": 0.0159,
"step": 1385
},
{
"epoch": 0.20196149654921902,
"grad_norm": 0.003076898632571101,
"learning_rate": 0.00019980627976430706,
"loss": 0.0073,
"step": 1390
},
{
"epoch": 0.2026879767526335,
"grad_norm": 0.7471761107444763,
"learning_rate": 0.00019972556299943498,
"loss": 0.0033,
"step": 1395
},
{
"epoch": 0.20341445695604796,
"grad_norm": 0.1988172084093094,
"learning_rate": 0.00019964484623456293,
"loss": 0.0009,
"step": 1400
},
{
"epoch": 0.2041409371594624,
"grad_norm": 0.005002092570066452,
"learning_rate": 0.00019956412946969088,
"loss": 0.0009,
"step": 1405
},
{
"epoch": 0.20486741736287686,
"grad_norm": 0.24179202318191528,
"learning_rate": 0.0001994834127048188,
"loss": 0.0182,
"step": 1410
},
{
"epoch": 0.20559389756629132,
"grad_norm": 4.140319347381592,
"learning_rate": 0.00019940269593994673,
"loss": 0.0052,
"step": 1415
},
{
"epoch": 0.20632037776970577,
"grad_norm": 0.0015831501223146915,
"learning_rate": 0.00019932197917507468,
"loss": 0.0,
"step": 1420
},
{
"epoch": 0.20704685797312022,
"grad_norm": 0.0023513727355748415,
"learning_rate": 0.0001992412624102026,
"loss": 0.0001,
"step": 1425
},
{
"epoch": 0.20777333817653468,
"grad_norm": 0.0018358491361141205,
"learning_rate": 0.00019916054564533055,
"loss": 0.0217,
"step": 1430
},
{
"epoch": 0.20849981837994916,
"grad_norm": 2.7655224800109863,
"learning_rate": 0.00019907982888045847,
"loss": 0.0393,
"step": 1435
},
{
"epoch": 0.2092262985833636,
"grad_norm": 0.00683799060061574,
"learning_rate": 0.00019899911211558642,
"loss": 0.0002,
"step": 1440
},
{
"epoch": 0.20995277878677807,
"grad_norm": 0.011541269719600677,
"learning_rate": 0.00019891839535071435,
"loss": 0.0045,
"step": 1445
},
{
"epoch": 0.21067925899019252,
"grad_norm": 0.013042348437011242,
"learning_rate": 0.00019883767858584227,
"loss": 0.0003,
"step": 1450
},
{
"epoch": 0.21140573919360697,
"grad_norm": 0.01146721187978983,
"learning_rate": 0.00019875696182097025,
"loss": 0.0009,
"step": 1455
},
{
"epoch": 0.21213221939702143,
"grad_norm": 0.0053860582411289215,
"learning_rate": 0.00019867624505609817,
"loss": 0.003,
"step": 1460
},
{
"epoch": 0.21285869960043588,
"grad_norm": 0.35763925313949585,
"learning_rate": 0.0001985955282912261,
"loss": 0.0066,
"step": 1465
},
{
"epoch": 0.21358517980385033,
"grad_norm": 0.003207879839465022,
"learning_rate": 0.00019851481152635401,
"loss": 0.0002,
"step": 1470
},
{
"epoch": 0.21431166000726481,
"grad_norm": 0.004152906127274036,
"learning_rate": 0.00019843409476148196,
"loss": 0.0001,
"step": 1475
},
{
"epoch": 0.21503814021067927,
"grad_norm": 0.003981268033385277,
"learning_rate": 0.00019835337799660991,
"loss": 0.0002,
"step": 1480
},
{
"epoch": 0.21576462041409372,
"grad_norm": 0.0030321148224174976,
"learning_rate": 0.00019827266123173784,
"loss": 0.0004,
"step": 1485
},
{
"epoch": 0.21649110061750818,
"grad_norm": 0.0033642794005572796,
"learning_rate": 0.00019819194446686579,
"loss": 0.0002,
"step": 1490
},
{
"epoch": 0.21721758082092263,
"grad_norm": 0.0015044253086671233,
"learning_rate": 0.0001981112277019937,
"loss": 0.0,
"step": 1495
},
{
"epoch": 0.21794406102433708,
"grad_norm": 0.0013194256462156773,
"learning_rate": 0.00019803051093712166,
"loss": 0.0064,
"step": 1500
},
{
"epoch": 0.21867054122775154,
"grad_norm": 0.003604642581194639,
"learning_rate": 0.00019794979417224958,
"loss": 0.0001,
"step": 1505
},
{
"epoch": 0.219397021431166,
"grad_norm": 0.002144684549421072,
"learning_rate": 0.00019786907740737753,
"loss": 0.0011,
"step": 1510
},
{
"epoch": 0.22012350163458047,
"grad_norm": 0.00234671076759696,
"learning_rate": 0.00019778836064250545,
"loss": 0.0001,
"step": 1515
},
{
"epoch": 0.22084998183799492,
"grad_norm": 0.027411388233304024,
"learning_rate": 0.00019770764387763338,
"loss": 0.0002,
"step": 1520
},
{
"epoch": 0.22157646204140938,
"grad_norm": 0.00431784288957715,
"learning_rate": 0.00019762692711276135,
"loss": 0.0016,
"step": 1525
},
{
"epoch": 0.22230294224482383,
"grad_norm": 0.007216178812086582,
"learning_rate": 0.00019754621034788928,
"loss": 0.0031,
"step": 1530
},
{
"epoch": 0.22302942244823828,
"grad_norm": 0.0020561974961310625,
"learning_rate": 0.0001974654935830172,
"loss": 0.0,
"step": 1535
},
{
"epoch": 0.22375590265165274,
"grad_norm": 0.003935552202165127,
"learning_rate": 0.00019738477681814512,
"loss": 0.0,
"step": 1540
},
{
"epoch": 0.2244823828550672,
"grad_norm": 0.0017273337580263615,
"learning_rate": 0.00019730406005327307,
"loss": 0.0018,
"step": 1545
},
{
"epoch": 0.22520886305848165,
"grad_norm": 0.0009397296234965324,
"learning_rate": 0.00019722334328840102,
"loss": 0.0071,
"step": 1550
},
{
"epoch": 0.22593534326189613,
"grad_norm": 4.2714738845825195,
"learning_rate": 0.00019714262652352894,
"loss": 0.0043,
"step": 1555
},
{
"epoch": 0.22666182346531058,
"grad_norm": 0.008737271651625633,
"learning_rate": 0.0001970619097586569,
"loss": 0.0,
"step": 1560
},
{
"epoch": 0.22738830366872503,
"grad_norm": 0.0011167083866894245,
"learning_rate": 0.00019698119299378482,
"loss": 0.0195,
"step": 1565
},
{
"epoch": 0.2281147838721395,
"grad_norm": 0.0015777769731357694,
"learning_rate": 0.00019690047622891274,
"loss": 0.0093,
"step": 1570
},
{
"epoch": 0.22884126407555394,
"grad_norm": 1.3581019639968872,
"learning_rate": 0.0001968197594640407,
"loss": 0.0371,
"step": 1575
},
{
"epoch": 0.2295677442789684,
"grad_norm": 0.005585103295743465,
"learning_rate": 0.00019673904269916864,
"loss": 0.0003,
"step": 1580
},
{
"epoch": 0.23029422448238285,
"grad_norm": 0.013055490329861641,
"learning_rate": 0.00019665832593429656,
"loss": 0.0009,
"step": 1585
},
{
"epoch": 0.2310207046857973,
"grad_norm": 0.012752565555274487,
"learning_rate": 0.00019657760916942448,
"loss": 0.0057,
"step": 1590
},
{
"epoch": 0.23174718488921178,
"grad_norm": 0.016765527427196503,
"learning_rate": 0.00019649689240455243,
"loss": 0.0053,
"step": 1595
},
{
"epoch": 0.23247366509262624,
"grad_norm": 0.04566654935479164,
"learning_rate": 0.00019641617563968038,
"loss": 0.0031,
"step": 1600
},
{
"epoch": 0.2332001452960407,
"grad_norm": 0.05058443173766136,
"learning_rate": 0.0001963354588748083,
"loss": 0.0064,
"step": 1605
},
{
"epoch": 0.23392662549945514,
"grad_norm": 0.006236894056200981,
"learning_rate": 0.00019625474210993623,
"loss": 0.0202,
"step": 1610
},
{
"epoch": 0.2346531057028696,
"grad_norm": 0.00453936355188489,
"learning_rate": 0.00019617402534506418,
"loss": 0.0002,
"step": 1615
},
{
"epoch": 0.23537958590628405,
"grad_norm": 0.01652829721570015,
"learning_rate": 0.00019609330858019213,
"loss": 0.0005,
"step": 1620
},
{
"epoch": 0.2361060661096985,
"grad_norm": 0.28086262941360474,
"learning_rate": 0.00019601259181532005,
"loss": 0.0012,
"step": 1625
},
{
"epoch": 0.23683254631311296,
"grad_norm": 0.002964381594210863,
"learning_rate": 0.000195931875050448,
"loss": 0.0001,
"step": 1630
},
{
"epoch": 0.23755902651652744,
"grad_norm": 0.004744562786072493,
"learning_rate": 0.00019585115828557592,
"loss": 0.0001,
"step": 1635
},
{
"epoch": 0.2382855067199419,
"grad_norm": 0.002022289205342531,
"learning_rate": 0.00019577044152070385,
"loss": 0.0003,
"step": 1640
},
{
"epoch": 0.23901198692335635,
"grad_norm": 1.325679063796997,
"learning_rate": 0.0001956897247558318,
"loss": 0.0223,
"step": 1645
},
{
"epoch": 0.2397384671267708,
"grad_norm": 0.005906618200242519,
"learning_rate": 0.00019560900799095974,
"loss": 0.0004,
"step": 1650
},
{
"epoch": 0.24046494733018525,
"grad_norm": 0.022973524406552315,
"learning_rate": 0.00019552829122608767,
"loss": 0.0004,
"step": 1655
},
{
"epoch": 0.2411914275335997,
"grad_norm": 0.017179157584905624,
"learning_rate": 0.0001954475744612156,
"loss": 0.0005,
"step": 1660
},
{
"epoch": 0.24191790773701416,
"grad_norm": 0.011254935525357723,
"learning_rate": 0.00019536685769634354,
"loss": 0.0008,
"step": 1665
},
{
"epoch": 0.2426443879404286,
"grad_norm": 0.004135392606258392,
"learning_rate": 0.0001952861409314715,
"loss": 0.0003,
"step": 1670
},
{
"epoch": 0.24337086814384307,
"grad_norm": 0.002715233713388443,
"learning_rate": 0.0001952054241665994,
"loss": 0.0002,
"step": 1675
},
{
"epoch": 0.24409734834725755,
"grad_norm": 0.00374965975061059,
"learning_rate": 0.00019512470740172734,
"loss": 0.0001,
"step": 1680
},
{
"epoch": 0.244823828550672,
"grad_norm": 0.0033891089260578156,
"learning_rate": 0.00019504399063685528,
"loss": 0.0001,
"step": 1685
},
{
"epoch": 0.24555030875408645,
"grad_norm": 0.001574166351929307,
"learning_rate": 0.0001949632738719832,
"loss": 0.0001,
"step": 1690
},
{
"epoch": 0.2462767889575009,
"grad_norm": 0.001289655570872128,
"learning_rate": 0.00019488255710711116,
"loss": 0.0,
"step": 1695
},
{
"epoch": 0.24700326916091536,
"grad_norm": 0.0012494047405198216,
"learning_rate": 0.0001948018403422391,
"loss": 0.0,
"step": 1700
},
{
"epoch": 0.24772974936432982,
"grad_norm": 0.0028091860003769398,
"learning_rate": 0.00019472112357736703,
"loss": 0.0,
"step": 1705
},
{
"epoch": 0.24845622956774427,
"grad_norm": 0.0020063440315425396,
"learning_rate": 0.00019464040681249495,
"loss": 0.0,
"step": 1710
},
{
"epoch": 0.24918270977115872,
"grad_norm": 0.00732283852994442,
"learning_rate": 0.0001945596900476229,
"loss": 0.0001,
"step": 1715
},
{
"epoch": 0.2499091899745732,
"grad_norm": 0.0009436274995096028,
"learning_rate": 0.00019447897328275085,
"loss": 0.0,
"step": 1720
},
{
"epoch": 0.25063567017798766,
"grad_norm": 0.001065302756614983,
"learning_rate": 0.00019439825651787877,
"loss": 0.0,
"step": 1725
},
{
"epoch": 0.2513621503814021,
"grad_norm": 0.0007398009183816612,
"learning_rate": 0.0001943175397530067,
"loss": 0.0,
"step": 1730
},
{
"epoch": 0.25208863058481656,
"grad_norm": 0.0009731051395647228,
"learning_rate": 0.00019423682298813465,
"loss": 0.0001,
"step": 1735
},
{
"epoch": 0.25281511078823105,
"grad_norm": 0.0006832171930000186,
"learning_rate": 0.0001941561062232626,
"loss": 0.0,
"step": 1740
},
{
"epoch": 0.25354159099164547,
"grad_norm": 0.0011063286801800132,
"learning_rate": 0.00019407538945839052,
"loss": 0.0,
"step": 1745
},
{
"epoch": 0.25426807119505995,
"grad_norm": 0.0012475239345803857,
"learning_rate": 0.00019399467269351844,
"loss": 0.0,
"step": 1750
},
{
"epoch": 0.2549945513984744,
"grad_norm": 0.0008868346340022981,
"learning_rate": 0.0001939139559286464,
"loss": 0.0,
"step": 1755
},
{
"epoch": 0.25572103160188886,
"grad_norm": 0.0013618022203445435,
"learning_rate": 0.00019383323916377431,
"loss": 0.0,
"step": 1760
},
{
"epoch": 0.2564475118053033,
"grad_norm": 0.0008328580879606307,
"learning_rate": 0.00019375252239890226,
"loss": 0.0,
"step": 1765
},
{
"epoch": 0.25717399200871777,
"grad_norm": 0.0017324545187875628,
"learning_rate": 0.00019367180563403021,
"loss": 0.0,
"step": 1770
},
{
"epoch": 0.2579004722121322,
"grad_norm": 0.0010712060611695051,
"learning_rate": 0.00019359108886915814,
"loss": 0.0,
"step": 1775
},
{
"epoch": 0.2586269524155467,
"grad_norm": 0.0005095238448120654,
"learning_rate": 0.00019351037210428606,
"loss": 0.0,
"step": 1780
},
{
"epoch": 0.25935343261896115,
"grad_norm": 0.0014343465445563197,
"learning_rate": 0.00019342965533941398,
"loss": 0.0,
"step": 1785
},
{
"epoch": 0.2600799128223756,
"grad_norm": 0.0007007729145698249,
"learning_rate": 0.00019334893857454196,
"loss": 0.0,
"step": 1790
},
{
"epoch": 0.26080639302579006,
"grad_norm": 0.0005924066063016653,
"learning_rate": 0.00019326822180966988,
"loss": 0.0,
"step": 1795
},
{
"epoch": 0.2615328732292045,
"grad_norm": 0.0004458896000869572,
"learning_rate": 0.0001931875050447978,
"loss": 0.0,
"step": 1800
},
{
"epoch": 0.26225935343261897,
"grad_norm": 0.005087355151772499,
"learning_rate": 0.00019310678827992575,
"loss": 0.0,
"step": 1805
},
{
"epoch": 0.2629858336360334,
"grad_norm": 0.11598234623670578,
"learning_rate": 0.0001930260715150537,
"loss": 0.0004,
"step": 1810
},
{
"epoch": 0.2637123138394479,
"grad_norm": 0.0015027482295408845,
"learning_rate": 0.00019294535475018163,
"loss": 0.0473,
"step": 1815
},
{
"epoch": 0.26443879404286236,
"grad_norm": 0.04484843090176582,
"learning_rate": 0.00019286463798530955,
"loss": 0.0003,
"step": 1820
},
{
"epoch": 0.2651652742462768,
"grad_norm": 0.007797603961080313,
"learning_rate": 0.0001927839212204375,
"loss": 0.0031,
"step": 1825
},
{
"epoch": 0.26589175444969126,
"grad_norm": 0.006486868020147085,
"learning_rate": 0.00019270320445556542,
"loss": 0.0003,
"step": 1830
},
{
"epoch": 0.2666182346531057,
"grad_norm": 0.005536284297704697,
"learning_rate": 0.00019262248769069337,
"loss": 0.0004,
"step": 1835
},
{
"epoch": 0.26734471485652017,
"grad_norm": 0.014443649910390377,
"learning_rate": 0.00019254177092582132,
"loss": 0.0001,
"step": 1840
},
{
"epoch": 0.2680711950599346,
"grad_norm": 0.0030865217559039593,
"learning_rate": 0.00019246105416094924,
"loss": 0.0001,
"step": 1845
},
{
"epoch": 0.2687976752633491,
"grad_norm": 0.15668638050556183,
"learning_rate": 0.00019238033739607717,
"loss": 0.0002,
"step": 1850
},
{
"epoch": 0.2695241554667635,
"grad_norm": 0.04532123729586601,
"learning_rate": 0.0001922996206312051,
"loss": 0.0003,
"step": 1855
},
{
"epoch": 0.270250635670178,
"grad_norm": 0.00196210783906281,
"learning_rate": 0.00019221890386633307,
"loss": 0.0001,
"step": 1860
},
{
"epoch": 0.27097711587359247,
"grad_norm": 0.0017535451333969831,
"learning_rate": 0.000192138187101461,
"loss": 0.0012,
"step": 1865
},
{
"epoch": 0.2717035960770069,
"grad_norm": 0.0014856884954497218,
"learning_rate": 0.0001920574703365889,
"loss": 0.0002,
"step": 1870
},
{
"epoch": 0.2724300762804214,
"grad_norm": 0.004271077457815409,
"learning_rate": 0.00019197675357171686,
"loss": 0.0049,
"step": 1875
},
{
"epoch": 0.2731565564838358,
"grad_norm": 0.009308665059506893,
"learning_rate": 0.00019189603680684478,
"loss": 0.0013,
"step": 1880
},
{
"epoch": 0.2738830366872503,
"grad_norm": 0.001470932038500905,
"learning_rate": 0.00019181532004197273,
"loss": 0.0009,
"step": 1885
},
{
"epoch": 0.2746095168906647,
"grad_norm": 0.0009906482882797718,
"learning_rate": 0.00019173460327710066,
"loss": 0.0074,
"step": 1890
},
{
"epoch": 0.2753359970940792,
"grad_norm": 0.5366028547286987,
"learning_rate": 0.0001916538865122286,
"loss": 0.0004,
"step": 1895
},
{
"epoch": 0.27606247729749367,
"grad_norm": 0.0012202219804748893,
"learning_rate": 0.00019157316974735653,
"loss": 0.0007,
"step": 1900
},
{
"epoch": 0.2767889575009081,
"grad_norm": 0.5043062567710876,
"learning_rate": 0.00019149245298248445,
"loss": 0.0007,
"step": 1905
},
{
"epoch": 0.2775154377043226,
"grad_norm": 0.0006929966621100903,
"learning_rate": 0.0001914117362176124,
"loss": 0.01,
"step": 1910
},
{
"epoch": 0.278241917907737,
"grad_norm": 0.0005868257721886039,
"learning_rate": 0.00019133101945274035,
"loss": 0.0,
"step": 1915
},
{
"epoch": 0.2789683981111515,
"grad_norm": 1.5353443622589111,
"learning_rate": 0.00019125030268786827,
"loss": 0.0012,
"step": 1920
},
{
"epoch": 0.2796948783145659,
"grad_norm": 0.0007161126704886556,
"learning_rate": 0.0001911695859229962,
"loss": 0.0,
"step": 1925
},
{
"epoch": 0.2804213585179804,
"grad_norm": 0.0007424887735396624,
"learning_rate": 0.00019108886915812417,
"loss": 0.0,
"step": 1930
},
{
"epoch": 0.2811478387213948,
"grad_norm": 0.0006449614884331822,
"learning_rate": 0.0001910081523932521,
"loss": 0.0,
"step": 1935
},
{
"epoch": 0.2818743189248093,
"grad_norm": 0.0006138585740700364,
"learning_rate": 0.00019092743562838002,
"loss": 0.0,
"step": 1940
},
{
"epoch": 0.2826007991282238,
"grad_norm": 0.0006936938152648509,
"learning_rate": 0.00019084671886350797,
"loss": 0.0,
"step": 1945
},
{
"epoch": 0.2833272793316382,
"grad_norm": 0.0004829142999369651,
"learning_rate": 0.0001907660020986359,
"loss": 0.0001,
"step": 1950
},
{
"epoch": 0.2840537595350527,
"grad_norm": 0.0005034743226133287,
"learning_rate": 0.00019068528533376384,
"loss": 0.0,
"step": 1955
},
{
"epoch": 0.2847802397384671,
"grad_norm": 0.0004061743093188852,
"learning_rate": 0.00019060456856889176,
"loss": 0.0,
"step": 1960
},
{
"epoch": 0.2855067199418816,
"grad_norm": 0.6731203198432922,
"learning_rate": 0.0001905238518040197,
"loss": 0.0282,
"step": 1965
},
{
"epoch": 0.286233200145296,
"grad_norm": 0.010977654717862606,
"learning_rate": 0.00019044313503914764,
"loss": 0.0002,
"step": 1970
},
{
"epoch": 0.2869596803487105,
"grad_norm": 0.022831691429018974,
"learning_rate": 0.00019036241827427556,
"loss": 0.0006,
"step": 1975
},
{
"epoch": 0.287686160552125,
"grad_norm": 0.026040196418762207,
"learning_rate": 0.0001902817015094035,
"loss": 0.0005,
"step": 1980
},
{
"epoch": 0.2884126407555394,
"grad_norm": 0.011391847394406796,
"learning_rate": 0.00019020098474453146,
"loss": 0.0004,
"step": 1985
},
{
"epoch": 0.2891391209589539,
"grad_norm": 0.013334060087800026,
"learning_rate": 0.00019012026797965938,
"loss": 0.0003,
"step": 1990
},
{
"epoch": 0.2898656011623683,
"grad_norm": 0.0060678147710859776,
"learning_rate": 0.0001900395512147873,
"loss": 0.0002,
"step": 1995
},
{
"epoch": 0.2905920813657828,
"grad_norm": 0.004468259867280722,
"learning_rate": 0.00018995883444991525,
"loss": 0.0002,
"step": 2000
},
{
"epoch": 0.2913185615691972,
"grad_norm": 0.0036872695200145245,
"learning_rate": 0.0001898781176850432,
"loss": 0.0001,
"step": 2005
},
{
"epoch": 0.2920450417726117,
"grad_norm": 0.0026169579941779375,
"learning_rate": 0.00018979740092017113,
"loss": 0.0001,
"step": 2010
},
{
"epoch": 0.29277152197602613,
"grad_norm": 0.0021394200157374144,
"learning_rate": 0.00018971668415529907,
"loss": 0.0001,
"step": 2015
},
{
"epoch": 0.2934980021794406,
"grad_norm": 0.0022201493848115206,
"learning_rate": 0.000189635967390427,
"loss": 0.0001,
"step": 2020
},
{
"epoch": 0.2942244823828551,
"grad_norm": 0.0021840811241418123,
"learning_rate": 0.00018955525062555495,
"loss": 0.0001,
"step": 2025
},
{
"epoch": 0.2949509625862695,
"grad_norm": 0.0016265831654891372,
"learning_rate": 0.00018947453386068287,
"loss": 0.0001,
"step": 2030
},
{
"epoch": 0.295677442789684,
"grad_norm": 0.0015095279086381197,
"learning_rate": 0.00018939381709581082,
"loss": 0.0001,
"step": 2035
},
{
"epoch": 0.2964039229930984,
"grad_norm": 0.0013007308589294553,
"learning_rate": 0.00018931310033093874,
"loss": 0.0,
"step": 2040
},
{
"epoch": 0.2971304031965129,
"grad_norm": 0.0011377567425370216,
"learning_rate": 0.00018923238356606666,
"loss": 0.0,
"step": 2045
},
{
"epoch": 0.29785688339992733,
"grad_norm": 0.0017277223523706198,
"learning_rate": 0.00018915166680119461,
"loss": 0.0,
"step": 2050
},
{
"epoch": 0.2985833636033418,
"grad_norm": 0.0009744380367919803,
"learning_rate": 0.00018907095003632256,
"loss": 0.0158,
"step": 2055
},
{
"epoch": 0.2993098438067563,
"grad_norm": 0.0012453808449208736,
"learning_rate": 0.0001889902332714505,
"loss": 0.0,
"step": 2060
},
{
"epoch": 0.3000363240101707,
"grad_norm": 0.05199315398931503,
"learning_rate": 0.0001889095165065784,
"loss": 0.0002,
"step": 2065
},
{
"epoch": 0.3007628042135852,
"grad_norm": 0.0018118784064427018,
"learning_rate": 0.00018882879974170636,
"loss": 0.0116,
"step": 2070
},
{
"epoch": 0.3014892844169996,
"grad_norm": 0.002479708520695567,
"learning_rate": 0.0001887480829768343,
"loss": 0.0096,
"step": 2075
},
{
"epoch": 0.3022157646204141,
"grad_norm": 0.001789470436051488,
"learning_rate": 0.00018866736621196223,
"loss": 0.0,
"step": 2080
},
{
"epoch": 0.30294224482382853,
"grad_norm": 1.2244577407836914,
"learning_rate": 0.00018858664944709018,
"loss": 0.002,
"step": 2085
},
{
"epoch": 0.303668725027243,
"grad_norm": 0.001510178786702454,
"learning_rate": 0.0001885059326822181,
"loss": 0.0001,
"step": 2090
},
{
"epoch": 0.30439520523065744,
"grad_norm": 0.0012227630941197276,
"learning_rate": 0.00018842521591734603,
"loss": 0.0007,
"step": 2095
},
{
"epoch": 0.3051216854340719,
"grad_norm": 6.986842155456543,
"learning_rate": 0.00018834449915247398,
"loss": 0.0041,
"step": 2100
},
{
"epoch": 0.3058481656374864,
"grad_norm": 0.0014463558327406645,
"learning_rate": 0.00018826378238760193,
"loss": 0.0006,
"step": 2105
},
{
"epoch": 0.30657464584090083,
"grad_norm": 0.0013261119602248073,
"learning_rate": 0.00018818306562272985,
"loss": 0.0058,
"step": 2110
},
{
"epoch": 0.3073011260443153,
"grad_norm": 0.0014859420480206609,
"learning_rate": 0.00018810234885785777,
"loss": 0.0,
"step": 2115
},
{
"epoch": 0.30802760624772973,
"grad_norm": 0.001101717702113092,
"learning_rate": 0.00018802163209298572,
"loss": 0.0,
"step": 2120
},
{
"epoch": 0.3087540864511442,
"grad_norm": 0.0022333369124680758,
"learning_rate": 0.00018794091532811367,
"loss": 0.0003,
"step": 2125
},
{
"epoch": 0.30948056665455864,
"grad_norm": 0.011202757246792316,
"learning_rate": 0.0001878601985632416,
"loss": 0.0001,
"step": 2130
},
{
"epoch": 0.3102070468579731,
"grad_norm": 0.001800977042876184,
"learning_rate": 0.00018777948179836952,
"loss": 0.0218,
"step": 2135
},
{
"epoch": 0.3109335270613876,
"grad_norm": 0.004161295481026173,
"learning_rate": 0.00018769876503349747,
"loss": 0.0002,
"step": 2140
},
{
"epoch": 0.31166000726480203,
"grad_norm": 0.0032398079056292772,
"learning_rate": 0.00018761804826862542,
"loss": 0.0006,
"step": 2145
},
{
"epoch": 0.3123864874682165,
"grad_norm": 0.04649796336889267,
"learning_rate": 0.00018753733150375334,
"loss": 0.0001,
"step": 2150
},
{
"epoch": 0.31311296767163094,
"grad_norm": 0.0010927373077720404,
"learning_rate": 0.0001874566147388813,
"loss": 0.0001,
"step": 2155
},
{
"epoch": 0.3138394478750454,
"grad_norm": 0.002848146017640829,
"learning_rate": 0.0001873758979740092,
"loss": 0.0001,
"step": 2160
},
{
"epoch": 0.31456592807845984,
"grad_norm": 0.001080561545677483,
"learning_rate": 0.00018729518120913713,
"loss": 0.0,
"step": 2165
},
{
"epoch": 0.3152924082818743,
"grad_norm": 0.0011905552819371223,
"learning_rate": 0.00018721446444426508,
"loss": 0.0,
"step": 2170
},
{
"epoch": 0.31601888848528875,
"grad_norm": 0.002129076048731804,
"learning_rate": 0.00018713374767939303,
"loss": 0.0001,
"step": 2175
},
{
"epoch": 0.31674536868870323,
"grad_norm": 0.0015021953731775284,
"learning_rate": 0.00018705303091452096,
"loss": 0.0,
"step": 2180
},
{
"epoch": 0.3174718488921177,
"grad_norm": 0.0011074721114709973,
"learning_rate": 0.00018697231414964888,
"loss": 0.0,
"step": 2185
},
{
"epoch": 0.31819832909553214,
"grad_norm": 0.0013954649912193418,
"learning_rate": 0.00018689159738477683,
"loss": 0.0,
"step": 2190
},
{
"epoch": 0.3189248092989466,
"grad_norm": 0.0008435107301920652,
"learning_rate": 0.00018681088061990478,
"loss": 0.0,
"step": 2195
},
{
"epoch": 0.31965128950236105,
"grad_norm": 0.0015673066955059767,
"learning_rate": 0.0001867301638550327,
"loss": 0.0001,
"step": 2200
},
{
"epoch": 0.32037776970577553,
"grad_norm": 0.0006937576690688729,
"learning_rate": 0.00018664944709016062,
"loss": 0.0,
"step": 2205
},
{
"epoch": 0.32110424990918995,
"grad_norm": 0.0006967806257307529,
"learning_rate": 0.00018656873032528857,
"loss": 0.0,
"step": 2210
},
{
"epoch": 0.32183073011260444,
"grad_norm": 0.0010916970204561949,
"learning_rate": 0.0001864880135604165,
"loss": 0.0,
"step": 2215
},
{
"epoch": 0.32255721031601886,
"grad_norm": 0.054137326776981354,
"learning_rate": 0.00018640729679554445,
"loss": 0.0001,
"step": 2220
},
{
"epoch": 0.32328369051943334,
"grad_norm": 0.0007835402502678335,
"learning_rate": 0.00018632658003067237,
"loss": 0.0,
"step": 2225
},
{
"epoch": 0.3240101707228478,
"grad_norm": 0.0006136346491985023,
"learning_rate": 0.00018624586326580032,
"loss": 0.0,
"step": 2230
},
{
"epoch": 0.32473665092626225,
"grad_norm": 0.0005693508428521454,
"learning_rate": 0.00018616514650092824,
"loss": 0.0,
"step": 2235
},
{
"epoch": 0.32546313112967673,
"grad_norm": 0.001010082894936204,
"learning_rate": 0.0001860844297360562,
"loss": 0.0,
"step": 2240
},
{
"epoch": 0.32618961133309116,
"grad_norm": 0.0006115248543210328,
"learning_rate": 0.00018600371297118414,
"loss": 0.0,
"step": 2245
},
{
"epoch": 0.32691609153650564,
"grad_norm": 0.005977267399430275,
"learning_rate": 0.00018592299620631206,
"loss": 0.0,
"step": 2250
},
{
"epoch": 0.32764257173992006,
"grad_norm": 0.0004075188480783254,
"learning_rate": 0.00018584227944143999,
"loss": 0.0,
"step": 2255
},
{
"epoch": 0.32836905194333454,
"grad_norm": 0.0005186618654988706,
"learning_rate": 0.00018576156267656794,
"loss": 0.0,
"step": 2260
},
{
"epoch": 0.329095532146749,
"grad_norm": 0.0005320632481016219,
"learning_rate": 0.00018568084591169589,
"loss": 0.0,
"step": 2265
},
{
"epoch": 0.32982201235016345,
"grad_norm": 0.029953761026263237,
"learning_rate": 0.0001856001291468238,
"loss": 0.0,
"step": 2270
},
{
"epoch": 0.33054849255357793,
"grad_norm": 0.0003188280388712883,
"learning_rate": 0.00018551941238195173,
"loss": 0.0,
"step": 2275
},
{
"epoch": 0.33127497275699236,
"grad_norm": 0.0004120226367376745,
"learning_rate": 0.00018543869561707968,
"loss": 0.0,
"step": 2280
},
{
"epoch": 0.33200145296040684,
"grad_norm": 0.0005906698643229902,
"learning_rate": 0.0001853579788522076,
"loss": 0.0,
"step": 2285
},
{
"epoch": 0.33272793316382127,
"grad_norm": 0.00045190524542704225,
"learning_rate": 0.00018527726208733555,
"loss": 0.0,
"step": 2290
},
{
"epoch": 0.33345441336723575,
"grad_norm": 0.0008185270125977695,
"learning_rate": 0.00018519654532246348,
"loss": 0.0,
"step": 2295
},
{
"epoch": 0.3341808935706502,
"grad_norm": 0.0003965144860558212,
"learning_rate": 0.00018511582855759143,
"loss": 0.0,
"step": 2300
},
{
"epoch": 0.33490737377406465,
"grad_norm": 0.0003858699928969145,
"learning_rate": 0.00018503511179271935,
"loss": 0.0,
"step": 2305
},
{
"epoch": 0.33563385397747914,
"grad_norm": 0.0005558038246817887,
"learning_rate": 0.00018495439502784727,
"loss": 0.0,
"step": 2310
},
{
"epoch": 0.33636033418089356,
"grad_norm": 0.00037957995664328337,
"learning_rate": 0.00018487367826297525,
"loss": 0.0,
"step": 2315
},
{
"epoch": 0.33708681438430804,
"grad_norm": 0.0003773049684241414,
"learning_rate": 0.00018479296149810317,
"loss": 0.0,
"step": 2320
},
{
"epoch": 0.33781329458772247,
"grad_norm": 0.0006691055023111403,
"learning_rate": 0.0001847122447332311,
"loss": 0.0,
"step": 2325
},
{
"epoch": 0.33853977479113695,
"grad_norm": 0.000681467994581908,
"learning_rate": 0.00018463152796835904,
"loss": 0.0,
"step": 2330
},
{
"epoch": 0.3392662549945514,
"grad_norm": 0.0005777952610515058,
"learning_rate": 0.000184550811203487,
"loss": 0.0,
"step": 2335
},
{
"epoch": 0.33999273519796586,
"grad_norm": 0.0005241065518930554,
"learning_rate": 0.00018447009443861492,
"loss": 0.0,
"step": 2340
},
{
"epoch": 0.34071921540138034,
"grad_norm": 0.00039175679557956755,
"learning_rate": 0.00018438937767374284,
"loss": 0.0,
"step": 2345
},
{
"epoch": 0.34144569560479476,
"grad_norm": 0.00041981766116805375,
"learning_rate": 0.0001843086609088708,
"loss": 0.0,
"step": 2350
},
{
"epoch": 0.34217217580820924,
"grad_norm": 0.000371248199371621,
"learning_rate": 0.0001842279441439987,
"loss": 0.0,
"step": 2355
},
{
"epoch": 0.34289865601162367,
"grad_norm": 0.00031778172706253827,
"learning_rate": 0.00018414722737912666,
"loss": 0.0,
"step": 2360
},
{
"epoch": 0.34362513621503815,
"grad_norm": 0.00029086892027407885,
"learning_rate": 0.00018406651061425458,
"loss": 0.0,
"step": 2365
},
{
"epoch": 0.3443516164184526,
"grad_norm": 0.0002902498235926032,
"learning_rate": 0.00018398579384938253,
"loss": 0.0,
"step": 2370
},
{
"epoch": 0.34507809662186706,
"grad_norm": 0.00040075520519167185,
"learning_rate": 0.00018390507708451045,
"loss": 0.0,
"step": 2375
},
{
"epoch": 0.3458045768252815,
"grad_norm": 0.00024263348313979805,
"learning_rate": 0.00018382436031963838,
"loss": 0.0,
"step": 2380
},
{
"epoch": 0.34653105702869597,
"grad_norm": 0.0003889152139890939,
"learning_rate": 0.00018374364355476635,
"loss": 0.0,
"step": 2385
},
{
"epoch": 0.34725753723211045,
"grad_norm": 0.00022724135487806052,
"learning_rate": 0.00018366292678989428,
"loss": 0.0,
"step": 2390
},
{
"epoch": 0.3479840174355249,
"grad_norm": 0.0003505950153339654,
"learning_rate": 0.0001835822100250222,
"loss": 0.0,
"step": 2395
},
{
"epoch": 0.34871049763893935,
"grad_norm": 0.27515658736228943,
"learning_rate": 0.00018350149326015015,
"loss": 0.0002,
"step": 2400
},
{
"epoch": 0.3494369778423538,
"grad_norm": 0.0003519939782563597,
"learning_rate": 0.00018342077649527807,
"loss": 0.0,
"step": 2405
},
{
"epoch": 0.35016345804576826,
"grad_norm": 0.00033144818735308945,
"learning_rate": 0.00018334005973040602,
"loss": 0.0,
"step": 2410
},
{
"epoch": 0.3508899382491827,
"grad_norm": 0.0003098642046097666,
"learning_rate": 0.00018325934296553394,
"loss": 0.0,
"step": 2415
},
{
"epoch": 0.35161641845259717,
"grad_norm": 0.0002381189988227561,
"learning_rate": 0.0001831786262006619,
"loss": 0.0,
"step": 2420
},
{
"epoch": 0.35234289865601165,
"grad_norm": 0.0007972380262799561,
"learning_rate": 0.00018309790943578982,
"loss": 0.0,
"step": 2425
},
{
"epoch": 0.3530693788594261,
"grad_norm": 0.0025481837801635265,
"learning_rate": 0.00018301719267091774,
"loss": 0.0,
"step": 2430
},
{
"epoch": 0.35379585906284056,
"grad_norm": 0.00035965273855254054,
"learning_rate": 0.0001829364759060457,
"loss": 0.0,
"step": 2435
},
{
"epoch": 0.354522339266255,
"grad_norm": 0.000297486170893535,
"learning_rate": 0.00018285575914117364,
"loss": 0.0,
"step": 2440
},
{
"epoch": 0.35524881946966946,
"grad_norm": 0.00028157353517599404,
"learning_rate": 0.00018277504237630156,
"loss": 0.0021,
"step": 2445
},
{
"epoch": 0.3559752996730839,
"grad_norm": 0.0003479410079307854,
"learning_rate": 0.00018269432561142948,
"loss": 0.0,
"step": 2450
},
{
"epoch": 0.35670177987649837,
"grad_norm": 0.002874035155400634,
"learning_rate": 0.00018261360884655746,
"loss": 0.0003,
"step": 2455
},
{
"epoch": 0.3574282600799128,
"grad_norm": 0.00015613746654707938,
"learning_rate": 0.00018253289208168538,
"loss": 0.0004,
"step": 2460
},
{
"epoch": 0.3581547402833273,
"grad_norm": 0.00013312845840118825,
"learning_rate": 0.0001824521753168133,
"loss": 0.0,
"step": 2465
},
{
"epoch": 0.35888122048674176,
"grad_norm": 0.0001981378736672923,
"learning_rate": 0.00018237145855194126,
"loss": 0.0,
"step": 2470
},
{
"epoch": 0.3596077006901562,
"grad_norm": 0.00027879534172825515,
"learning_rate": 0.00018229074178706918,
"loss": 0.0,
"step": 2475
},
{
"epoch": 0.36033418089357067,
"grad_norm": 0.00016323383897542953,
"learning_rate": 0.00018221002502219713,
"loss": 0.0,
"step": 2480
},
{
"epoch": 0.3610606610969851,
"grad_norm": 0.0005233317497186363,
"learning_rate": 0.00018212930825732505,
"loss": 0.0,
"step": 2485
},
{
"epoch": 0.3617871413003996,
"grad_norm": 0.00013268415932543576,
"learning_rate": 0.000182048591492453,
"loss": 0.0,
"step": 2490
},
{
"epoch": 0.362513621503814,
"grad_norm": 0.01259111799299717,
"learning_rate": 0.00018196787472758092,
"loss": 0.0001,
"step": 2495
},
{
"epoch": 0.3632401017072285,
"grad_norm": 0.00014725365326739848,
"learning_rate": 0.00018188715796270885,
"loss": 0.0,
"step": 2500
},
{
"epoch": 0.36396658191064296,
"grad_norm": 0.00021464233577717096,
"learning_rate": 0.0001818064411978368,
"loss": 0.0,
"step": 2505
},
{
"epoch": 0.3646930621140574,
"grad_norm": 0.00011434618500061333,
"learning_rate": 0.00018172572443296475,
"loss": 0.0,
"step": 2510
},
{
"epoch": 0.36541954231747187,
"grad_norm": 0.00012706074630841613,
"learning_rate": 0.00018164500766809267,
"loss": 0.0,
"step": 2515
},
{
"epoch": 0.3661460225208863,
"grad_norm": 0.00015453774540219456,
"learning_rate": 0.0001815642909032206,
"loss": 0.0,
"step": 2520
},
{
"epoch": 0.3668725027243008,
"grad_norm": 0.00014317889872472733,
"learning_rate": 0.00018148357413834854,
"loss": 0.0,
"step": 2525
},
{
"epoch": 0.3675989829277152,
"grad_norm": 0.00014966298476792872,
"learning_rate": 0.0001814028573734765,
"loss": 0.0,
"step": 2530
},
{
"epoch": 0.3683254631311297,
"grad_norm": 0.0001484445674577728,
"learning_rate": 0.00018132214060860441,
"loss": 0.0,
"step": 2535
},
{
"epoch": 0.3690519433345441,
"grad_norm": 0.00012702727690339088,
"learning_rate": 0.00018124142384373234,
"loss": 0.0,
"step": 2540
},
{
"epoch": 0.3697784235379586,
"grad_norm": 0.0001310681545874104,
"learning_rate": 0.00018116070707886029,
"loss": 0.0,
"step": 2545
},
{
"epoch": 0.37050490374137307,
"grad_norm": 0.0001544792321510613,
"learning_rate": 0.00018107999031398824,
"loss": 0.0,
"step": 2550
},
{
"epoch": 0.3712313839447875,
"grad_norm": 0.0003174188022967428,
"learning_rate": 0.00018099927354911616,
"loss": 0.0,
"step": 2555
},
{
"epoch": 0.371957864148202,
"grad_norm": 0.00012976166908629239,
"learning_rate": 0.0001809185567842441,
"loss": 0.0,
"step": 2560
},
{
"epoch": 0.3726843443516164,
"grad_norm": 0.00011333979637129232,
"learning_rate": 0.00018083784001937203,
"loss": 0.0,
"step": 2565
},
{
"epoch": 0.3734108245550309,
"grad_norm": 0.00014128838665783405,
"learning_rate": 0.00018075712325449995,
"loss": 0.0,
"step": 2570
},
{
"epoch": 0.3741373047584453,
"grad_norm": 9.816375677473843e-05,
"learning_rate": 0.0001806764064896279,
"loss": 0.0,
"step": 2575
},
{
"epoch": 0.3748637849618598,
"grad_norm": 0.00012458849232643843,
"learning_rate": 0.00018059568972475585,
"loss": 0.0,
"step": 2580
},
{
"epoch": 0.3755902651652743,
"grad_norm": 0.00011874383199028671,
"learning_rate": 0.00018051497295988378,
"loss": 0.0,
"step": 2585
},
{
"epoch": 0.3763167453686887,
"grad_norm": 0.00010492030560271814,
"learning_rate": 0.0001804342561950117,
"loss": 0.0,
"step": 2590
},
{
"epoch": 0.3770432255721032,
"grad_norm": 0.00012079241423634812,
"learning_rate": 0.00018035353943013965,
"loss": 0.0,
"step": 2595
},
{
"epoch": 0.3777697057755176,
"grad_norm": 0.0010301030706614256,
"learning_rate": 0.0001802728226652676,
"loss": 0.0,
"step": 2600
},
{
"epoch": 0.3784961859789321,
"grad_norm": 0.00020237726857885718,
"learning_rate": 0.00018019210590039552,
"loss": 0.0,
"step": 2605
},
{
"epoch": 0.3792226661823465,
"grad_norm": 0.00014590570935979486,
"learning_rate": 0.00018011138913552344,
"loss": 0.0,
"step": 2610
},
{
"epoch": 0.379949146385761,
"grad_norm": 0.00012144942593295127,
"learning_rate": 0.0001800306723706514,
"loss": 0.0,
"step": 2615
},
{
"epoch": 0.3806756265891754,
"grad_norm": 0.00011861774692079052,
"learning_rate": 0.00017994995560577932,
"loss": 0.0,
"step": 2620
},
{
"epoch": 0.3814021067925899,
"grad_norm": 0.0002795616746880114,
"learning_rate": 0.00017986923884090727,
"loss": 0.0,
"step": 2625
},
{
"epoch": 0.3821285869960044,
"grad_norm": 0.0001514231407782063,
"learning_rate": 0.00017978852207603522,
"loss": 0.0,
"step": 2630
},
{
"epoch": 0.3828550671994188,
"grad_norm": 0.000137203314807266,
"learning_rate": 0.00017970780531116314,
"loss": 0.0,
"step": 2635
},
{
"epoch": 0.3835815474028333,
"grad_norm": 0.00011654103582259268,
"learning_rate": 0.00017962708854629106,
"loss": 0.0,
"step": 2640
},
{
"epoch": 0.3843080276062477,
"grad_norm": 0.00011019224621122703,
"learning_rate": 0.000179546371781419,
"loss": 0.0,
"step": 2645
},
{
"epoch": 0.3850345078096622,
"grad_norm": 0.00011716793233063072,
"learning_rate": 0.00017946565501654696,
"loss": 0.0,
"step": 2650
},
{
"epoch": 0.3857609880130766,
"grad_norm": 0.00013133355241734535,
"learning_rate": 0.00017938493825167488,
"loss": 0.0,
"step": 2655
},
{
"epoch": 0.3864874682164911,
"grad_norm": 0.00010616648069117218,
"learning_rate": 0.0001793042214868028,
"loss": 0.0,
"step": 2660
},
{
"epoch": 0.3872139484199056,
"grad_norm": 0.00012793530186172575,
"learning_rate": 0.00017922350472193076,
"loss": 0.0,
"step": 2665
},
{
"epoch": 0.38794042862332,
"grad_norm": 0.00021880699205212295,
"learning_rate": 0.0001791427879570587,
"loss": 0.0,
"step": 2670
},
{
"epoch": 0.3886669088267345,
"grad_norm": 0.0321350060403347,
"learning_rate": 0.00017906207119218663,
"loss": 0.0,
"step": 2675
},
{
"epoch": 0.3893933890301489,
"grad_norm": 0.0001054102904163301,
"learning_rate": 0.00017898135442731455,
"loss": 0.0,
"step": 2680
},
{
"epoch": 0.3901198692335634,
"grad_norm": 0.00011370116408215836,
"learning_rate": 0.0001789006376624425,
"loss": 0.0,
"step": 2685
},
{
"epoch": 0.3908463494369778,
"grad_norm": 7.921565702417865e-05,
"learning_rate": 0.00017881992089757042,
"loss": 0.0,
"step": 2690
},
{
"epoch": 0.3915728296403923,
"grad_norm": 0.0001325017656199634,
"learning_rate": 0.00017873920413269837,
"loss": 0.0,
"step": 2695
},
{
"epoch": 0.39229930984380673,
"grad_norm": 0.00011485354480100796,
"learning_rate": 0.00017865848736782632,
"loss": 0.0,
"step": 2700
},
{
"epoch": 0.3930257900472212,
"grad_norm": 0.0001319620932918042,
"learning_rate": 0.00017857777060295424,
"loss": 0.0,
"step": 2705
},
{
"epoch": 0.3937522702506357,
"grad_norm": 0.00011554160300875083,
"learning_rate": 0.00017849705383808217,
"loss": 0.0,
"step": 2710
},
{
"epoch": 0.3944787504540501,
"grad_norm": 0.00011111667845398188,
"learning_rate": 0.00017841633707321012,
"loss": 0.0,
"step": 2715
},
{
"epoch": 0.3952052306574646,
"grad_norm": 0.00030816654907539487,
"learning_rate": 0.00017833562030833807,
"loss": 0.0,
"step": 2720
},
{
"epoch": 0.395931710860879,
"grad_norm": 0.00012618518667295575,
"learning_rate": 0.000178254903543466,
"loss": 0.0,
"step": 2725
},
{
"epoch": 0.3966581910642935,
"grad_norm": 0.00011036815703846514,
"learning_rate": 0.0001781741867785939,
"loss": 0.0,
"step": 2730
},
{
"epoch": 0.39738467126770793,
"grad_norm": 0.001136181759648025,
"learning_rate": 0.00017809347001372186,
"loss": 0.0,
"step": 2735
},
{
"epoch": 0.3981111514711224,
"grad_norm": 9.4526847533416e-05,
"learning_rate": 0.00017801275324884978,
"loss": 0.0,
"step": 2740
},
{
"epoch": 0.3988376316745369,
"grad_norm": 9.693180618342012e-05,
"learning_rate": 0.00017793203648397773,
"loss": 0.0,
"step": 2745
},
{
"epoch": 0.3995641118779513,
"grad_norm": 0.00013439155009109527,
"learning_rate": 0.00017785131971910566,
"loss": 0.0,
"step": 2750
},
{
"epoch": 0.4001452960406829,
"eval_accuracy": 1.0,
"eval_f1": 1.0,
"eval_loss": 8.966613904703991e-07,
"eval_precision": 1.0,
"eval_recall": 1.0,
"eval_runtime": 123.9354,
"eval_samples_per_second": 313.591,
"eval_steps_per_second": 2.453,
"step": 2754
}
],
"logging_steps": 5,
"max_steps": 13766,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 1377,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3618642193367040.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}