{ "best_global_step": 2754, "best_metric": 1.0, "best_model_checkpoint": "./results/wallet_deberta_v10/checkpoint-2754", "epoch": 0.4001452960406829, "eval_steps": 1377, "global_step": 2754, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000726480203414457, "grad_norm": 1.474841833114624, "learning_rate": 5.809731299927378e-07, "loss": 0.6929, "step": 5 }, { "epoch": 0.001452960406828914, "grad_norm": 0.7840715646743774, "learning_rate": 1.3071895424836602e-06, "loss": 0.6982, "step": 10 }, { "epoch": 0.002179440610243371, "grad_norm": 0.6173088550567627, "learning_rate": 2.0334059549745822e-06, "loss": 0.6906, "step": 15 }, { "epoch": 0.002905920813657828, "grad_norm": 1.7541619539260864, "learning_rate": 2.759622367465505e-06, "loss": 0.6917, "step": 20 }, { "epoch": 0.003632401017072285, "grad_norm": 1.0843828916549683, "learning_rate": 3.4858387799564276e-06, "loss": 0.6885, "step": 25 }, { "epoch": 0.004358881220486742, "grad_norm": 0.6747203469276428, "learning_rate": 4.212055192447349e-06, "loss": 0.6854, "step": 30 }, { "epoch": 0.005085361423901199, "grad_norm": 2.215040445327759, "learning_rate": 4.938271604938272e-06, "loss": 0.6895, "step": 35 }, { "epoch": 0.005811841627315656, "grad_norm": 1.2605243921279907, "learning_rate": 5.664488017429194e-06, "loss": 0.6914, "step": 40 }, { "epoch": 0.006538321830730113, "grad_norm": 1.5572800636291504, "learning_rate": 6.390704429920116e-06, "loss": 0.6913, "step": 45 }, { "epoch": 0.00726480203414457, "grad_norm": 0.5582659244537354, "learning_rate": 7.116920842411038e-06, "loss": 0.7015, "step": 50 }, { "epoch": 0.007991282237559027, "grad_norm": 2.64322829246521, "learning_rate": 7.84313725490196e-06, "loss": 0.6883, "step": 55 }, { "epoch": 0.008717762440973484, "grad_norm": 0.4942118525505066, "learning_rate": 8.569353667392883e-06, "loss": 0.6863, "step": 60 }, { "epoch": 0.00944424264438794, "grad_norm": 1.1258032321929932, "learning_rate": 9.295570079883805e-06, "loss": 0.6999, "step": 65 }, { "epoch": 0.010170722847802398, "grad_norm": 1.727752923965454, "learning_rate": 1.0021786492374727e-05, "loss": 0.6932, "step": 70 }, { "epoch": 0.010897203051216855, "grad_norm": 1.4421207904815674, "learning_rate": 1.0748002904865651e-05, "loss": 0.6898, "step": 75 }, { "epoch": 0.011623683254631312, "grad_norm": 1.5667537450790405, "learning_rate": 1.1474219317356574e-05, "loss": 0.6965, "step": 80 }, { "epoch": 0.012350163458045769, "grad_norm": 0.44926533102989197, "learning_rate": 1.2200435729847496e-05, "loss": 0.6929, "step": 85 }, { "epoch": 0.013076643661460226, "grad_norm": 0.425881028175354, "learning_rate": 1.2926652142338416e-05, "loss": 0.6907, "step": 90 }, { "epoch": 0.013803123864874683, "grad_norm": 0.4951478838920593, "learning_rate": 1.365286855482934e-05, "loss": 0.6993, "step": 95 }, { "epoch": 0.01452960406828914, "grad_norm": 0.3694448471069336, "learning_rate": 1.4379084967320261e-05, "loss": 0.6831, "step": 100 }, { "epoch": 0.015256084271703597, "grad_norm": 1.17753005027771, "learning_rate": 1.5105301379811185e-05, "loss": 0.68, "step": 105 }, { "epoch": 0.015982564475118054, "grad_norm": 0.611096978187561, "learning_rate": 1.5831517792302105e-05, "loss": 0.6921, "step": 110 }, { "epoch": 0.01670904467853251, "grad_norm": 1.563934087753296, "learning_rate": 1.655773420479303e-05, "loss": 0.6865, "step": 115 }, { "epoch": 0.017435524881946968, "grad_norm": 0.928711473941803, "learning_rate": 1.728395061728395e-05, "loss": 0.6861, "step": 120 }, { "epoch": 0.018162005085361425, "grad_norm": 1.1964377164840698, "learning_rate": 1.8010167029774874e-05, "loss": 0.7021, "step": 125 }, { "epoch": 0.01888848528877588, "grad_norm": 0.3896273970603943, "learning_rate": 1.8736383442265794e-05, "loss": 0.6918, "step": 130 }, { "epoch": 0.01961496549219034, "grad_norm": 0.4799005091190338, "learning_rate": 1.946259985475672e-05, "loss": 0.6954, "step": 135 }, { "epoch": 0.020341445695604796, "grad_norm": 0.6113623380661011, "learning_rate": 2.018881626724764e-05, "loss": 0.6886, "step": 140 }, { "epoch": 0.021067925899019253, "grad_norm": 0.6965861320495605, "learning_rate": 2.0915032679738563e-05, "loss": 0.6814, "step": 145 }, { "epoch": 0.02179440610243371, "grad_norm": 0.46387240290641785, "learning_rate": 2.1641249092229483e-05, "loss": 0.681, "step": 150 }, { "epoch": 0.022520886305848167, "grad_norm": 0.4296594560146332, "learning_rate": 2.2367465504720407e-05, "loss": 0.6853, "step": 155 }, { "epoch": 0.023247366509262624, "grad_norm": 0.8854900002479553, "learning_rate": 2.3093681917211328e-05, "loss": 0.6784, "step": 160 }, { "epoch": 0.02397384671267708, "grad_norm": 0.7150282263755798, "learning_rate": 2.3819898329702252e-05, "loss": 0.6774, "step": 165 }, { "epoch": 0.024700326916091538, "grad_norm": 0.8739128708839417, "learning_rate": 2.4546114742193176e-05, "loss": 0.6662, "step": 170 }, { "epoch": 0.025426807119505995, "grad_norm": 0.7919905781745911, "learning_rate": 2.5272331154684096e-05, "loss": 0.6601, "step": 175 }, { "epoch": 0.02615328732292045, "grad_norm": 0.6220109462738037, "learning_rate": 2.599854756717502e-05, "loss": 0.649, "step": 180 }, { "epoch": 0.02687976752633491, "grad_norm": 0.8708673715591431, "learning_rate": 2.672476397966594e-05, "loss": 0.6318, "step": 185 }, { "epoch": 0.027606247729749366, "grad_norm": 1.0253198146820068, "learning_rate": 2.7450980392156865e-05, "loss": 0.6025, "step": 190 }, { "epoch": 0.028332727933163823, "grad_norm": 1.1449552774429321, "learning_rate": 2.817719680464779e-05, "loss": 0.5564, "step": 195 }, { "epoch": 0.02905920813657828, "grad_norm": 3.2159643173217773, "learning_rate": 2.890341321713871e-05, "loss": 0.515, "step": 200 }, { "epoch": 0.029785688339992736, "grad_norm": 1.912434458732605, "learning_rate": 2.962962962962963e-05, "loss": 0.4635, "step": 205 }, { "epoch": 0.030512168543407193, "grad_norm": 1.9808599948883057, "learning_rate": 3.0355846042120557e-05, "loss": 0.4233, "step": 210 }, { "epoch": 0.03123864874682165, "grad_norm": 1.929961085319519, "learning_rate": 3.1082062454611474e-05, "loss": 0.3505, "step": 215 }, { "epoch": 0.03196512895023611, "grad_norm": 2.4213836193084717, "learning_rate": 3.1808278867102395e-05, "loss": 0.3079, "step": 220 }, { "epoch": 0.03269160915365056, "grad_norm": 8.767487525939941, "learning_rate": 3.2534495279593316e-05, "loss": 0.2805, "step": 225 }, { "epoch": 0.03341808935706502, "grad_norm": 6.868554592132568, "learning_rate": 3.326071169208424e-05, "loss": 0.2501, "step": 230 }, { "epoch": 0.034144569560479475, "grad_norm": 10.502647399902344, "learning_rate": 3.3986928104575163e-05, "loss": 0.2505, "step": 235 }, { "epoch": 0.034871049763893935, "grad_norm": 2.8313727378845215, "learning_rate": 3.471314451706609e-05, "loss": 0.2265, "step": 240 }, { "epoch": 0.03559752996730839, "grad_norm": 4.173934459686279, "learning_rate": 3.543936092955701e-05, "loss": 0.2045, "step": 245 }, { "epoch": 0.03632401017072285, "grad_norm": 1.775830626487732, "learning_rate": 3.616557734204793e-05, "loss": 0.1584, "step": 250 }, { "epoch": 0.0370504903741373, "grad_norm": 3.129055976867676, "learning_rate": 3.689179375453886e-05, "loss": 0.131, "step": 255 }, { "epoch": 0.03777697057755176, "grad_norm": 4.053362846374512, "learning_rate": 3.761801016702978e-05, "loss": 0.1333, "step": 260 }, { "epoch": 0.03850345078096622, "grad_norm": 3.1923694610595703, "learning_rate": 3.83442265795207e-05, "loss": 0.1734, "step": 265 }, { "epoch": 0.03922993098438068, "grad_norm": 4.751387119293213, "learning_rate": 3.907044299201162e-05, "loss": 0.1296, "step": 270 }, { "epoch": 0.03995641118779513, "grad_norm": 1.9991039037704468, "learning_rate": 3.979665940450255e-05, "loss": 0.1297, "step": 275 }, { "epoch": 0.04068289139120959, "grad_norm": 5.077785015106201, "learning_rate": 4.052287581699347e-05, "loss": 0.1127, "step": 280 }, { "epoch": 0.041409371594624045, "grad_norm": 3.6270077228546143, "learning_rate": 4.124909222948439e-05, "loss": 0.1125, "step": 285 }, { "epoch": 0.042135851798038505, "grad_norm": 1.017204999923706, "learning_rate": 4.197530864197531e-05, "loss": 0.0802, "step": 290 }, { "epoch": 0.04286233200145296, "grad_norm": 1.492018222808838, "learning_rate": 4.270152505446624e-05, "loss": 0.1008, "step": 295 }, { "epoch": 0.04358881220486742, "grad_norm": 1.7372925281524658, "learning_rate": 4.342774146695716e-05, "loss": 0.1172, "step": 300 }, { "epoch": 0.04431529240828187, "grad_norm": 2.817929983139038, "learning_rate": 4.415395787944808e-05, "loss": 0.1096, "step": 305 }, { "epoch": 0.04504177261169633, "grad_norm": 0.9688124656677246, "learning_rate": 4.4880174291939e-05, "loss": 0.0663, "step": 310 }, { "epoch": 0.04576825281511079, "grad_norm": 3.9759960174560547, "learning_rate": 4.5606390704429926e-05, "loss": 0.0706, "step": 315 }, { "epoch": 0.04649473301852525, "grad_norm": 5.014834880828857, "learning_rate": 4.633260711692085e-05, "loss": 0.0681, "step": 320 }, { "epoch": 0.0472212132219397, "grad_norm": 3.1871182918548584, "learning_rate": 4.705882352941177e-05, "loss": 0.0934, "step": 325 }, { "epoch": 0.04794769342535416, "grad_norm": 5.145167350769043, "learning_rate": 4.778503994190269e-05, "loss": 0.0777, "step": 330 }, { "epoch": 0.048674173628768615, "grad_norm": 6.0013275146484375, "learning_rate": 4.8511256354393615e-05, "loss": 0.0825, "step": 335 }, { "epoch": 0.049400653832183075, "grad_norm": 0.9712790250778198, "learning_rate": 4.9237472766884536e-05, "loss": 0.0417, "step": 340 }, { "epoch": 0.05012713403559753, "grad_norm": 1.4921551942825317, "learning_rate": 4.9963689179375456e-05, "loss": 0.0316, "step": 345 }, { "epoch": 0.05085361423901199, "grad_norm": 4.556818008422852, "learning_rate": 5.0689905591866384e-05, "loss": 0.0592, "step": 350 }, { "epoch": 0.05158009444242644, "grad_norm": 2.8250820636749268, "learning_rate": 5.1416122004357304e-05, "loss": 0.0591, "step": 355 }, { "epoch": 0.0523065746458409, "grad_norm": 2.345460891723633, "learning_rate": 5.2142338416848225e-05, "loss": 0.073, "step": 360 }, { "epoch": 0.05303305484925536, "grad_norm": 0.44890037178993225, "learning_rate": 5.2868554829339145e-05, "loss": 0.057, "step": 365 }, { "epoch": 0.05375953505266982, "grad_norm": 1.1340672969818115, "learning_rate": 5.3594771241830066e-05, "loss": 0.0466, "step": 370 }, { "epoch": 0.05448601525608427, "grad_norm": 0.7642996311187744, "learning_rate": 5.4320987654320986e-05, "loss": 0.0437, "step": 375 }, { "epoch": 0.05521249545949873, "grad_norm": 4.866988658905029, "learning_rate": 5.504720406681191e-05, "loss": 0.0654, "step": 380 }, { "epoch": 0.055938975662913185, "grad_norm": 0.9396504163742065, "learning_rate": 5.577342047930284e-05, "loss": 0.0184, "step": 385 }, { "epoch": 0.056665455866327645, "grad_norm": 0.5176196098327637, "learning_rate": 5.649963689179376e-05, "loss": 0.0208, "step": 390 }, { "epoch": 0.0573919360697421, "grad_norm": 4.328392028808594, "learning_rate": 5.722585330428468e-05, "loss": 0.0186, "step": 395 }, { "epoch": 0.05811841627315656, "grad_norm": 6.72576379776001, "learning_rate": 5.79520697167756e-05, "loss": 0.0311, "step": 400 }, { "epoch": 0.05884489647657101, "grad_norm": 10.39561653137207, "learning_rate": 5.8678286129266523e-05, "loss": 0.0453, "step": 405 }, { "epoch": 0.05957137667998547, "grad_norm": 8.84882926940918, "learning_rate": 5.9404502541757444e-05, "loss": 0.0821, "step": 410 }, { "epoch": 0.060297856883399926, "grad_norm": 0.5676841735839844, "learning_rate": 6.0130718954248365e-05, "loss": 0.1021, "step": 415 }, { "epoch": 0.06102433708681439, "grad_norm": 3.4484715461730957, "learning_rate": 6.0856935366739285e-05, "loss": 0.0397, "step": 420 }, { "epoch": 0.06175081729022884, "grad_norm": 8.414154052734375, "learning_rate": 6.158315177923021e-05, "loss": 0.0864, "step": 425 }, { "epoch": 0.0624772974936433, "grad_norm": 5.502734184265137, "learning_rate": 6.230936819172115e-05, "loss": 0.0313, "step": 430 }, { "epoch": 0.06320377769705776, "grad_norm": 6.950675964355469, "learning_rate": 6.303558460421207e-05, "loss": 0.0271, "step": 435 }, { "epoch": 0.06393025790047221, "grad_norm": 2.7828145027160645, "learning_rate": 6.376180101670299e-05, "loss": 0.016, "step": 440 }, { "epoch": 0.06465673810388667, "grad_norm": 4.585832118988037, "learning_rate": 6.448801742919391e-05, "loss": 0.0402, "step": 445 }, { "epoch": 0.06538321830730112, "grad_norm": 5.096743106842041, "learning_rate": 6.521423384168483e-05, "loss": 0.0719, "step": 450 }, { "epoch": 0.06610969851071559, "grad_norm": 5.883689880371094, "learning_rate": 6.594045025417575e-05, "loss": 0.0691, "step": 455 }, { "epoch": 0.06683617871413004, "grad_norm": 1.7454990148544312, "learning_rate": 6.666666666666667e-05, "loss": 0.0504, "step": 460 }, { "epoch": 0.0675626589175445, "grad_norm": 2.231943368911743, "learning_rate": 6.739288307915759e-05, "loss": 0.0184, "step": 465 }, { "epoch": 0.06828913912095895, "grad_norm": 4.1820268630981445, "learning_rate": 6.811909949164852e-05, "loss": 0.025, "step": 470 }, { "epoch": 0.06901561932437342, "grad_norm": 0.06752662360668182, "learning_rate": 6.884531590413945e-05, "loss": 0.0061, "step": 475 }, { "epoch": 0.06974209952778787, "grad_norm": 0.034968651831150055, "learning_rate": 6.957153231663037e-05, "loss": 0.0246, "step": 480 }, { "epoch": 0.07046857973120232, "grad_norm": 4.133062839508057, "learning_rate": 7.029774872912129e-05, "loss": 0.0483, "step": 485 }, { "epoch": 0.07119505993461678, "grad_norm": 0.14520829916000366, "learning_rate": 7.10239651416122e-05, "loss": 0.0242, "step": 490 }, { "epoch": 0.07192154013803125, "grad_norm": 0.08248770982027054, "learning_rate": 7.175018155410313e-05, "loss": 0.0389, "step": 495 }, { "epoch": 0.0726480203414457, "grad_norm": 0.09677606076002121, "learning_rate": 7.247639796659405e-05, "loss": 0.0813, "step": 500 }, { "epoch": 0.07337450054486015, "grad_norm": 2.2317094802856445, "learning_rate": 7.320261437908497e-05, "loss": 0.0425, "step": 505 }, { "epoch": 0.0741009807482746, "grad_norm": 0.9524332284927368, "learning_rate": 7.39288307915759e-05, "loss": 0.0165, "step": 510 }, { "epoch": 0.07482746095168907, "grad_norm": 1.2688440084457397, "learning_rate": 7.465504720406682e-05, "loss": 0.0376, "step": 515 }, { "epoch": 0.07555394115510353, "grad_norm": 0.5410459637641907, "learning_rate": 7.538126361655774e-05, "loss": 0.0132, "step": 520 }, { "epoch": 0.07628042135851798, "grad_norm": 1.0646350383758545, "learning_rate": 7.610748002904866e-05, "loss": 0.0357, "step": 525 }, { "epoch": 0.07700690156193243, "grad_norm": 0.05422890931367874, "learning_rate": 7.683369644153958e-05, "loss": 0.0024, "step": 530 }, { "epoch": 0.0777333817653469, "grad_norm": 1.7686655521392822, "learning_rate": 7.75599128540305e-05, "loss": 0.011, "step": 535 }, { "epoch": 0.07845986196876135, "grad_norm": 1.7055928707122803, "learning_rate": 7.828612926652143e-05, "loss": 0.0283, "step": 540 }, { "epoch": 0.07918634217217581, "grad_norm": 7.1870245933532715, "learning_rate": 7.901234567901235e-05, "loss": 0.0255, "step": 545 }, { "epoch": 0.07991282237559026, "grad_norm": 4.143937110900879, "learning_rate": 7.973856209150328e-05, "loss": 0.0163, "step": 550 }, { "epoch": 0.08063930257900472, "grad_norm": 2.7253036499023438, "learning_rate": 8.04647785039942e-05, "loss": 0.0356, "step": 555 }, { "epoch": 0.08136578278241918, "grad_norm": 0.1260932832956314, "learning_rate": 8.119099491648512e-05, "loss": 0.0897, "step": 560 }, { "epoch": 0.08209226298583364, "grad_norm": 0.8739075064659119, "learning_rate": 8.191721132897604e-05, "loss": 0.0212, "step": 565 }, { "epoch": 0.08281874318924809, "grad_norm": 0.07976645231246948, "learning_rate": 8.264342774146696e-05, "loss": 0.0202, "step": 570 }, { "epoch": 0.08354522339266254, "grad_norm": 3.089498996734619, "learning_rate": 8.336964415395788e-05, "loss": 0.0288, "step": 575 }, { "epoch": 0.08427170359607701, "grad_norm": 1.1282787322998047, "learning_rate": 8.40958605664488e-05, "loss": 0.0236, "step": 580 }, { "epoch": 0.08499818379949146, "grad_norm": 2.50753116607666, "learning_rate": 8.482207697893972e-05, "loss": 0.0491, "step": 585 }, { "epoch": 0.08572466400290592, "grad_norm": 15.398341178894043, "learning_rate": 8.554829339143065e-05, "loss": 0.0541, "step": 590 }, { "epoch": 0.08645114420632037, "grad_norm": 0.3026963174343109, "learning_rate": 8.627450980392158e-05, "loss": 0.0053, "step": 595 }, { "epoch": 0.08717762440973484, "grad_norm": 0.12404945492744446, "learning_rate": 8.70007262164125e-05, "loss": 0.0404, "step": 600 }, { "epoch": 0.08790410461314929, "grad_norm": 0.9239891767501831, "learning_rate": 8.772694262890342e-05, "loss": 0.0221, "step": 605 }, { "epoch": 0.08863058481656375, "grad_norm": 1.404173493385315, "learning_rate": 8.845315904139434e-05, "loss": 0.0122, "step": 610 }, { "epoch": 0.0893570650199782, "grad_norm": 3.049877405166626, "learning_rate": 8.917937545388526e-05, "loss": 0.02, "step": 615 }, { "epoch": 0.09008354522339267, "grad_norm": 0.6122508645057678, "learning_rate": 8.990559186637618e-05, "loss": 0.0191, "step": 620 }, { "epoch": 0.09081002542680712, "grad_norm": 0.021131640300154686, "learning_rate": 9.06318082788671e-05, "loss": 0.0257, "step": 625 }, { "epoch": 0.09153650563022157, "grad_norm": 1.1997209787368774, "learning_rate": 9.135802469135802e-05, "loss": 0.0067, "step": 630 }, { "epoch": 0.09226298583363603, "grad_norm": 2.1676833629608154, "learning_rate": 9.208424110384896e-05, "loss": 0.0078, "step": 635 }, { "epoch": 0.0929894660370505, "grad_norm": 0.29366588592529297, "learning_rate": 9.281045751633988e-05, "loss": 0.0052, "step": 640 }, { "epoch": 0.09371594624046495, "grad_norm": 0.6021141409873962, "learning_rate": 9.35366739288308e-05, "loss": 0.0147, "step": 645 }, { "epoch": 0.0944424264438794, "grad_norm": 0.05590349808335304, "learning_rate": 9.426289034132172e-05, "loss": 0.0041, "step": 650 }, { "epoch": 0.09516890664729385, "grad_norm": 0.010648532770574093, "learning_rate": 9.498910675381264e-05, "loss": 0.0004, "step": 655 }, { "epoch": 0.09589538685070832, "grad_norm": 0.6512329578399658, "learning_rate": 9.571532316630356e-05, "loss": 0.0057, "step": 660 }, { "epoch": 0.09662186705412278, "grad_norm": 0.040556080639362335, "learning_rate": 9.644153957879448e-05, "loss": 0.0006, "step": 665 }, { "epoch": 0.09734834725753723, "grad_norm": 0.03349559009075165, "learning_rate": 9.71677559912854e-05, "loss": 0.0025, "step": 670 }, { "epoch": 0.09807482746095168, "grad_norm": 0.22619083523750305, "learning_rate": 9.789397240377634e-05, "loss": 0.0008, "step": 675 }, { "epoch": 0.09880130766436615, "grad_norm": 0.005620414856821299, "learning_rate": 9.862018881626726e-05, "loss": 0.0004, "step": 680 }, { "epoch": 0.0995277878677806, "grad_norm": 0.05560583993792534, "learning_rate": 9.934640522875818e-05, "loss": 0.0115, "step": 685 }, { "epoch": 0.10025426807119506, "grad_norm": 0.003737515537068248, "learning_rate": 0.00010007262164124908, "loss": 0.006, "step": 690 }, { "epoch": 0.10098074827460951, "grad_norm": 4.636546611785889, "learning_rate": 0.00010079883805374002, "loss": 0.0039, "step": 695 }, { "epoch": 0.10170722847802398, "grad_norm": 0.0036786955315619707, "learning_rate": 0.00010152505446623095, "loss": 0.0193, "step": 700 }, { "epoch": 0.10243370868143843, "grad_norm": 1.0555495023727417, "learning_rate": 0.00010225127087872186, "loss": 0.024, "step": 705 }, { "epoch": 0.10316018888485289, "grad_norm": 0.32569730281829834, "learning_rate": 0.00010297748729121279, "loss": 0.0168, "step": 710 }, { "epoch": 0.10388666908826734, "grad_norm": 0.6908342242240906, "learning_rate": 0.0001037037037037037, "loss": 0.0359, "step": 715 }, { "epoch": 0.1046131492916818, "grad_norm": 0.044849053025245667, "learning_rate": 0.00010442992011619463, "loss": 0.0009, "step": 720 }, { "epoch": 0.10533962949509626, "grad_norm": 0.011313475668430328, "learning_rate": 0.00010515613652868554, "loss": 0.0003, "step": 725 }, { "epoch": 0.10606610969851071, "grad_norm": 0.011058060452342033, "learning_rate": 0.00010588235294117647, "loss": 0.0047, "step": 730 }, { "epoch": 0.10679258990192517, "grad_norm": 0.013103635981678963, "learning_rate": 0.00010660856935366741, "loss": 0.0005, "step": 735 }, { "epoch": 0.10751907010533963, "grad_norm": 0.0076889158226549625, "learning_rate": 0.00010733478576615832, "loss": 0.0122, "step": 740 }, { "epoch": 0.10824555030875409, "grad_norm": 0.6621626019477844, "learning_rate": 0.00010806100217864925, "loss": 0.0027, "step": 745 }, { "epoch": 0.10897203051216854, "grad_norm": 1.037239670753479, "learning_rate": 0.00010878721859114016, "loss": 0.0385, "step": 750 }, { "epoch": 0.109698510715583, "grad_norm": 0.11858850717544556, "learning_rate": 0.00010951343500363109, "loss": 0.0522, "step": 755 }, { "epoch": 0.11042499091899746, "grad_norm": 3.902498245239258, "learning_rate": 0.000110239651416122, "loss": 0.0037, "step": 760 }, { "epoch": 0.11115147112241192, "grad_norm": 0.03190886229276657, "learning_rate": 0.00011096586782861293, "loss": 0.0092, "step": 765 }, { "epoch": 0.11187795132582637, "grad_norm": 0.028368664905428886, "learning_rate": 0.00011169208424110384, "loss": 0.0011, "step": 770 }, { "epoch": 0.11260443152924082, "grad_norm": 1.0788954496383667, "learning_rate": 0.00011241830065359477, "loss": 0.0205, "step": 775 }, { "epoch": 0.11333091173265529, "grad_norm": 0.10793304443359375, "learning_rate": 0.00011314451706608571, "loss": 0.0128, "step": 780 }, { "epoch": 0.11405739193606974, "grad_norm": 1.4622502326965332, "learning_rate": 0.00011387073347857661, "loss": 0.0525, "step": 785 }, { "epoch": 0.1147838721394842, "grad_norm": 0.43396472930908203, "learning_rate": 0.00011459694989106755, "loss": 0.0016, "step": 790 }, { "epoch": 0.11551035234289865, "grad_norm": 2.9861035346984863, "learning_rate": 0.00011532316630355846, "loss": 0.0138, "step": 795 }, { "epoch": 0.11623683254631312, "grad_norm": 14.586094856262207, "learning_rate": 0.00011604938271604939, "loss": 0.1007, "step": 800 }, { "epoch": 0.11696331274972757, "grad_norm": 0.014536268077790737, "learning_rate": 0.0001167755991285403, "loss": 0.0147, "step": 805 }, { "epoch": 0.11768979295314203, "grad_norm": 0.05354047194123268, "learning_rate": 0.00011750181554103123, "loss": 0.0176, "step": 810 }, { "epoch": 0.11841627315655648, "grad_norm": 0.3078368902206421, "learning_rate": 0.00011822803195352215, "loss": 0.0049, "step": 815 }, { "epoch": 0.11914275335997095, "grad_norm": 0.011014469899237156, "learning_rate": 0.00011895424836601307, "loss": 0.0035, "step": 820 }, { "epoch": 0.1198692335633854, "grad_norm": 0.5486952662467957, "learning_rate": 0.000119680464778504, "loss": 0.0168, "step": 825 }, { "epoch": 0.12059571376679985, "grad_norm": 0.02629510499536991, "learning_rate": 0.00012040668119099491, "loss": 0.001, "step": 830 }, { "epoch": 0.1213221939702143, "grad_norm": 0.019840385764837265, "learning_rate": 0.00012113289760348585, "loss": 0.0037, "step": 835 }, { "epoch": 0.12204867417362877, "grad_norm": 1.3649095296859741, "learning_rate": 0.00012185911401597675, "loss": 0.0133, "step": 840 }, { "epoch": 0.12277515437704323, "grad_norm": 0.025183813646435738, "learning_rate": 0.0001225853304284677, "loss": 0.0006, "step": 845 }, { "epoch": 0.12350163458045768, "grad_norm": 0.07554338127374649, "learning_rate": 0.0001233115468409586, "loss": 0.0041, "step": 850 }, { "epoch": 0.12422811478387213, "grad_norm": 0.04600398242473602, "learning_rate": 0.00012403776325344953, "loss": 0.0022, "step": 855 }, { "epoch": 0.1249545949872866, "grad_norm": 4.709814548492432, "learning_rate": 0.00012476397966594048, "loss": 0.0037, "step": 860 }, { "epoch": 0.12568107519070104, "grad_norm": 0.020981954410672188, "learning_rate": 0.00012549019607843137, "loss": 0.0121, "step": 865 }, { "epoch": 0.12640755539411552, "grad_norm": 0.3170248866081238, "learning_rate": 0.00012621641249092232, "loss": 0.0027, "step": 870 }, { "epoch": 0.12713403559752998, "grad_norm": 0.01148161105811596, "learning_rate": 0.0001269426289034132, "loss": 0.0005, "step": 875 }, { "epoch": 0.12786051580094443, "grad_norm": 0.005348953418433666, "learning_rate": 0.00012766884531590416, "loss": 0.0002, "step": 880 }, { "epoch": 0.12858699600435888, "grad_norm": 3.101860761642456, "learning_rate": 0.00012839506172839505, "loss": 0.0038, "step": 885 }, { "epoch": 0.12931347620777334, "grad_norm": 2.680506706237793, "learning_rate": 0.000129121278140886, "loss": 0.0472, "step": 890 }, { "epoch": 0.1300399564111878, "grad_norm": 22.780397415161133, "learning_rate": 0.0001298474945533769, "loss": 0.0232, "step": 895 }, { "epoch": 0.13076643661460224, "grad_norm": 0.08615617454051971, "learning_rate": 0.00013057371096586784, "loss": 0.0278, "step": 900 }, { "epoch": 0.1314929168180167, "grad_norm": 0.4959210455417633, "learning_rate": 0.00013129992737835876, "loss": 0.0007, "step": 905 }, { "epoch": 0.13221939702143118, "grad_norm": 0.0067051006481051445, "learning_rate": 0.00013202614379084968, "loss": 0.0167, "step": 910 }, { "epoch": 0.13294587722484563, "grad_norm": 10.88768482208252, "learning_rate": 0.0001327523602033406, "loss": 0.0682, "step": 915 }, { "epoch": 0.13367235742826009, "grad_norm": 0.007390011567622423, "learning_rate": 0.00013347857661583152, "loss": 0.0003, "step": 920 }, { "epoch": 0.13439883763167454, "grad_norm": 0.12825822830200195, "learning_rate": 0.00013420479302832244, "loss": 0.0244, "step": 925 }, { "epoch": 0.135125317835089, "grad_norm": 0.8949776291847229, "learning_rate": 0.00013493100944081336, "loss": 0.0214, "step": 930 }, { "epoch": 0.13585179803850345, "grad_norm": 0.007870912551879883, "learning_rate": 0.00013565722585330429, "loss": 0.0004, "step": 935 }, { "epoch": 0.1365782782419179, "grad_norm": 0.013382726348936558, "learning_rate": 0.0001363834422657952, "loss": 0.0004, "step": 940 }, { "epoch": 0.13730475844533235, "grad_norm": 0.037289395928382874, "learning_rate": 0.00013710965867828613, "loss": 0.0012, "step": 945 }, { "epoch": 0.13803123864874683, "grad_norm": 0.9494091272354126, "learning_rate": 0.00013783587509077707, "loss": 0.0249, "step": 950 }, { "epoch": 0.1387577188521613, "grad_norm": 2.1269211769104004, "learning_rate": 0.00013856209150326797, "loss": 0.0041, "step": 955 }, { "epoch": 0.13948419905557574, "grad_norm": 0.03475005179643631, "learning_rate": 0.00013928830791575892, "loss": 0.0039, "step": 960 }, { "epoch": 0.1402106792589902, "grad_norm": 0.778325080871582, "learning_rate": 0.0001400145243282498, "loss": 0.0044, "step": 965 }, { "epoch": 0.14093715946240465, "grad_norm": 0.06391960382461548, "learning_rate": 0.00014074074074074076, "loss": 0.0011, "step": 970 }, { "epoch": 0.1416636396658191, "grad_norm": 0.015311076305806637, "learning_rate": 0.00014146695715323165, "loss": 0.0156, "step": 975 }, { "epoch": 0.14239011986923356, "grad_norm": 0.005620781797915697, "learning_rate": 0.0001421931735657226, "loss": 0.0005, "step": 980 }, { "epoch": 0.143116600072648, "grad_norm": 0.006361651234328747, "learning_rate": 0.00014291938997821352, "loss": 0.0173, "step": 985 }, { "epoch": 0.1438430802760625, "grad_norm": 0.6632714867591858, "learning_rate": 0.00014364560639070444, "loss": 0.0008, "step": 990 }, { "epoch": 0.14456956047947694, "grad_norm": 3.7890255451202393, "learning_rate": 0.00014437182280319536, "loss": 0.0591, "step": 995 }, { "epoch": 0.1452960406828914, "grad_norm": 4.573298454284668, "learning_rate": 0.00014509803921568628, "loss": 0.0182, "step": 1000 }, { "epoch": 0.14602252088630585, "grad_norm": 0.18653298914432526, "learning_rate": 0.0001458242556281772, "loss": 0.011, "step": 1005 }, { "epoch": 0.1467490010897203, "grad_norm": 0.0030135358683764935, "learning_rate": 0.00014655047204066812, "loss": 0.017, "step": 1010 }, { "epoch": 0.14747548129313476, "grad_norm": 13.294329643249512, "learning_rate": 0.00014727668845315904, "loss": 0.0359, "step": 1015 }, { "epoch": 0.1482019614965492, "grad_norm": 1.0047153234481812, "learning_rate": 0.00014800290486564996, "loss": 0.0014, "step": 1020 }, { "epoch": 0.14892844169996367, "grad_norm": 0.0042244489304721355, "learning_rate": 0.00014872912127814088, "loss": 0.0008, "step": 1025 }, { "epoch": 0.14965492190337815, "grad_norm": 0.005744027905166149, "learning_rate": 0.00014945533769063183, "loss": 0.0005, "step": 1030 }, { "epoch": 0.1503814021067926, "grad_norm": 0.0027218873146921396, "learning_rate": 0.00015018155410312272, "loss": 0.0009, "step": 1035 }, { "epoch": 0.15110788231020705, "grad_norm": 1.5683510303497314, "learning_rate": 0.00015090777051561367, "loss": 0.0009, "step": 1040 }, { "epoch": 0.1518343625136215, "grad_norm": 0.0024358402006328106, "learning_rate": 0.00015163398692810456, "loss": 0.0008, "step": 1045 }, { "epoch": 0.15256084271703596, "grad_norm": 0.0035784540232270956, "learning_rate": 0.0001523602033405955, "loss": 0.0217, "step": 1050 }, { "epoch": 0.15328732292045041, "grad_norm": 0.017342494800686836, "learning_rate": 0.0001530864197530864, "loss": 0.0002, "step": 1055 }, { "epoch": 0.15401380312386487, "grad_norm": 0.0023592431098222733, "learning_rate": 0.00015381263616557735, "loss": 0.0001, "step": 1060 }, { "epoch": 0.15474028332727932, "grad_norm": 0.0029132033232599497, "learning_rate": 0.00015453885257806827, "loss": 0.0002, "step": 1065 }, { "epoch": 0.1554667635306938, "grad_norm": 5.089969158172607, "learning_rate": 0.0001552650689905592, "loss": 0.0124, "step": 1070 }, { "epoch": 0.15619324373410826, "grad_norm": 0.0020955149084329605, "learning_rate": 0.00015599128540305012, "loss": 0.0002, "step": 1075 }, { "epoch": 0.1569197239375227, "grad_norm": 0.001827805070206523, "learning_rate": 0.00015671750181554104, "loss": 0.0026, "step": 1080 }, { "epoch": 0.15764620414093716, "grad_norm": 0.0018593213753774762, "learning_rate": 0.00015744371822803196, "loss": 0.0001, "step": 1085 }, { "epoch": 0.15837268434435162, "grad_norm": 8.548373222351074, "learning_rate": 0.00015816993464052288, "loss": 0.0116, "step": 1090 }, { "epoch": 0.15909916454776607, "grad_norm": 0.003052167361602187, "learning_rate": 0.0001588961510530138, "loss": 0.023, "step": 1095 }, { "epoch": 0.15982564475118052, "grad_norm": 0.01510961726307869, "learning_rate": 0.00015962236746550472, "loss": 0.0003, "step": 1100 }, { "epoch": 0.16055212495459498, "grad_norm": 0.006872969213873148, "learning_rate": 0.00016034858387799564, "loss": 0.0008, "step": 1105 }, { "epoch": 0.16127860515800943, "grad_norm": 0.0075376201421022415, "learning_rate": 0.00016107480029048659, "loss": 0.001, "step": 1110 }, { "epoch": 0.1620050853614239, "grad_norm": 1.308592438697815, "learning_rate": 0.00016180101670297748, "loss": 0.0206, "step": 1115 }, { "epoch": 0.16273156556483837, "grad_norm": 0.01441910769790411, "learning_rate": 0.00016252723311546843, "loss": 0.0003, "step": 1120 }, { "epoch": 0.16345804576825282, "grad_norm": 4.73635721206665, "learning_rate": 0.00016325344952795935, "loss": 0.0048, "step": 1125 }, { "epoch": 0.16418452597166727, "grad_norm": 0.07317811995744705, "learning_rate": 0.00016397966594045027, "loss": 0.0079, "step": 1130 }, { "epoch": 0.16491100617508173, "grad_norm": 3.066941976547241, "learning_rate": 0.0001647058823529412, "loss": 0.0245, "step": 1135 }, { "epoch": 0.16563748637849618, "grad_norm": 0.20101045072078705, "learning_rate": 0.0001654320987654321, "loss": 0.0538, "step": 1140 }, { "epoch": 0.16636396658191063, "grad_norm": 0.03498254343867302, "learning_rate": 0.00016615831517792303, "loss": 0.0009, "step": 1145 }, { "epoch": 0.1670904467853251, "grad_norm": 0.044696319848299026, "learning_rate": 0.00016688453159041395, "loss": 0.001, "step": 1150 }, { "epoch": 0.16781692698873957, "grad_norm": 0.005176996346563101, "learning_rate": 0.00016761074800290487, "loss": 0.001, "step": 1155 }, { "epoch": 0.16854340719215402, "grad_norm": 0.0034458874724805355, "learning_rate": 0.0001683369644153958, "loss": 0.0295, "step": 1160 }, { "epoch": 0.16926988739556847, "grad_norm": 0.01240626908838749, "learning_rate": 0.0001690631808278867, "loss": 0.0055, "step": 1165 }, { "epoch": 0.16999636759898293, "grad_norm": 0.0073911272920668125, "learning_rate": 0.00016978939724037763, "loss": 0.0002, "step": 1170 }, { "epoch": 0.17072284780239738, "grad_norm": 0.010020995512604713, "learning_rate": 0.00017051561365286855, "loss": 0.0002, "step": 1175 }, { "epoch": 0.17144932800581184, "grad_norm": 0.0028329354245215654, "learning_rate": 0.00017124183006535947, "loss": 0.0002, "step": 1180 }, { "epoch": 0.1721758082092263, "grad_norm": 0.009768263436853886, "learning_rate": 0.00017196804647785042, "loss": 0.0001, "step": 1185 }, { "epoch": 0.17290228841264074, "grad_norm": 0.006985844578593969, "learning_rate": 0.00017269426289034134, "loss": 0.0001, "step": 1190 }, { "epoch": 0.17362876861605522, "grad_norm": 0.003910423722118139, "learning_rate": 0.00017342047930283226, "loss": 0.0001, "step": 1195 }, { "epoch": 0.17435524881946968, "grad_norm": 0.0018550670938566327, "learning_rate": 0.00017414669571532318, "loss": 0.0001, "step": 1200 }, { "epoch": 0.17508172902288413, "grad_norm": 0.003561209188774228, "learning_rate": 0.0001748729121278141, "loss": 0.0001, "step": 1205 }, { "epoch": 0.17580820922629858, "grad_norm": 0.0017712870612740517, "learning_rate": 0.00017559912854030502, "loss": 0.0001, "step": 1210 }, { "epoch": 0.17653468942971304, "grad_norm": 0.002323460765182972, "learning_rate": 0.00017632534495279595, "loss": 0.0001, "step": 1215 }, { "epoch": 0.1772611696331275, "grad_norm": 0.0017775703454390168, "learning_rate": 0.00017705156136528687, "loss": 0.0004, "step": 1220 }, { "epoch": 0.17798764983654194, "grad_norm": 0.003454179735854268, "learning_rate": 0.00017777777777777779, "loss": 0.0001, "step": 1225 }, { "epoch": 0.1787141300399564, "grad_norm": 0.003128621494397521, "learning_rate": 0.0001785039941902687, "loss": 0.0001, "step": 1230 }, { "epoch": 0.17944061024337088, "grad_norm": 0.013285885564982891, "learning_rate": 0.00017923021060275963, "loss": 0.0001, "step": 1235 }, { "epoch": 0.18016709044678533, "grad_norm": 0.0012834910303354263, "learning_rate": 0.00017995642701525055, "loss": 0.0004, "step": 1240 }, { "epoch": 0.1808935706501998, "grad_norm": 0.0010866275988519192, "learning_rate": 0.0001806826434277415, "loss": 0.0001, "step": 1245 }, { "epoch": 0.18162005085361424, "grad_norm": 0.0010630824835970998, "learning_rate": 0.0001814088598402324, "loss": 0.0, "step": 1250 }, { "epoch": 0.1823465310570287, "grad_norm": 0.0011757917236536741, "learning_rate": 0.00018213507625272334, "loss": 0.0, "step": 1255 }, { "epoch": 0.18307301126044315, "grad_norm": 0.0009444226743653417, "learning_rate": 0.00018286129266521423, "loss": 0.008, "step": 1260 }, { "epoch": 0.1837994914638576, "grad_norm": 0.0011839661747217178, "learning_rate": 0.00018358750907770518, "loss": 0.0, "step": 1265 }, { "epoch": 0.18452597166727205, "grad_norm": 0.0008903779671527445, "learning_rate": 0.00018431372549019607, "loss": 0.0, "step": 1270 }, { "epoch": 0.18525245187068654, "grad_norm": 0.0010285211028531194, "learning_rate": 0.00018503994190268702, "loss": 0.0001, "step": 1275 }, { "epoch": 0.185978932074101, "grad_norm": 0.0016522291116416454, "learning_rate": 0.00018576615831517794, "loss": 0.0338, "step": 1280 }, { "epoch": 0.18670541227751544, "grad_norm": 0.001982170157134533, "learning_rate": 0.00018649237472766886, "loss": 0.0009, "step": 1285 }, { "epoch": 0.1874318924809299, "grad_norm": 0.002856120467185974, "learning_rate": 0.00018721859114015978, "loss": 0.0001, "step": 1290 }, { "epoch": 0.18815837268434435, "grad_norm": 6.8815484046936035, "learning_rate": 0.0001879448075526507, "loss": 0.0064, "step": 1295 }, { "epoch": 0.1888848528877588, "grad_norm": 0.002711124252527952, "learning_rate": 0.00018867102396514162, "loss": 0.0001, "step": 1300 }, { "epoch": 0.18961133309117326, "grad_norm": 0.01453580055385828, "learning_rate": 0.00018939724037763254, "loss": 0.0002, "step": 1305 }, { "epoch": 0.1903378132945877, "grad_norm": 0.004619908984750509, "learning_rate": 0.00019012345679012346, "loss": 0.0108, "step": 1310 }, { "epoch": 0.1910642934980022, "grad_norm": 0.002147579798474908, "learning_rate": 0.0001908496732026144, "loss": 0.0002, "step": 1315 }, { "epoch": 0.19179077370141664, "grad_norm": 0.006444690283387899, "learning_rate": 0.0001915758896151053, "loss": 0.0013, "step": 1320 }, { "epoch": 0.1925172539048311, "grad_norm": 0.0015877482946962118, "learning_rate": 0.00019230210602759625, "loss": 0.0011, "step": 1325 }, { "epoch": 0.19324373410824555, "grad_norm": 2.8192436695098877, "learning_rate": 0.00019302832244008715, "loss": 0.0018, "step": 1330 }, { "epoch": 0.19397021431166, "grad_norm": 6.506179332733154, "learning_rate": 0.0001937545388525781, "loss": 0.0067, "step": 1335 }, { "epoch": 0.19469669451507446, "grad_norm": 0.0016660846304148436, "learning_rate": 0.00019448075526506899, "loss": 0.0001, "step": 1340 }, { "epoch": 0.1954231747184889, "grad_norm": 0.0011433791369199753, "learning_rate": 0.00019520697167755993, "loss": 0.0141, "step": 1345 }, { "epoch": 0.19614965492190337, "grad_norm": 0.001556798000819981, "learning_rate": 0.00019593318809005083, "loss": 0.0, "step": 1350 }, { "epoch": 0.19687613512531785, "grad_norm": 0.0035784265492111444, "learning_rate": 0.00019665940450254178, "loss": 0.0001, "step": 1355 }, { "epoch": 0.1976026153287323, "grad_norm": 4.246982097625732, "learning_rate": 0.0001973856209150327, "loss": 0.0068, "step": 1360 }, { "epoch": 0.19832909553214675, "grad_norm": 0.001589273801073432, "learning_rate": 0.00019811183732752362, "loss": 0.012, "step": 1365 }, { "epoch": 0.1990555757355612, "grad_norm": 0.0008454394992440939, "learning_rate": 0.00019883805374001454, "loss": 0.0001, "step": 1370 }, { "epoch": 0.19978205593897566, "grad_norm": 0.0013743549352511764, "learning_rate": 0.00019956427015250546, "loss": 0.0001, "step": 1375 }, { "epoch": 0.20007264802034144, "eval_accuracy": 0.9996397787212145, "eval_f1": 0.9995310825294748, "eval_loss": 0.0019488565158098936, "eval_precision": 0.9990626046200201, "eval_recall": 1.0, "eval_runtime": 124.3971, "eval_samples_per_second": 312.427, "eval_steps_per_second": 2.444, "step": 1377 }, { "epoch": 0.20050853614239011, "grad_norm": 0.002287400420755148, "learning_rate": 0.00019996771329405116, "loss": 0.0093, "step": 1380 }, { "epoch": 0.20123501634580457, "grad_norm": 0.004998628981411457, "learning_rate": 0.00019988699652917914, "loss": 0.0159, "step": 1385 }, { "epoch": 0.20196149654921902, "grad_norm": 0.003076898632571101, "learning_rate": 0.00019980627976430706, "loss": 0.0073, "step": 1390 }, { "epoch": 0.2026879767526335, "grad_norm": 0.7471761107444763, "learning_rate": 0.00019972556299943498, "loss": 0.0033, "step": 1395 }, { "epoch": 0.20341445695604796, "grad_norm": 0.1988172084093094, "learning_rate": 0.00019964484623456293, "loss": 0.0009, "step": 1400 }, { "epoch": 0.2041409371594624, "grad_norm": 0.005002092570066452, "learning_rate": 0.00019956412946969088, "loss": 0.0009, "step": 1405 }, { "epoch": 0.20486741736287686, "grad_norm": 0.24179202318191528, "learning_rate": 0.0001994834127048188, "loss": 0.0182, "step": 1410 }, { "epoch": 0.20559389756629132, "grad_norm": 4.140319347381592, "learning_rate": 0.00019940269593994673, "loss": 0.0052, "step": 1415 }, { "epoch": 0.20632037776970577, "grad_norm": 0.0015831501223146915, "learning_rate": 0.00019932197917507468, "loss": 0.0, "step": 1420 }, { "epoch": 0.20704685797312022, "grad_norm": 0.0023513727355748415, "learning_rate": 0.0001992412624102026, "loss": 0.0001, "step": 1425 }, { "epoch": 0.20777333817653468, "grad_norm": 0.0018358491361141205, "learning_rate": 0.00019916054564533055, "loss": 0.0217, "step": 1430 }, { "epoch": 0.20849981837994916, "grad_norm": 2.7655224800109863, "learning_rate": 0.00019907982888045847, "loss": 0.0393, "step": 1435 }, { "epoch": 0.2092262985833636, "grad_norm": 0.00683799060061574, "learning_rate": 0.00019899911211558642, "loss": 0.0002, "step": 1440 }, { "epoch": 0.20995277878677807, "grad_norm": 0.011541269719600677, "learning_rate": 0.00019891839535071435, "loss": 0.0045, "step": 1445 }, { "epoch": 0.21067925899019252, "grad_norm": 0.013042348437011242, "learning_rate": 0.00019883767858584227, "loss": 0.0003, "step": 1450 }, { "epoch": 0.21140573919360697, "grad_norm": 0.01146721187978983, "learning_rate": 0.00019875696182097025, "loss": 0.0009, "step": 1455 }, { "epoch": 0.21213221939702143, "grad_norm": 0.0053860582411289215, "learning_rate": 0.00019867624505609817, "loss": 0.003, "step": 1460 }, { "epoch": 0.21285869960043588, "grad_norm": 0.35763925313949585, "learning_rate": 0.0001985955282912261, "loss": 0.0066, "step": 1465 }, { "epoch": 0.21358517980385033, "grad_norm": 0.003207879839465022, "learning_rate": 0.00019851481152635401, "loss": 0.0002, "step": 1470 }, { "epoch": 0.21431166000726481, "grad_norm": 0.004152906127274036, "learning_rate": 0.00019843409476148196, "loss": 0.0001, "step": 1475 }, { "epoch": 0.21503814021067927, "grad_norm": 0.003981268033385277, "learning_rate": 0.00019835337799660991, "loss": 0.0002, "step": 1480 }, { "epoch": 0.21576462041409372, "grad_norm": 0.0030321148224174976, "learning_rate": 0.00019827266123173784, "loss": 0.0004, "step": 1485 }, { "epoch": 0.21649110061750818, "grad_norm": 0.0033642794005572796, "learning_rate": 0.00019819194446686579, "loss": 0.0002, "step": 1490 }, { "epoch": 0.21721758082092263, "grad_norm": 0.0015044253086671233, "learning_rate": 0.0001981112277019937, "loss": 0.0, "step": 1495 }, { "epoch": 0.21794406102433708, "grad_norm": 0.0013194256462156773, "learning_rate": 0.00019803051093712166, "loss": 0.0064, "step": 1500 }, { "epoch": 0.21867054122775154, "grad_norm": 0.003604642581194639, "learning_rate": 0.00019794979417224958, "loss": 0.0001, "step": 1505 }, { "epoch": 0.219397021431166, "grad_norm": 0.002144684549421072, "learning_rate": 0.00019786907740737753, "loss": 0.0011, "step": 1510 }, { "epoch": 0.22012350163458047, "grad_norm": 0.00234671076759696, "learning_rate": 0.00019778836064250545, "loss": 0.0001, "step": 1515 }, { "epoch": 0.22084998183799492, "grad_norm": 0.027411388233304024, "learning_rate": 0.00019770764387763338, "loss": 0.0002, "step": 1520 }, { "epoch": 0.22157646204140938, "grad_norm": 0.00431784288957715, "learning_rate": 0.00019762692711276135, "loss": 0.0016, "step": 1525 }, { "epoch": 0.22230294224482383, "grad_norm": 0.007216178812086582, "learning_rate": 0.00019754621034788928, "loss": 0.0031, "step": 1530 }, { "epoch": 0.22302942244823828, "grad_norm": 0.0020561974961310625, "learning_rate": 0.0001974654935830172, "loss": 0.0, "step": 1535 }, { "epoch": 0.22375590265165274, "grad_norm": 0.003935552202165127, "learning_rate": 0.00019738477681814512, "loss": 0.0, "step": 1540 }, { "epoch": 0.2244823828550672, "grad_norm": 0.0017273337580263615, "learning_rate": 0.00019730406005327307, "loss": 0.0018, "step": 1545 }, { "epoch": 0.22520886305848165, "grad_norm": 0.0009397296234965324, "learning_rate": 0.00019722334328840102, "loss": 0.0071, "step": 1550 }, { "epoch": 0.22593534326189613, "grad_norm": 4.2714738845825195, "learning_rate": 0.00019714262652352894, "loss": 0.0043, "step": 1555 }, { "epoch": 0.22666182346531058, "grad_norm": 0.008737271651625633, "learning_rate": 0.0001970619097586569, "loss": 0.0, "step": 1560 }, { "epoch": 0.22738830366872503, "grad_norm": 0.0011167083866894245, "learning_rate": 0.00019698119299378482, "loss": 0.0195, "step": 1565 }, { "epoch": 0.2281147838721395, "grad_norm": 0.0015777769731357694, "learning_rate": 0.00019690047622891274, "loss": 0.0093, "step": 1570 }, { "epoch": 0.22884126407555394, "grad_norm": 1.3581019639968872, "learning_rate": 0.0001968197594640407, "loss": 0.0371, "step": 1575 }, { "epoch": 0.2295677442789684, "grad_norm": 0.005585103295743465, "learning_rate": 0.00019673904269916864, "loss": 0.0003, "step": 1580 }, { "epoch": 0.23029422448238285, "grad_norm": 0.013055490329861641, "learning_rate": 0.00019665832593429656, "loss": 0.0009, "step": 1585 }, { "epoch": 0.2310207046857973, "grad_norm": 0.012752565555274487, "learning_rate": 0.00019657760916942448, "loss": 0.0057, "step": 1590 }, { "epoch": 0.23174718488921178, "grad_norm": 0.016765527427196503, "learning_rate": 0.00019649689240455243, "loss": 0.0053, "step": 1595 }, { "epoch": 0.23247366509262624, "grad_norm": 0.04566654935479164, "learning_rate": 0.00019641617563968038, "loss": 0.0031, "step": 1600 }, { "epoch": 0.2332001452960407, "grad_norm": 0.05058443173766136, "learning_rate": 0.0001963354588748083, "loss": 0.0064, "step": 1605 }, { "epoch": 0.23392662549945514, "grad_norm": 0.006236894056200981, "learning_rate": 0.00019625474210993623, "loss": 0.0202, "step": 1610 }, { "epoch": 0.2346531057028696, "grad_norm": 0.00453936355188489, "learning_rate": 0.00019617402534506418, "loss": 0.0002, "step": 1615 }, { "epoch": 0.23537958590628405, "grad_norm": 0.01652829721570015, "learning_rate": 0.00019609330858019213, "loss": 0.0005, "step": 1620 }, { "epoch": 0.2361060661096985, "grad_norm": 0.28086262941360474, "learning_rate": 0.00019601259181532005, "loss": 0.0012, "step": 1625 }, { "epoch": 0.23683254631311296, "grad_norm": 0.002964381594210863, "learning_rate": 0.000195931875050448, "loss": 0.0001, "step": 1630 }, { "epoch": 0.23755902651652744, "grad_norm": 0.004744562786072493, "learning_rate": 0.00019585115828557592, "loss": 0.0001, "step": 1635 }, { "epoch": 0.2382855067199419, "grad_norm": 0.002022289205342531, "learning_rate": 0.00019577044152070385, "loss": 0.0003, "step": 1640 }, { "epoch": 0.23901198692335635, "grad_norm": 1.325679063796997, "learning_rate": 0.0001956897247558318, "loss": 0.0223, "step": 1645 }, { "epoch": 0.2397384671267708, "grad_norm": 0.005906618200242519, "learning_rate": 0.00019560900799095974, "loss": 0.0004, "step": 1650 }, { "epoch": 0.24046494733018525, "grad_norm": 0.022973524406552315, "learning_rate": 0.00019552829122608767, "loss": 0.0004, "step": 1655 }, { "epoch": 0.2411914275335997, "grad_norm": 0.017179157584905624, "learning_rate": 0.0001954475744612156, "loss": 0.0005, "step": 1660 }, { "epoch": 0.24191790773701416, "grad_norm": 0.011254935525357723, "learning_rate": 0.00019536685769634354, "loss": 0.0008, "step": 1665 }, { "epoch": 0.2426443879404286, "grad_norm": 0.004135392606258392, "learning_rate": 0.0001952861409314715, "loss": 0.0003, "step": 1670 }, { "epoch": 0.24337086814384307, "grad_norm": 0.002715233713388443, "learning_rate": 0.0001952054241665994, "loss": 0.0002, "step": 1675 }, { "epoch": 0.24409734834725755, "grad_norm": 0.00374965975061059, "learning_rate": 0.00019512470740172734, "loss": 0.0001, "step": 1680 }, { "epoch": 0.244823828550672, "grad_norm": 0.0033891089260578156, "learning_rate": 0.00019504399063685528, "loss": 0.0001, "step": 1685 }, { "epoch": 0.24555030875408645, "grad_norm": 0.001574166351929307, "learning_rate": 0.0001949632738719832, "loss": 0.0001, "step": 1690 }, { "epoch": 0.2462767889575009, "grad_norm": 0.001289655570872128, "learning_rate": 0.00019488255710711116, "loss": 0.0, "step": 1695 }, { "epoch": 0.24700326916091536, "grad_norm": 0.0012494047405198216, "learning_rate": 0.0001948018403422391, "loss": 0.0, "step": 1700 }, { "epoch": 0.24772974936432982, "grad_norm": 0.0028091860003769398, "learning_rate": 0.00019472112357736703, "loss": 0.0, "step": 1705 }, { "epoch": 0.24845622956774427, "grad_norm": 0.0020063440315425396, "learning_rate": 0.00019464040681249495, "loss": 0.0, "step": 1710 }, { "epoch": 0.24918270977115872, "grad_norm": 0.00732283852994442, "learning_rate": 0.0001945596900476229, "loss": 0.0001, "step": 1715 }, { "epoch": 0.2499091899745732, "grad_norm": 0.0009436274995096028, "learning_rate": 0.00019447897328275085, "loss": 0.0, "step": 1720 }, { "epoch": 0.25063567017798766, "grad_norm": 0.001065302756614983, "learning_rate": 0.00019439825651787877, "loss": 0.0, "step": 1725 }, { "epoch": 0.2513621503814021, "grad_norm": 0.0007398009183816612, "learning_rate": 0.0001943175397530067, "loss": 0.0, "step": 1730 }, { "epoch": 0.25208863058481656, "grad_norm": 0.0009731051395647228, "learning_rate": 0.00019423682298813465, "loss": 0.0001, "step": 1735 }, { "epoch": 0.25281511078823105, "grad_norm": 0.0006832171930000186, "learning_rate": 0.0001941561062232626, "loss": 0.0, "step": 1740 }, { "epoch": 0.25354159099164547, "grad_norm": 0.0011063286801800132, "learning_rate": 0.00019407538945839052, "loss": 0.0, "step": 1745 }, { "epoch": 0.25426807119505995, "grad_norm": 0.0012475239345803857, "learning_rate": 0.00019399467269351844, "loss": 0.0, "step": 1750 }, { "epoch": 0.2549945513984744, "grad_norm": 0.0008868346340022981, "learning_rate": 0.0001939139559286464, "loss": 0.0, "step": 1755 }, { "epoch": 0.25572103160188886, "grad_norm": 0.0013618022203445435, "learning_rate": 0.00019383323916377431, "loss": 0.0, "step": 1760 }, { "epoch": 0.2564475118053033, "grad_norm": 0.0008328580879606307, "learning_rate": 0.00019375252239890226, "loss": 0.0, "step": 1765 }, { "epoch": 0.25717399200871777, "grad_norm": 0.0017324545187875628, "learning_rate": 0.00019367180563403021, "loss": 0.0, "step": 1770 }, { "epoch": 0.2579004722121322, "grad_norm": 0.0010712060611695051, "learning_rate": 0.00019359108886915814, "loss": 0.0, "step": 1775 }, { "epoch": 0.2586269524155467, "grad_norm": 0.0005095238448120654, "learning_rate": 0.00019351037210428606, "loss": 0.0, "step": 1780 }, { "epoch": 0.25935343261896115, "grad_norm": 0.0014343465445563197, "learning_rate": 0.00019342965533941398, "loss": 0.0, "step": 1785 }, { "epoch": 0.2600799128223756, "grad_norm": 0.0007007729145698249, "learning_rate": 0.00019334893857454196, "loss": 0.0, "step": 1790 }, { "epoch": 0.26080639302579006, "grad_norm": 0.0005924066063016653, "learning_rate": 0.00019326822180966988, "loss": 0.0, "step": 1795 }, { "epoch": 0.2615328732292045, "grad_norm": 0.0004458896000869572, "learning_rate": 0.0001931875050447978, "loss": 0.0, "step": 1800 }, { "epoch": 0.26225935343261897, "grad_norm": 0.005087355151772499, "learning_rate": 0.00019310678827992575, "loss": 0.0, "step": 1805 }, { "epoch": 0.2629858336360334, "grad_norm": 0.11598234623670578, "learning_rate": 0.0001930260715150537, "loss": 0.0004, "step": 1810 }, { "epoch": 0.2637123138394479, "grad_norm": 0.0015027482295408845, "learning_rate": 0.00019294535475018163, "loss": 0.0473, "step": 1815 }, { "epoch": 0.26443879404286236, "grad_norm": 0.04484843090176582, "learning_rate": 0.00019286463798530955, "loss": 0.0003, "step": 1820 }, { "epoch": 0.2651652742462768, "grad_norm": 0.007797603961080313, "learning_rate": 0.0001927839212204375, "loss": 0.0031, "step": 1825 }, { "epoch": 0.26589175444969126, "grad_norm": 0.006486868020147085, "learning_rate": 0.00019270320445556542, "loss": 0.0003, "step": 1830 }, { "epoch": 0.2666182346531057, "grad_norm": 0.005536284297704697, "learning_rate": 0.00019262248769069337, "loss": 0.0004, "step": 1835 }, { "epoch": 0.26734471485652017, "grad_norm": 0.014443649910390377, "learning_rate": 0.00019254177092582132, "loss": 0.0001, "step": 1840 }, { "epoch": 0.2680711950599346, "grad_norm": 0.0030865217559039593, "learning_rate": 0.00019246105416094924, "loss": 0.0001, "step": 1845 }, { "epoch": 0.2687976752633491, "grad_norm": 0.15668638050556183, "learning_rate": 0.00019238033739607717, "loss": 0.0002, "step": 1850 }, { "epoch": 0.2695241554667635, "grad_norm": 0.04532123729586601, "learning_rate": 0.0001922996206312051, "loss": 0.0003, "step": 1855 }, { "epoch": 0.270250635670178, "grad_norm": 0.00196210783906281, "learning_rate": 0.00019221890386633307, "loss": 0.0001, "step": 1860 }, { "epoch": 0.27097711587359247, "grad_norm": 0.0017535451333969831, "learning_rate": 0.000192138187101461, "loss": 0.0012, "step": 1865 }, { "epoch": 0.2717035960770069, "grad_norm": 0.0014856884954497218, "learning_rate": 0.0001920574703365889, "loss": 0.0002, "step": 1870 }, { "epoch": 0.2724300762804214, "grad_norm": 0.004271077457815409, "learning_rate": 0.00019197675357171686, "loss": 0.0049, "step": 1875 }, { "epoch": 0.2731565564838358, "grad_norm": 0.009308665059506893, "learning_rate": 0.00019189603680684478, "loss": 0.0013, "step": 1880 }, { "epoch": 0.2738830366872503, "grad_norm": 0.001470932038500905, "learning_rate": 0.00019181532004197273, "loss": 0.0009, "step": 1885 }, { "epoch": 0.2746095168906647, "grad_norm": 0.0009906482882797718, "learning_rate": 0.00019173460327710066, "loss": 0.0074, "step": 1890 }, { "epoch": 0.2753359970940792, "grad_norm": 0.5366028547286987, "learning_rate": 0.0001916538865122286, "loss": 0.0004, "step": 1895 }, { "epoch": 0.27606247729749367, "grad_norm": 0.0012202219804748893, "learning_rate": 0.00019157316974735653, "loss": 0.0007, "step": 1900 }, { "epoch": 0.2767889575009081, "grad_norm": 0.5043062567710876, "learning_rate": 0.00019149245298248445, "loss": 0.0007, "step": 1905 }, { "epoch": 0.2775154377043226, "grad_norm": 0.0006929966621100903, "learning_rate": 0.0001914117362176124, "loss": 0.01, "step": 1910 }, { "epoch": 0.278241917907737, "grad_norm": 0.0005868257721886039, "learning_rate": 0.00019133101945274035, "loss": 0.0, "step": 1915 }, { "epoch": 0.2789683981111515, "grad_norm": 1.5353443622589111, "learning_rate": 0.00019125030268786827, "loss": 0.0012, "step": 1920 }, { "epoch": 0.2796948783145659, "grad_norm": 0.0007161126704886556, "learning_rate": 0.0001911695859229962, "loss": 0.0, "step": 1925 }, { "epoch": 0.2804213585179804, "grad_norm": 0.0007424887735396624, "learning_rate": 0.00019108886915812417, "loss": 0.0, "step": 1930 }, { "epoch": 0.2811478387213948, "grad_norm": 0.0006449614884331822, "learning_rate": 0.0001910081523932521, "loss": 0.0, "step": 1935 }, { "epoch": 0.2818743189248093, "grad_norm": 0.0006138585740700364, "learning_rate": 0.00019092743562838002, "loss": 0.0, "step": 1940 }, { "epoch": 0.2826007991282238, "grad_norm": 0.0006936938152648509, "learning_rate": 0.00019084671886350797, "loss": 0.0, "step": 1945 }, { "epoch": 0.2833272793316382, "grad_norm": 0.0004829142999369651, "learning_rate": 0.0001907660020986359, "loss": 0.0001, "step": 1950 }, { "epoch": 0.2840537595350527, "grad_norm": 0.0005034743226133287, "learning_rate": 0.00019068528533376384, "loss": 0.0, "step": 1955 }, { "epoch": 0.2847802397384671, "grad_norm": 0.0004061743093188852, "learning_rate": 0.00019060456856889176, "loss": 0.0, "step": 1960 }, { "epoch": 0.2855067199418816, "grad_norm": 0.6731203198432922, "learning_rate": 0.0001905238518040197, "loss": 0.0282, "step": 1965 }, { "epoch": 0.286233200145296, "grad_norm": 0.010977654717862606, "learning_rate": 0.00019044313503914764, "loss": 0.0002, "step": 1970 }, { "epoch": 0.2869596803487105, "grad_norm": 0.022831691429018974, "learning_rate": 0.00019036241827427556, "loss": 0.0006, "step": 1975 }, { "epoch": 0.287686160552125, "grad_norm": 0.026040196418762207, "learning_rate": 0.0001902817015094035, "loss": 0.0005, "step": 1980 }, { "epoch": 0.2884126407555394, "grad_norm": 0.011391847394406796, "learning_rate": 0.00019020098474453146, "loss": 0.0004, "step": 1985 }, { "epoch": 0.2891391209589539, "grad_norm": 0.013334060087800026, "learning_rate": 0.00019012026797965938, "loss": 0.0003, "step": 1990 }, { "epoch": 0.2898656011623683, "grad_norm": 0.0060678147710859776, "learning_rate": 0.0001900395512147873, "loss": 0.0002, "step": 1995 }, { "epoch": 0.2905920813657828, "grad_norm": 0.004468259867280722, "learning_rate": 0.00018995883444991525, "loss": 0.0002, "step": 2000 }, { "epoch": 0.2913185615691972, "grad_norm": 0.0036872695200145245, "learning_rate": 0.0001898781176850432, "loss": 0.0001, "step": 2005 }, { "epoch": 0.2920450417726117, "grad_norm": 0.0026169579941779375, "learning_rate": 0.00018979740092017113, "loss": 0.0001, "step": 2010 }, { "epoch": 0.29277152197602613, "grad_norm": 0.0021394200157374144, "learning_rate": 0.00018971668415529907, "loss": 0.0001, "step": 2015 }, { "epoch": 0.2934980021794406, "grad_norm": 0.0022201493848115206, "learning_rate": 0.000189635967390427, "loss": 0.0001, "step": 2020 }, { "epoch": 0.2942244823828551, "grad_norm": 0.0021840811241418123, "learning_rate": 0.00018955525062555495, "loss": 0.0001, "step": 2025 }, { "epoch": 0.2949509625862695, "grad_norm": 0.0016265831654891372, "learning_rate": 0.00018947453386068287, "loss": 0.0001, "step": 2030 }, { "epoch": 0.295677442789684, "grad_norm": 0.0015095279086381197, "learning_rate": 0.00018939381709581082, "loss": 0.0001, "step": 2035 }, { "epoch": 0.2964039229930984, "grad_norm": 0.0013007308589294553, "learning_rate": 0.00018931310033093874, "loss": 0.0, "step": 2040 }, { "epoch": 0.2971304031965129, "grad_norm": 0.0011377567425370216, "learning_rate": 0.00018923238356606666, "loss": 0.0, "step": 2045 }, { "epoch": 0.29785688339992733, "grad_norm": 0.0017277223523706198, "learning_rate": 0.00018915166680119461, "loss": 0.0, "step": 2050 }, { "epoch": 0.2985833636033418, "grad_norm": 0.0009744380367919803, "learning_rate": 0.00018907095003632256, "loss": 0.0158, "step": 2055 }, { "epoch": 0.2993098438067563, "grad_norm": 0.0012453808449208736, "learning_rate": 0.0001889902332714505, "loss": 0.0, "step": 2060 }, { "epoch": 0.3000363240101707, "grad_norm": 0.05199315398931503, "learning_rate": 0.0001889095165065784, "loss": 0.0002, "step": 2065 }, { "epoch": 0.3007628042135852, "grad_norm": 0.0018118784064427018, "learning_rate": 0.00018882879974170636, "loss": 0.0116, "step": 2070 }, { "epoch": 0.3014892844169996, "grad_norm": 0.002479708520695567, "learning_rate": 0.0001887480829768343, "loss": 0.0096, "step": 2075 }, { "epoch": 0.3022157646204141, "grad_norm": 0.001789470436051488, "learning_rate": 0.00018866736621196223, "loss": 0.0, "step": 2080 }, { "epoch": 0.30294224482382853, "grad_norm": 1.2244577407836914, "learning_rate": 0.00018858664944709018, "loss": 0.002, "step": 2085 }, { "epoch": 0.303668725027243, "grad_norm": 0.001510178786702454, "learning_rate": 0.0001885059326822181, "loss": 0.0001, "step": 2090 }, { "epoch": 0.30439520523065744, "grad_norm": 0.0012227630941197276, "learning_rate": 0.00018842521591734603, "loss": 0.0007, "step": 2095 }, { "epoch": 0.3051216854340719, "grad_norm": 6.986842155456543, "learning_rate": 0.00018834449915247398, "loss": 0.0041, "step": 2100 }, { "epoch": 0.3058481656374864, "grad_norm": 0.0014463558327406645, "learning_rate": 0.00018826378238760193, "loss": 0.0006, "step": 2105 }, { "epoch": 0.30657464584090083, "grad_norm": 0.0013261119602248073, "learning_rate": 0.00018818306562272985, "loss": 0.0058, "step": 2110 }, { "epoch": 0.3073011260443153, "grad_norm": 0.0014859420480206609, "learning_rate": 0.00018810234885785777, "loss": 0.0, "step": 2115 }, { "epoch": 0.30802760624772973, "grad_norm": 0.001101717702113092, "learning_rate": 0.00018802163209298572, "loss": 0.0, "step": 2120 }, { "epoch": 0.3087540864511442, "grad_norm": 0.0022333369124680758, "learning_rate": 0.00018794091532811367, "loss": 0.0003, "step": 2125 }, { "epoch": 0.30948056665455864, "grad_norm": 0.011202757246792316, "learning_rate": 0.0001878601985632416, "loss": 0.0001, "step": 2130 }, { "epoch": 0.3102070468579731, "grad_norm": 0.001800977042876184, "learning_rate": 0.00018777948179836952, "loss": 0.0218, "step": 2135 }, { "epoch": 0.3109335270613876, "grad_norm": 0.004161295481026173, "learning_rate": 0.00018769876503349747, "loss": 0.0002, "step": 2140 }, { "epoch": 0.31166000726480203, "grad_norm": 0.0032398079056292772, "learning_rate": 0.00018761804826862542, "loss": 0.0006, "step": 2145 }, { "epoch": 0.3123864874682165, "grad_norm": 0.04649796336889267, "learning_rate": 0.00018753733150375334, "loss": 0.0001, "step": 2150 }, { "epoch": 0.31311296767163094, "grad_norm": 0.0010927373077720404, "learning_rate": 0.0001874566147388813, "loss": 0.0001, "step": 2155 }, { "epoch": 0.3138394478750454, "grad_norm": 0.002848146017640829, "learning_rate": 0.0001873758979740092, "loss": 0.0001, "step": 2160 }, { "epoch": 0.31456592807845984, "grad_norm": 0.001080561545677483, "learning_rate": 0.00018729518120913713, "loss": 0.0, "step": 2165 }, { "epoch": 0.3152924082818743, "grad_norm": 0.0011905552819371223, "learning_rate": 0.00018721446444426508, "loss": 0.0, "step": 2170 }, { "epoch": 0.31601888848528875, "grad_norm": 0.002129076048731804, "learning_rate": 0.00018713374767939303, "loss": 0.0001, "step": 2175 }, { "epoch": 0.31674536868870323, "grad_norm": 0.0015021953731775284, "learning_rate": 0.00018705303091452096, "loss": 0.0, "step": 2180 }, { "epoch": 0.3174718488921177, "grad_norm": 0.0011074721114709973, "learning_rate": 0.00018697231414964888, "loss": 0.0, "step": 2185 }, { "epoch": 0.31819832909553214, "grad_norm": 0.0013954649912193418, "learning_rate": 0.00018689159738477683, "loss": 0.0, "step": 2190 }, { "epoch": 0.3189248092989466, "grad_norm": 0.0008435107301920652, "learning_rate": 0.00018681088061990478, "loss": 0.0, "step": 2195 }, { "epoch": 0.31965128950236105, "grad_norm": 0.0015673066955059767, "learning_rate": 0.0001867301638550327, "loss": 0.0001, "step": 2200 }, { "epoch": 0.32037776970577553, "grad_norm": 0.0006937576690688729, "learning_rate": 0.00018664944709016062, "loss": 0.0, "step": 2205 }, { "epoch": 0.32110424990918995, "grad_norm": 0.0006967806257307529, "learning_rate": 0.00018656873032528857, "loss": 0.0, "step": 2210 }, { "epoch": 0.32183073011260444, "grad_norm": 0.0010916970204561949, "learning_rate": 0.0001864880135604165, "loss": 0.0, "step": 2215 }, { "epoch": 0.32255721031601886, "grad_norm": 0.054137326776981354, "learning_rate": 0.00018640729679554445, "loss": 0.0001, "step": 2220 }, { "epoch": 0.32328369051943334, "grad_norm": 0.0007835402502678335, "learning_rate": 0.00018632658003067237, "loss": 0.0, "step": 2225 }, { "epoch": 0.3240101707228478, "grad_norm": 0.0006136346491985023, "learning_rate": 0.00018624586326580032, "loss": 0.0, "step": 2230 }, { "epoch": 0.32473665092626225, "grad_norm": 0.0005693508428521454, "learning_rate": 0.00018616514650092824, "loss": 0.0, "step": 2235 }, { "epoch": 0.32546313112967673, "grad_norm": 0.001010082894936204, "learning_rate": 0.0001860844297360562, "loss": 0.0, "step": 2240 }, { "epoch": 0.32618961133309116, "grad_norm": 0.0006115248543210328, "learning_rate": 0.00018600371297118414, "loss": 0.0, "step": 2245 }, { "epoch": 0.32691609153650564, "grad_norm": 0.005977267399430275, "learning_rate": 0.00018592299620631206, "loss": 0.0, "step": 2250 }, { "epoch": 0.32764257173992006, "grad_norm": 0.0004075188480783254, "learning_rate": 0.00018584227944143999, "loss": 0.0, "step": 2255 }, { "epoch": 0.32836905194333454, "grad_norm": 0.0005186618654988706, "learning_rate": 0.00018576156267656794, "loss": 0.0, "step": 2260 }, { "epoch": 0.329095532146749, "grad_norm": 0.0005320632481016219, "learning_rate": 0.00018568084591169589, "loss": 0.0, "step": 2265 }, { "epoch": 0.32982201235016345, "grad_norm": 0.029953761026263237, "learning_rate": 0.0001856001291468238, "loss": 0.0, "step": 2270 }, { "epoch": 0.33054849255357793, "grad_norm": 0.0003188280388712883, "learning_rate": 0.00018551941238195173, "loss": 0.0, "step": 2275 }, { "epoch": 0.33127497275699236, "grad_norm": 0.0004120226367376745, "learning_rate": 0.00018543869561707968, "loss": 0.0, "step": 2280 }, { "epoch": 0.33200145296040684, "grad_norm": 0.0005906698643229902, "learning_rate": 0.0001853579788522076, "loss": 0.0, "step": 2285 }, { "epoch": 0.33272793316382127, "grad_norm": 0.00045190524542704225, "learning_rate": 0.00018527726208733555, "loss": 0.0, "step": 2290 }, { "epoch": 0.33345441336723575, "grad_norm": 0.0008185270125977695, "learning_rate": 0.00018519654532246348, "loss": 0.0, "step": 2295 }, { "epoch": 0.3341808935706502, "grad_norm": 0.0003965144860558212, "learning_rate": 0.00018511582855759143, "loss": 0.0, "step": 2300 }, { "epoch": 0.33490737377406465, "grad_norm": 0.0003858699928969145, "learning_rate": 0.00018503511179271935, "loss": 0.0, "step": 2305 }, { "epoch": 0.33563385397747914, "grad_norm": 0.0005558038246817887, "learning_rate": 0.00018495439502784727, "loss": 0.0, "step": 2310 }, { "epoch": 0.33636033418089356, "grad_norm": 0.00037957995664328337, "learning_rate": 0.00018487367826297525, "loss": 0.0, "step": 2315 }, { "epoch": 0.33708681438430804, "grad_norm": 0.0003773049684241414, "learning_rate": 0.00018479296149810317, "loss": 0.0, "step": 2320 }, { "epoch": 0.33781329458772247, "grad_norm": 0.0006691055023111403, "learning_rate": 0.0001847122447332311, "loss": 0.0, "step": 2325 }, { "epoch": 0.33853977479113695, "grad_norm": 0.000681467994581908, "learning_rate": 0.00018463152796835904, "loss": 0.0, "step": 2330 }, { "epoch": 0.3392662549945514, "grad_norm": 0.0005777952610515058, "learning_rate": 0.000184550811203487, "loss": 0.0, "step": 2335 }, { "epoch": 0.33999273519796586, "grad_norm": 0.0005241065518930554, "learning_rate": 0.00018447009443861492, "loss": 0.0, "step": 2340 }, { "epoch": 0.34071921540138034, "grad_norm": 0.00039175679557956755, "learning_rate": 0.00018438937767374284, "loss": 0.0, "step": 2345 }, { "epoch": 0.34144569560479476, "grad_norm": 0.00041981766116805375, "learning_rate": 0.0001843086609088708, "loss": 0.0, "step": 2350 }, { "epoch": 0.34217217580820924, "grad_norm": 0.000371248199371621, "learning_rate": 0.0001842279441439987, "loss": 0.0, "step": 2355 }, { "epoch": 0.34289865601162367, "grad_norm": 0.00031778172706253827, "learning_rate": 0.00018414722737912666, "loss": 0.0, "step": 2360 }, { "epoch": 0.34362513621503815, "grad_norm": 0.00029086892027407885, "learning_rate": 0.00018406651061425458, "loss": 0.0, "step": 2365 }, { "epoch": 0.3443516164184526, "grad_norm": 0.0002902498235926032, "learning_rate": 0.00018398579384938253, "loss": 0.0, "step": 2370 }, { "epoch": 0.34507809662186706, "grad_norm": 0.00040075520519167185, "learning_rate": 0.00018390507708451045, "loss": 0.0, "step": 2375 }, { "epoch": 0.3458045768252815, "grad_norm": 0.00024263348313979805, "learning_rate": 0.00018382436031963838, "loss": 0.0, "step": 2380 }, { "epoch": 0.34653105702869597, "grad_norm": 0.0003889152139890939, "learning_rate": 0.00018374364355476635, "loss": 0.0, "step": 2385 }, { "epoch": 0.34725753723211045, "grad_norm": 0.00022724135487806052, "learning_rate": 0.00018366292678989428, "loss": 0.0, "step": 2390 }, { "epoch": 0.3479840174355249, "grad_norm": 0.0003505950153339654, "learning_rate": 0.0001835822100250222, "loss": 0.0, "step": 2395 }, { "epoch": 0.34871049763893935, "grad_norm": 0.27515658736228943, "learning_rate": 0.00018350149326015015, "loss": 0.0002, "step": 2400 }, { "epoch": 0.3494369778423538, "grad_norm": 0.0003519939782563597, "learning_rate": 0.00018342077649527807, "loss": 0.0, "step": 2405 }, { "epoch": 0.35016345804576826, "grad_norm": 0.00033144818735308945, "learning_rate": 0.00018334005973040602, "loss": 0.0, "step": 2410 }, { "epoch": 0.3508899382491827, "grad_norm": 0.0003098642046097666, "learning_rate": 0.00018325934296553394, "loss": 0.0, "step": 2415 }, { "epoch": 0.35161641845259717, "grad_norm": 0.0002381189988227561, "learning_rate": 0.0001831786262006619, "loss": 0.0, "step": 2420 }, { "epoch": 0.35234289865601165, "grad_norm": 0.0007972380262799561, "learning_rate": 0.00018309790943578982, "loss": 0.0, "step": 2425 }, { "epoch": 0.3530693788594261, "grad_norm": 0.0025481837801635265, "learning_rate": 0.00018301719267091774, "loss": 0.0, "step": 2430 }, { "epoch": 0.35379585906284056, "grad_norm": 0.00035965273855254054, "learning_rate": 0.0001829364759060457, "loss": 0.0, "step": 2435 }, { "epoch": 0.354522339266255, "grad_norm": 0.000297486170893535, "learning_rate": 0.00018285575914117364, "loss": 0.0, "step": 2440 }, { "epoch": 0.35524881946966946, "grad_norm": 0.00028157353517599404, "learning_rate": 0.00018277504237630156, "loss": 0.0021, "step": 2445 }, { "epoch": 0.3559752996730839, "grad_norm": 0.0003479410079307854, "learning_rate": 0.00018269432561142948, "loss": 0.0, "step": 2450 }, { "epoch": 0.35670177987649837, "grad_norm": 0.002874035155400634, "learning_rate": 0.00018261360884655746, "loss": 0.0003, "step": 2455 }, { "epoch": 0.3574282600799128, "grad_norm": 0.00015613746654707938, "learning_rate": 0.00018253289208168538, "loss": 0.0004, "step": 2460 }, { "epoch": 0.3581547402833273, "grad_norm": 0.00013312845840118825, "learning_rate": 0.0001824521753168133, "loss": 0.0, "step": 2465 }, { "epoch": 0.35888122048674176, "grad_norm": 0.0001981378736672923, "learning_rate": 0.00018237145855194126, "loss": 0.0, "step": 2470 }, { "epoch": 0.3596077006901562, "grad_norm": 0.00027879534172825515, "learning_rate": 0.00018229074178706918, "loss": 0.0, "step": 2475 }, { "epoch": 0.36033418089357067, "grad_norm": 0.00016323383897542953, "learning_rate": 0.00018221002502219713, "loss": 0.0, "step": 2480 }, { "epoch": 0.3610606610969851, "grad_norm": 0.0005233317497186363, "learning_rate": 0.00018212930825732505, "loss": 0.0, "step": 2485 }, { "epoch": 0.3617871413003996, "grad_norm": 0.00013268415932543576, "learning_rate": 0.000182048591492453, "loss": 0.0, "step": 2490 }, { "epoch": 0.362513621503814, "grad_norm": 0.01259111799299717, "learning_rate": 0.00018196787472758092, "loss": 0.0001, "step": 2495 }, { "epoch": 0.3632401017072285, "grad_norm": 0.00014725365326739848, "learning_rate": 0.00018188715796270885, "loss": 0.0, "step": 2500 }, { "epoch": 0.36396658191064296, "grad_norm": 0.00021464233577717096, "learning_rate": 0.0001818064411978368, "loss": 0.0, "step": 2505 }, { "epoch": 0.3646930621140574, "grad_norm": 0.00011434618500061333, "learning_rate": 0.00018172572443296475, "loss": 0.0, "step": 2510 }, { "epoch": 0.36541954231747187, "grad_norm": 0.00012706074630841613, "learning_rate": 0.00018164500766809267, "loss": 0.0, "step": 2515 }, { "epoch": 0.3661460225208863, "grad_norm": 0.00015453774540219456, "learning_rate": 0.0001815642909032206, "loss": 0.0, "step": 2520 }, { "epoch": 0.3668725027243008, "grad_norm": 0.00014317889872472733, "learning_rate": 0.00018148357413834854, "loss": 0.0, "step": 2525 }, { "epoch": 0.3675989829277152, "grad_norm": 0.00014966298476792872, "learning_rate": 0.0001814028573734765, "loss": 0.0, "step": 2530 }, { "epoch": 0.3683254631311297, "grad_norm": 0.0001484445674577728, "learning_rate": 0.00018132214060860441, "loss": 0.0, "step": 2535 }, { "epoch": 0.3690519433345441, "grad_norm": 0.00012702727690339088, "learning_rate": 0.00018124142384373234, "loss": 0.0, "step": 2540 }, { "epoch": 0.3697784235379586, "grad_norm": 0.0001310681545874104, "learning_rate": 0.00018116070707886029, "loss": 0.0, "step": 2545 }, { "epoch": 0.37050490374137307, "grad_norm": 0.0001544792321510613, "learning_rate": 0.00018107999031398824, "loss": 0.0, "step": 2550 }, { "epoch": 0.3712313839447875, "grad_norm": 0.0003174188022967428, "learning_rate": 0.00018099927354911616, "loss": 0.0, "step": 2555 }, { "epoch": 0.371957864148202, "grad_norm": 0.00012976166908629239, "learning_rate": 0.0001809185567842441, "loss": 0.0, "step": 2560 }, { "epoch": 0.3726843443516164, "grad_norm": 0.00011333979637129232, "learning_rate": 0.00018083784001937203, "loss": 0.0, "step": 2565 }, { "epoch": 0.3734108245550309, "grad_norm": 0.00014128838665783405, "learning_rate": 0.00018075712325449995, "loss": 0.0, "step": 2570 }, { "epoch": 0.3741373047584453, "grad_norm": 9.816375677473843e-05, "learning_rate": 0.0001806764064896279, "loss": 0.0, "step": 2575 }, { "epoch": 0.3748637849618598, "grad_norm": 0.00012458849232643843, "learning_rate": 0.00018059568972475585, "loss": 0.0, "step": 2580 }, { "epoch": 0.3755902651652743, "grad_norm": 0.00011874383199028671, "learning_rate": 0.00018051497295988378, "loss": 0.0, "step": 2585 }, { "epoch": 0.3763167453686887, "grad_norm": 0.00010492030560271814, "learning_rate": 0.0001804342561950117, "loss": 0.0, "step": 2590 }, { "epoch": 0.3770432255721032, "grad_norm": 0.00012079241423634812, "learning_rate": 0.00018035353943013965, "loss": 0.0, "step": 2595 }, { "epoch": 0.3777697057755176, "grad_norm": 0.0010301030706614256, "learning_rate": 0.0001802728226652676, "loss": 0.0, "step": 2600 }, { "epoch": 0.3784961859789321, "grad_norm": 0.00020237726857885718, "learning_rate": 0.00018019210590039552, "loss": 0.0, "step": 2605 }, { "epoch": 0.3792226661823465, "grad_norm": 0.00014590570935979486, "learning_rate": 0.00018011138913552344, "loss": 0.0, "step": 2610 }, { "epoch": 0.379949146385761, "grad_norm": 0.00012144942593295127, "learning_rate": 0.0001800306723706514, "loss": 0.0, "step": 2615 }, { "epoch": 0.3806756265891754, "grad_norm": 0.00011861774692079052, "learning_rate": 0.00017994995560577932, "loss": 0.0, "step": 2620 }, { "epoch": 0.3814021067925899, "grad_norm": 0.0002795616746880114, "learning_rate": 0.00017986923884090727, "loss": 0.0, "step": 2625 }, { "epoch": 0.3821285869960044, "grad_norm": 0.0001514231407782063, "learning_rate": 0.00017978852207603522, "loss": 0.0, "step": 2630 }, { "epoch": 0.3828550671994188, "grad_norm": 0.000137203314807266, "learning_rate": 0.00017970780531116314, "loss": 0.0, "step": 2635 }, { "epoch": 0.3835815474028333, "grad_norm": 0.00011654103582259268, "learning_rate": 0.00017962708854629106, "loss": 0.0, "step": 2640 }, { "epoch": 0.3843080276062477, "grad_norm": 0.00011019224621122703, "learning_rate": 0.000179546371781419, "loss": 0.0, "step": 2645 }, { "epoch": 0.3850345078096622, "grad_norm": 0.00011716793233063072, "learning_rate": 0.00017946565501654696, "loss": 0.0, "step": 2650 }, { "epoch": 0.3857609880130766, "grad_norm": 0.00013133355241734535, "learning_rate": 0.00017938493825167488, "loss": 0.0, "step": 2655 }, { "epoch": 0.3864874682164911, "grad_norm": 0.00010616648069117218, "learning_rate": 0.0001793042214868028, "loss": 0.0, "step": 2660 }, { "epoch": 0.3872139484199056, "grad_norm": 0.00012793530186172575, "learning_rate": 0.00017922350472193076, "loss": 0.0, "step": 2665 }, { "epoch": 0.38794042862332, "grad_norm": 0.00021880699205212295, "learning_rate": 0.0001791427879570587, "loss": 0.0, "step": 2670 }, { "epoch": 0.3886669088267345, "grad_norm": 0.0321350060403347, "learning_rate": 0.00017906207119218663, "loss": 0.0, "step": 2675 }, { "epoch": 0.3893933890301489, "grad_norm": 0.0001054102904163301, "learning_rate": 0.00017898135442731455, "loss": 0.0, "step": 2680 }, { "epoch": 0.3901198692335634, "grad_norm": 0.00011370116408215836, "learning_rate": 0.0001789006376624425, "loss": 0.0, "step": 2685 }, { "epoch": 0.3908463494369778, "grad_norm": 7.921565702417865e-05, "learning_rate": 0.00017881992089757042, "loss": 0.0, "step": 2690 }, { "epoch": 0.3915728296403923, "grad_norm": 0.0001325017656199634, "learning_rate": 0.00017873920413269837, "loss": 0.0, "step": 2695 }, { "epoch": 0.39229930984380673, "grad_norm": 0.00011485354480100796, "learning_rate": 0.00017865848736782632, "loss": 0.0, "step": 2700 }, { "epoch": 0.3930257900472212, "grad_norm": 0.0001319620932918042, "learning_rate": 0.00017857777060295424, "loss": 0.0, "step": 2705 }, { "epoch": 0.3937522702506357, "grad_norm": 0.00011554160300875083, "learning_rate": 0.00017849705383808217, "loss": 0.0, "step": 2710 }, { "epoch": 0.3944787504540501, "grad_norm": 0.00011111667845398188, "learning_rate": 0.00017841633707321012, "loss": 0.0, "step": 2715 }, { "epoch": 0.3952052306574646, "grad_norm": 0.00030816654907539487, "learning_rate": 0.00017833562030833807, "loss": 0.0, "step": 2720 }, { "epoch": 0.395931710860879, "grad_norm": 0.00012618518667295575, "learning_rate": 0.000178254903543466, "loss": 0.0, "step": 2725 }, { "epoch": 0.3966581910642935, "grad_norm": 0.00011036815703846514, "learning_rate": 0.0001781741867785939, "loss": 0.0, "step": 2730 }, { "epoch": 0.39738467126770793, "grad_norm": 0.001136181759648025, "learning_rate": 0.00017809347001372186, "loss": 0.0, "step": 2735 }, { "epoch": 0.3981111514711224, "grad_norm": 9.4526847533416e-05, "learning_rate": 0.00017801275324884978, "loss": 0.0, "step": 2740 }, { "epoch": 0.3988376316745369, "grad_norm": 9.693180618342012e-05, "learning_rate": 0.00017793203648397773, "loss": 0.0, "step": 2745 }, { "epoch": 0.3995641118779513, "grad_norm": 0.00013439155009109527, "learning_rate": 0.00017785131971910566, "loss": 0.0, "step": 2750 }, { "epoch": 0.4001452960406829, "eval_accuracy": 1.0, "eval_f1": 1.0, "eval_loss": 8.966613904703991e-07, "eval_precision": 1.0, "eval_recall": 1.0, "eval_runtime": 123.9354, "eval_samples_per_second": 313.591, "eval_steps_per_second": 2.453, "step": 2754 } ], "logging_steps": 5, "max_steps": 13766, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1377, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3618642193367040.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }