{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998985080686086, "eval_steps": 500, "global_step": 2463, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0040596772556581754, "grad_norm": 132.0478057861328, "learning_rate": 1.3513513513513515e-10, "loss": 15.9449, "step": 10 }, { "epoch": 0.008119354511316351, "grad_norm": 185.23609924316406, "learning_rate": 2.702702702702703e-10, "loss": 17.0759, "step": 20 }, { "epoch": 0.012179031766974525, "grad_norm": 138.67050170898438, "learning_rate": 4.0540540540540546e-10, "loss": 16.5406, "step": 30 }, { "epoch": 0.016238709022632702, "grad_norm": 154.48605346679688, "learning_rate": 5.405405405405406e-10, "loss": 16.9727, "step": 40 }, { "epoch": 0.020298386278290875, "grad_norm": 157.55892944335938, "learning_rate": 6.756756756756757e-10, "loss": 16.3976, "step": 50 }, { "epoch": 0.02435806353394905, "grad_norm": 195.1601104736328, "learning_rate": 8.108108108108109e-10, "loss": 17.7916, "step": 60 }, { "epoch": 0.028417740789607227, "grad_norm": 185.73776245117188, "learning_rate": 9.45945945945946e-10, "loss": 16.9119, "step": 70 }, { "epoch": 0.032477418045265403, "grad_norm": 200.99549865722656, "learning_rate": 9.9998443648451e-10, "loss": 17.2071, "step": 80 }, { "epoch": 0.036537095300923576, "grad_norm": 204.63539123535156, "learning_rate": 9.99889329620792e-10, "loss": 16.493, "step": 90 }, { "epoch": 0.04059677255658175, "grad_norm": 176.94541931152344, "learning_rate": 9.997077787173976e-10, "loss": 17.9207, "step": 100 }, { "epoch": 0.04465644981223993, "grad_norm": 208.50137329101562, "learning_rate": 9.99439815169263e-10, "loss": 16.9535, "step": 110 }, { "epoch": 0.0487161270678981, "grad_norm": 169.84095764160156, "learning_rate": 9.990854853143476e-10, "loss": 16.6511, "step": 120 }, { "epoch": 0.052775804323556275, "grad_norm": 176.42965698242188, "learning_rate": 9.98644850425622e-10, "loss": 17.7791, "step": 130 }, { "epoch": 0.056835481579214454, "grad_norm": 171.8761749267578, "learning_rate": 9.981179867004708e-10, "loss": 17.931, "step": 140 }, { "epoch": 0.06089515883487263, "grad_norm": 222.7410888671875, "learning_rate": 9.97504985247518e-10, "loss": 16.1024, "step": 150 }, { "epoch": 0.06495483609053081, "grad_norm": 173.39248657226562, "learning_rate": 9.968059520708706e-10, "loss": 17.2411, "step": 160 }, { "epoch": 0.06901451334618898, "grad_norm": 202.84156799316406, "learning_rate": 9.960210080517876e-10, "loss": 17.6544, "step": 170 }, { "epoch": 0.07307419060184715, "grad_norm": 195.48196411132812, "learning_rate": 9.951502889277773e-10, "loss": 17.2764, "step": 180 }, { "epoch": 0.07713386785750533, "grad_norm": 204.30767822265625, "learning_rate": 9.941939452691238e-10, "loss": 17.3761, "step": 190 }, { "epoch": 0.0811935451131635, "grad_norm": 233.84881591796875, "learning_rate": 9.931521424528503e-10, "loss": 17.5323, "step": 200 }, { "epoch": 0.08525322236882169, "grad_norm": 171.0516357421875, "learning_rate": 9.920250606341204e-10, "loss": 17.3739, "step": 210 }, { "epoch": 0.08931289962447986, "grad_norm": 225.2151641845703, "learning_rate": 9.908128947150849e-10, "loss": 17.3732, "step": 220 }, { "epoch": 0.09337257688013803, "grad_norm": 144.97401428222656, "learning_rate": 9.895158543111775e-10, "loss": 16.4779, "step": 230 }, { "epoch": 0.0974322541357962, "grad_norm": 194.57334899902344, "learning_rate": 9.881341637148678e-10, "loss": 17.972, "step": 240 }, { "epoch": 0.10149193139145438, "grad_norm": 182.3833770751953, "learning_rate": 9.866680618568744e-10, "loss": 17.1289, "step": 250 }, { "epoch": 0.10555160864711255, "grad_norm": 139.1823272705078, "learning_rate": 9.851178022648477e-10, "loss": 16.7695, "step": 260 }, { "epoch": 0.10961128590277074, "grad_norm": 147.51815795898438, "learning_rate": 9.834836530195282e-10, "loss": 16.6021, "step": 270 }, { "epoch": 0.11367096315842891, "grad_norm": 141.7736358642578, "learning_rate": 9.817658967083883e-10, "loss": 17.0966, "step": 280 }, { "epoch": 0.11773064041408708, "grad_norm": 188.4720001220703, "learning_rate": 9.799648303767659e-10, "loss": 16.9828, "step": 290 }, { "epoch": 0.12179031766974525, "grad_norm": 148.27267456054688, "learning_rate": 9.780807654764966e-10, "loss": 17.4211, "step": 300 }, { "epoch": 0.12584999492540344, "grad_norm": 149.38882446289062, "learning_rate": 9.761140278120562e-10, "loss": 16.9751, "step": 310 }, { "epoch": 0.12990967218106161, "grad_norm": 159.62318420410156, "learning_rate": 9.740649574842206e-10, "loss": 16.3416, "step": 320 }, { "epoch": 0.1339693494367198, "grad_norm": 158.93472290039062, "learning_rate": 9.719339088312521e-10, "loss": 16.6636, "step": 330 }, { "epoch": 0.13802902669237796, "grad_norm": 197.2371826171875, "learning_rate": 9.697212503676272e-10, "loss": 18.0939, "step": 340 }, { "epoch": 0.14208870394803613, "grad_norm": 149.5404510498047, "learning_rate": 9.674273647203087e-10, "loss": 16.6984, "step": 350 }, { "epoch": 0.1461483812036943, "grad_norm": 159.680908203125, "learning_rate": 9.650526485625804e-10, "loss": 17.404, "step": 360 }, { "epoch": 0.15020805845935248, "grad_norm": 171.5859832763672, "learning_rate": 9.625975125454515e-10, "loss": 16.8117, "step": 370 }, { "epoch": 0.15426773571501065, "grad_norm": 199.5972137451172, "learning_rate": 9.600623812266447e-10, "loss": 17.56, "step": 380 }, { "epoch": 0.15832741297066882, "grad_norm": 147.773681640625, "learning_rate": 9.57447692997178e-10, "loss": 17.4291, "step": 390 }, { "epoch": 0.162387090226327, "grad_norm": 218.25433349609375, "learning_rate": 9.54753900005557e-10, "loss": 17.5885, "step": 400 }, { "epoch": 0.16644676748198517, "grad_norm": 169.73826599121094, "learning_rate": 9.519814680795842e-10, "loss": 16.6519, "step": 410 }, { "epoch": 0.17050644473764337, "grad_norm": 166.3510284423828, "learning_rate": 9.491308766458076e-10, "loss": 17.2467, "step": 420 }, { "epoch": 0.17456612199330154, "grad_norm": 182.20436096191406, "learning_rate": 9.462026186466134e-10, "loss": 17.4754, "step": 430 }, { "epoch": 0.17862579924895972, "grad_norm": 162.10064697265625, "learning_rate": 9.431972004549834e-10, "loss": 16.3912, "step": 440 }, { "epoch": 0.1826854765046179, "grad_norm": 211.85739135742188, "learning_rate": 9.40115141786931e-10, "loss": 18.0005, "step": 450 }, { "epoch": 0.18674515376027606, "grad_norm": 156.6442413330078, "learning_rate": 9.369569756116282e-10, "loss": 16.4153, "step": 460 }, { "epoch": 0.19080483101593423, "grad_norm": 160.61705017089844, "learning_rate": 9.337232480592392e-10, "loss": 17.6727, "step": 470 }, { "epoch": 0.1948645082715924, "grad_norm": 152.6673583984375, "learning_rate": 9.304145183264834e-10, "loss": 17.8167, "step": 480 }, { "epoch": 0.19892418552725058, "grad_norm": 200.42538452148438, "learning_rate": 9.270313585799328e-10, "loss": 17.4904, "step": 490 }, { "epoch": 0.20298386278290875, "grad_norm": 232.02088928222656, "learning_rate": 9.235743538570709e-10, "loss": 16.3814, "step": 500 }, { "epoch": 0.20704354003856693, "grad_norm": 159.0342254638672, "learning_rate": 9.200441019651237e-10, "loss": 16.7111, "step": 510 }, { "epoch": 0.2111032172942251, "grad_norm": 189.23023986816406, "learning_rate": 9.164412133776831e-10, "loss": 17.5323, "step": 520 }, { "epoch": 0.21516289454988327, "grad_norm": 126.5080337524414, "learning_rate": 9.127663111291399e-10, "loss": 17.2915, "step": 530 }, { "epoch": 0.21922257180554147, "grad_norm": 206.55093383789062, "learning_rate": 9.09020030706945e-10, "loss": 17.1491, "step": 540 }, { "epoch": 0.22328224906119964, "grad_norm": 219.1647491455078, "learning_rate": 9.052030199417168e-10, "loss": 17.3283, "step": 550 }, { "epoch": 0.22734192631685782, "grad_norm": 178.6308135986328, "learning_rate": 9.013159388952136e-10, "loss": 16.8583, "step": 560 }, { "epoch": 0.231401603572516, "grad_norm": 173.4936065673828, "learning_rate": 8.973594597461927e-10, "loss": 17.5231, "step": 570 }, { "epoch": 0.23546128082817416, "grad_norm": 162.73269653320312, "learning_rate": 8.933342666741717e-10, "loss": 17.1647, "step": 580 }, { "epoch": 0.23952095808383234, "grad_norm": 200.88888549804688, "learning_rate": 8.892410557411171e-10, "loss": 17.3196, "step": 590 }, { "epoch": 0.2435806353394905, "grad_norm": 187.9020233154297, "learning_rate": 8.850805347710753e-10, "loss": 17.6811, "step": 600 }, { "epoch": 0.24764031259514868, "grad_norm": 201.80628967285156, "learning_rate": 8.80853423227773e-10, "loss": 18.0601, "step": 610 }, { "epoch": 0.2516999898508069, "grad_norm": 130.6888427734375, "learning_rate": 8.765604520902013e-10, "loss": 15.8318, "step": 620 }, { "epoch": 0.255759667106465, "grad_norm": 174.96263122558594, "learning_rate": 8.722023637262114e-10, "loss": 17.533, "step": 630 }, { "epoch": 0.25981934436212323, "grad_norm": 191.3065643310547, "learning_rate": 8.677799117641387e-10, "loss": 17.1311, "step": 640 }, { "epoch": 0.2638790216177814, "grad_norm": 202.28585815429688, "learning_rate": 8.632938609624813e-10, "loss": 17.2724, "step": 650 }, { "epoch": 0.2679386988734396, "grad_norm": 222.07972717285156, "learning_rate": 8.587449870776526e-10, "loss": 17.2216, "step": 660 }, { "epoch": 0.2719983761290977, "grad_norm": 214.5223846435547, "learning_rate": 8.541340767298328e-10, "loss": 17.3321, "step": 670 }, { "epoch": 0.2760580533847559, "grad_norm": 168.41421508789062, "learning_rate": 8.494619272669418e-10, "loss": 17.529, "step": 680 }, { "epoch": 0.28011773064041406, "grad_norm": 182.80117797851562, "learning_rate": 8.447293466267558e-10, "loss": 18.1657, "step": 690 }, { "epoch": 0.28417740789607226, "grad_norm": 190.1322784423828, "learning_rate": 8.399371531971954e-10, "loss": 18.3519, "step": 700 }, { "epoch": 0.2882370851517304, "grad_norm": 195.91421508789062, "learning_rate": 8.350861756748022e-10, "loss": 17.8645, "step": 710 }, { "epoch": 0.2922967624073886, "grad_norm": 133.7427215576172, "learning_rate": 8.301772529214376e-10, "loss": 17.2449, "step": 720 }, { "epoch": 0.2963564396630468, "grad_norm": 197.28350830078125, "learning_rate": 8.252112338192204e-10, "loss": 17.3724, "step": 730 }, { "epoch": 0.30041611691870496, "grad_norm": 185.95706176757812, "learning_rate": 8.201889771237327e-10, "loss": 16.9303, "step": 740 }, { "epoch": 0.30447579417436316, "grad_norm": 167.47555541992188, "learning_rate": 8.151113513155189e-10, "loss": 17.2537, "step": 750 }, { "epoch": 0.3085354714300213, "grad_norm": 232.46424865722656, "learning_rate": 8.099792344499018e-10, "loss": 17.4633, "step": 760 }, { "epoch": 0.3125951486856795, "grad_norm": 188.18106079101562, "learning_rate": 8.047935140051446e-10, "loss": 17.2019, "step": 770 }, { "epoch": 0.31665482594133765, "grad_norm": 177.53736877441406, "learning_rate": 7.995550867289819e-10, "loss": 16.7029, "step": 780 }, { "epoch": 0.32071450319699585, "grad_norm": 184.06761169433594, "learning_rate": 7.942648584835484e-10, "loss": 18.0381, "step": 790 }, { "epoch": 0.324774180452654, "grad_norm": 173.73468017578125, "learning_rate": 7.889237440887321e-10, "loss": 18.0302, "step": 800 }, { "epoch": 0.3288338577083122, "grad_norm": 217.86709594726562, "learning_rate": 7.835326671639764e-10, "loss": 18.0424, "step": 810 }, { "epoch": 0.33289353496397034, "grad_norm": 202.07118225097656, "learning_rate": 7.780925599685638e-10, "loss": 16.8956, "step": 820 }, { "epoch": 0.33695321221962854, "grad_norm": 191.79275512695312, "learning_rate": 7.726043632404022e-10, "loss": 17.3942, "step": 830 }, { "epoch": 0.34101288947528674, "grad_norm": 161.33790588378906, "learning_rate": 7.670690260333475e-10, "loss": 17.1583, "step": 840 }, { "epoch": 0.3450725667309449, "grad_norm": 181.90426635742188, "learning_rate": 7.614875055530866e-10, "loss": 17.1477, "step": 850 }, { "epoch": 0.3491322439866031, "grad_norm": 213.49960327148438, "learning_rate": 7.558607669916116e-10, "loss": 17.6481, "step": 860 }, { "epoch": 0.35319192124226123, "grad_norm": 202.2276153564453, "learning_rate": 7.501897833603124e-10, "loss": 16.7866, "step": 870 }, { "epoch": 0.35725159849791943, "grad_norm": 177.2437286376953, "learning_rate": 7.444755353217177e-10, "loss": 17.1007, "step": 880 }, { "epoch": 0.3613112757535776, "grad_norm": 161.7916717529297, "learning_rate": 7.387190110199122e-10, "loss": 16.8443, "step": 890 }, { "epoch": 0.3653709530092358, "grad_norm": 160.1624298095703, "learning_rate": 7.32921205909661e-10, "loss": 17.1523, "step": 900 }, { "epoch": 0.3694306302648939, "grad_norm": 200.31753540039062, "learning_rate": 7.270831225842692e-10, "loss": 17.6586, "step": 910 }, { "epoch": 0.3734903075205521, "grad_norm": 158.079833984375, "learning_rate": 7.212057706022059e-10, "loss": 17.1793, "step": 920 }, { "epoch": 0.37754998477621027, "grad_norm": 224.93112182617188, "learning_rate": 7.152901663125267e-10, "loss": 18.1676, "step": 930 }, { "epoch": 0.38160966203186847, "grad_norm": 199.39297485351562, "learning_rate": 7.09337332679119e-10, "loss": 15.8113, "step": 940 }, { "epoch": 0.38566933928752667, "grad_norm": 202.5852508544922, "learning_rate": 7.033482991038051e-10, "loss": 17.3973, "step": 950 }, { "epoch": 0.3897290165431848, "grad_norm": 206.29861450195312, "learning_rate": 6.97324101248331e-10, "loss": 16.953, "step": 960 }, { "epoch": 0.393788693798843, "grad_norm": 140.68646240234375, "learning_rate": 6.91265780855274e-10, "loss": 17.5197, "step": 970 }, { "epoch": 0.39784837105450116, "grad_norm": 191.14852905273438, "learning_rate": 6.851743855678965e-10, "loss": 17.6989, "step": 980 }, { "epoch": 0.40190804831015936, "grad_norm": 152.35377502441406, "learning_rate": 6.79050968748983e-10, "loss": 17.5127, "step": 990 }, { "epoch": 0.4059677255658175, "grad_norm": 181.50877380371094, "learning_rate": 6.728965892986838e-10, "loss": 16.8963, "step": 1000 }, { "epoch": 0.4100274028214757, "grad_norm": 192.32125854492188, "learning_rate": 6.667123114714048e-10, "loss": 17.2991, "step": 1010 }, { "epoch": 0.41408708007713385, "grad_norm": 202.2693634033203, "learning_rate": 6.604992046917688e-10, "loss": 16.8996, "step": 1020 }, { "epoch": 0.41814675733279205, "grad_norm": 151.45115661621094, "learning_rate": 6.542583433696846e-10, "loss": 16.8886, "step": 1030 }, { "epoch": 0.4222064345884502, "grad_norm": 157.8872528076172, "learning_rate": 6.479908067145527e-10, "loss": 17.0116, "step": 1040 }, { "epoch": 0.4262661118441084, "grad_norm": 228.60235595703125, "learning_rate": 6.416976785486416e-10, "loss": 17.6079, "step": 1050 }, { "epoch": 0.43032578909976654, "grad_norm": 219.45249938964844, "learning_rate": 6.353800471196667e-10, "loss": 16.9453, "step": 1060 }, { "epoch": 0.43438546635542474, "grad_norm": 164.8721923828125, "learning_rate": 6.290390049126031e-10, "loss": 17.2325, "step": 1070 }, { "epoch": 0.43844514361108294, "grad_norm": 184.8201904296875, "learning_rate": 6.226756484607668e-10, "loss": 17.1532, "step": 1080 }, { "epoch": 0.4425048208667411, "grad_norm": 187.04025268554688, "learning_rate": 6.162910781561946e-10, "loss": 16.4238, "step": 1090 }, { "epoch": 0.4465644981223993, "grad_norm": 200.2959747314453, "learning_rate": 6.098863980593574e-10, "loss": 18.0924, "step": 1100 }, { "epoch": 0.45062417537805743, "grad_norm": 214.22193908691406, "learning_rate": 6.034627157082394e-10, "loss": 17.5339, "step": 1110 }, { "epoch": 0.45468385263371563, "grad_norm": 219.8036651611328, "learning_rate": 5.970211419268152e-10, "loss": 17.7163, "step": 1120 }, { "epoch": 0.4587435298893738, "grad_norm": 177.9528045654297, "learning_rate": 5.905627906329592e-10, "loss": 17.277, "step": 1130 }, { "epoch": 0.462803207145032, "grad_norm": 181.62625122070312, "learning_rate": 5.840887786458205e-10, "loss": 17.0171, "step": 1140 }, { "epoch": 0.4668628844006901, "grad_norm": 212.0501251220703, "learning_rate": 5.776002254926935e-10, "loss": 17.2654, "step": 1150 }, { "epoch": 0.4709225616563483, "grad_norm": 185.97579956054688, "learning_rate": 5.710982532154247e-10, "loss": 17.6895, "step": 1160 }, { "epoch": 0.47498223891200647, "grad_norm": 232.2166748046875, "learning_rate": 5.645839861763805e-10, "loss": 18.0333, "step": 1170 }, { "epoch": 0.47904191616766467, "grad_norm": 176.52072143554688, "learning_rate": 5.580585508640152e-10, "loss": 16.8448, "step": 1180 }, { "epoch": 0.4831015934233229, "grad_norm": 189.46929931640625, "learning_rate": 5.515230756980719e-10, "loss": 17.2395, "step": 1190 }, { "epoch": 0.487161270678981, "grad_norm": 206.33079528808594, "learning_rate": 5.449786908344499e-10, "loss": 16.9241, "step": 1200 }, { "epoch": 0.4912209479346392, "grad_norm": 186.9293670654297, "learning_rate": 5.384265279697689e-10, "loss": 16.7443, "step": 1210 }, { "epoch": 0.49528062519029736, "grad_norm": 170.4814453125, "learning_rate": 5.318677201456708e-10, "loss": 16.6439, "step": 1220 }, { "epoch": 0.49934030244595556, "grad_norm": 181.9535675048828, "learning_rate": 5.253034015528856e-10, "loss": 16.3063, "step": 1230 }, { "epoch": 0.5033999797016138, "grad_norm": 181.1636505126953, "learning_rate": 5.187347073351006e-10, "loss": 17.3231, "step": 1240 }, { "epoch": 0.5074596569572719, "grad_norm": 186.47972106933594, "learning_rate": 5.121627733926641e-10, "loss": 17.0968, "step": 1250 }, { "epoch": 0.51151933421293, "grad_norm": 194.881591796875, "learning_rate": 5.055887361861582e-10, "loss": 18.201, "step": 1260 }, { "epoch": 0.5155790114685882, "grad_norm": 199.1874237060547, "learning_rate": 4.990137325398745e-10, "loss": 16.7817, "step": 1270 }, { "epoch": 0.5196386887242465, "grad_norm": 203.87411499023438, "learning_rate": 4.924388994452276e-10, "loss": 17.371, "step": 1280 }, { "epoch": 0.5236983659799046, "grad_norm": 177.25927734375, "learning_rate": 4.858653738641395e-10, "loss": 16.6596, "step": 1290 }, { "epoch": 0.5277580432355627, "grad_norm": 161.23329162597656, "learning_rate": 4.792942925324285e-10, "loss": 17.0887, "step": 1300 }, { "epoch": 0.531817720491221, "grad_norm": 188.26792907714844, "learning_rate": 4.727267917632377e-10, "loss": 17.4645, "step": 1310 }, { "epoch": 0.5358773977468791, "grad_norm": 204.50733947753906, "learning_rate": 4.661640072505365e-10, "loss": 17.5325, "step": 1320 }, { "epoch": 0.5399370750025373, "grad_norm": 180.15682983398438, "learning_rate": 4.5960707387272904e-10, "loss": 17.7173, "step": 1330 }, { "epoch": 0.5439967522581954, "grad_norm": 195.1600341796875, "learning_rate": 4.5305712549640504e-10, "loss": 16.8578, "step": 1340 }, { "epoch": 0.5480564295138537, "grad_norm": 201.564208984375, "learning_rate": 4.4651529478026227e-10, "loss": 17.7686, "step": 1350 }, { "epoch": 0.5521161067695118, "grad_norm": 207.5132293701172, "learning_rate": 4.3998271297924156e-10, "loss": 16.9821, "step": 1360 }, { "epoch": 0.55617578402517, "grad_norm": 243.6310272216797, "learning_rate": 4.3346050974890247e-10, "loss": 17.9338, "step": 1370 }, { "epoch": 0.5602354612808281, "grad_norm": 169.40707397460938, "learning_rate": 4.269498129500762e-10, "loss": 16.6915, "step": 1380 }, { "epoch": 0.5642951385364864, "grad_norm": 209.2589569091797, "learning_rate": 4.2045174845382885e-10, "loss": 17.3758, "step": 1390 }, { "epoch": 0.5683548157921445, "grad_norm": 171.83935546875, "learning_rate": 4.139674399467684e-10, "loss": 16.4755, "step": 1400 }, { "epoch": 0.5724144930478027, "grad_norm": 206.6162109375, "learning_rate": 4.074980087367294e-10, "loss": 17.9797, "step": 1410 }, { "epoch": 0.5764741703034608, "grad_norm": 173.0574951171875, "learning_rate": 4.010445735588702e-10, "loss": 16.503, "step": 1420 }, { "epoch": 0.5805338475591191, "grad_norm": 206.66969299316406, "learning_rate": 3.946082503822132e-10, "loss": 17.5007, "step": 1430 }, { "epoch": 0.5845935248147772, "grad_norm": 225.87709045410156, "learning_rate": 3.881901522166649e-10, "loss": 17.5912, "step": 1440 }, { "epoch": 0.5886532020704354, "grad_norm": 180.4112091064453, "learning_rate": 3.817913889205473e-10, "loss": 17.6061, "step": 1450 }, { "epoch": 0.5927128793260936, "grad_norm": 128.54103088378906, "learning_rate": 3.7541306700867386e-10, "loss": 16.0483, "step": 1460 }, { "epoch": 0.5967725565817518, "grad_norm": 181.9798126220703, "learning_rate": 3.6905628946100346e-10, "loss": 16.802, "step": 1470 }, { "epoch": 0.6008322338374099, "grad_norm": 149.59954833984375, "learning_rate": 3.6272215553190727e-10, "loss": 16.2398, "step": 1480 }, { "epoch": 0.6048919110930681, "grad_norm": 170.98695373535156, "learning_rate": 3.564117605600774e-10, "loss": 16.2826, "step": 1490 }, { "epoch": 0.6089515883487263, "grad_norm": 170.1161651611328, "learning_rate": 3.5012619577911544e-10, "loss": 17.1219, "step": 1500 }, { "epoch": 0.6130112656043845, "grad_norm": 197.9274139404297, "learning_rate": 3.438665481288278e-10, "loss": 16.7303, "step": 1510 }, { "epoch": 0.6170709428600426, "grad_norm": 187.0927276611328, "learning_rate": 3.376339000672664e-10, "loss": 17.0052, "step": 1520 }, { "epoch": 0.6211306201157007, "grad_norm": 161.8428497314453, "learning_rate": 3.3142932938354233e-10, "loss": 16.2225, "step": 1530 }, { "epoch": 0.625190297371359, "grad_norm": 198.08689880371094, "learning_rate": 3.252539090114484e-10, "loss": 17.4928, "step": 1540 }, { "epoch": 0.6292499746270172, "grad_norm": 165.53260803222656, "learning_rate": 3.1910870684392023e-10, "loss": 17.0441, "step": 1550 }, { "epoch": 0.6333096518826753, "grad_norm": 153.26893615722656, "learning_rate": 3.1299478554836934e-10, "loss": 16.6345, "step": 1560 }, { "epoch": 0.6373693291383336, "grad_norm": 166.07655334472656, "learning_rate": 3.069132023829202e-10, "loss": 16.7557, "step": 1570 }, { "epoch": 0.6414290063939917, "grad_norm": 206.7568817138672, "learning_rate": 3.0086500901358233e-10, "loss": 17.2537, "step": 1580 }, { "epoch": 0.6454886836496498, "grad_norm": 187.02734375, "learning_rate": 2.94851251332389e-10, "loss": 16.7615, "step": 1590 }, { "epoch": 0.649548360905308, "grad_norm": 183.49896240234375, "learning_rate": 2.888729692765365e-10, "loss": 17.6427, "step": 1600 }, { "epoch": 0.6536080381609662, "grad_norm": 201.25961303710938, "learning_rate": 2.8293119664854974e-10, "loss": 16.8277, "step": 1610 }, { "epoch": 0.6576677154166244, "grad_norm": 156.2751922607422, "learning_rate": 2.770269609375114e-10, "loss": 17.5363, "step": 1620 }, { "epoch": 0.6617273926722825, "grad_norm": 205.1450958251953, "learning_rate": 2.71161283141382e-10, "loss": 18.4642, "step": 1630 }, { "epoch": 0.6657870699279407, "grad_norm": 157.36001586914062, "learning_rate": 2.653351775904427e-10, "loss": 17.0324, "step": 1640 }, { "epoch": 0.6698467471835989, "grad_norm": 197.63540649414062, "learning_rate": 2.5954965177189e-10, "loss": 17.0267, "step": 1650 }, { "epoch": 0.6739064244392571, "grad_norm": 163.759033203125, "learning_rate": 2.5380570615561564e-10, "loss": 17.2452, "step": 1660 }, { "epoch": 0.6779661016949152, "grad_norm": 161.09716796875, "learning_rate": 2.481043340211986e-10, "loss": 17.429, "step": 1670 }, { "epoch": 0.6820257789505735, "grad_norm": 208.68508911132812, "learning_rate": 2.4244652128614036e-10, "loss": 17.7347, "step": 1680 }, { "epoch": 0.6860854562062316, "grad_norm": 154.68821716308594, "learning_rate": 2.3683324633537435e-10, "loss": 16.7167, "step": 1690 }, { "epoch": 0.6901451334618898, "grad_norm": 186.65939331054688, "learning_rate": 2.3126547985207759e-10, "loss": 17.0754, "step": 1700 }, { "epoch": 0.6942048107175479, "grad_norm": 155.3890838623047, "learning_rate": 2.2574418464981368e-10, "loss": 17.0158, "step": 1710 }, { "epoch": 0.6982644879732062, "grad_norm": 175.33834838867188, "learning_rate": 2.2027031550603654e-10, "loss": 17.5807, "step": 1720 }, { "epoch": 0.7023241652288643, "grad_norm": 200.68499755859375, "learning_rate": 2.148448189969854e-10, "loss": 15.5709, "step": 1730 }, { "epoch": 0.7063838424845225, "grad_norm": 186.82205200195312, "learning_rate": 2.094686333339953e-10, "loss": 16.648, "step": 1740 }, { "epoch": 0.7104435197401806, "grad_norm": 187.7284698486328, "learning_rate": 2.0414268820125654e-10, "loss": 17.0848, "step": 1750 }, { "epoch": 0.7145031969958389, "grad_norm": 170.2503662109375, "learning_rate": 1.9886790459504857e-10, "loss": 16.8571, "step": 1760 }, { "epoch": 0.718562874251497, "grad_norm": 176.3491668701172, "learning_rate": 1.9364519466447346e-10, "loss": 16.7827, "step": 1770 }, { "epoch": 0.7226225515071552, "grad_norm": 167.1256866455078, "learning_rate": 1.8847546155372252e-10, "loss": 16.8153, "step": 1780 }, { "epoch": 0.7266822287628134, "grad_norm": 187.24716186523438, "learning_rate": 1.8335959924589935e-10, "loss": 17.8325, "step": 1790 }, { "epoch": 0.7307419060184716, "grad_norm": 216.55247497558594, "learning_rate": 1.7829849240842516e-10, "loss": 17.5121, "step": 1800 }, { "epoch": 0.7348015832741297, "grad_norm": 200.8616180419922, "learning_rate": 1.732930162400579e-10, "loss": 16.8064, "step": 1810 }, { "epoch": 0.7388612605297878, "grad_norm": 183.3948516845703, "learning_rate": 1.6834403631954642e-10, "loss": 17.0833, "step": 1820 }, { "epoch": 0.7429209377854461, "grad_norm": 166.15834045410156, "learning_rate": 1.6345240845594933e-10, "loss": 17.7809, "step": 1830 }, { "epoch": 0.7469806150411042, "grad_norm": 165.8581085205078, "learning_rate": 1.586189785406429e-10, "loss": 17.0209, "step": 1840 }, { "epoch": 0.7510402922967624, "grad_norm": 212.00918579101562, "learning_rate": 1.5384458240104482e-10, "loss": 17.0343, "step": 1850 }, { "epoch": 0.7550999695524205, "grad_norm": 185.42967224121094, "learning_rate": 1.4913004565607665e-10, "loss": 16.6158, "step": 1860 }, { "epoch": 0.7591596468080788, "grad_norm": 195.2454071044922, "learning_rate": 1.4447618357339333e-10, "loss": 16.4979, "step": 1870 }, { "epoch": 0.7632193240637369, "grad_norm": 214.2625274658203, "learning_rate": 1.398838009284016e-10, "loss": 16.691, "step": 1880 }, { "epoch": 0.7672790013193951, "grad_norm": 181.38255310058594, "learning_rate": 1.3535369186509296e-10, "loss": 16.9062, "step": 1890 }, { "epoch": 0.7713386785750533, "grad_norm": 145.416748046875, "learning_rate": 1.308866397587153e-10, "loss": 17.7773, "step": 1900 }, { "epoch": 0.7753983558307115, "grad_norm": 216.9072723388672, "learning_rate": 1.264834170803072e-10, "loss": 16.9568, "step": 1910 }, { "epoch": 0.7794580330863696, "grad_norm": 200.9578094482422, "learning_rate": 1.2214478526311674e-10, "loss": 17.5622, "step": 1920 }, { "epoch": 0.7835177103420278, "grad_norm": 162.93185424804688, "learning_rate": 1.1787149457092962e-10, "loss": 16.9736, "step": 1930 }, { "epoch": 0.787577387597686, "grad_norm": 189.59182739257812, "learning_rate": 1.1366428396832929e-10, "loss": 15.8744, "step": 1940 }, { "epoch": 0.7916370648533442, "grad_norm": 188.7930450439453, "learning_rate": 1.0952388099290983e-10, "loss": 17.6766, "step": 1950 }, { "epoch": 0.7956967421090023, "grad_norm": 134.07015991210938, "learning_rate": 1.0545100162946586e-10, "loss": 16.6428, "step": 1960 }, { "epoch": 0.7997564193646605, "grad_norm": 142.8442840576172, "learning_rate": 1.0144635018618054e-10, "loss": 17.4065, "step": 1970 }, { "epoch": 0.8038160966203187, "grad_norm": 182.24485778808594, "learning_rate": 9.751061917283073e-11, "loss": 17.2971, "step": 1980 }, { "epoch": 0.8078757738759769, "grad_norm": 194.0862274169922, "learning_rate": 9.364448918103474e-11, "loss": 17.2544, "step": 1990 }, { "epoch": 0.811935451131635, "grad_norm": 191.11993408203125, "learning_rate": 8.984862876656026e-11, "loss": 17.1763, "step": 2000 }, { "epoch": 0.8159951283872932, "grad_norm": 188.06570434570312, "learning_rate": 8.612369433371265e-11, "loss": 16.6179, "step": 2010 }, { "epoch": 0.8200548056429514, "grad_norm": 167.46762084960938, "learning_rate": 8.247033002182614e-11, "loss": 16.6814, "step": 2020 }, { "epoch": 0.8241144828986096, "grad_norm": 159.38058471679688, "learning_rate": 7.888916759387471e-11, "loss": 16.5084, "step": 2030 }, { "epoch": 0.8281741601542677, "grad_norm": 172.08058166503906, "learning_rate": 7.538082632722371e-11, "loss": 17.3695, "step": 2040 }, { "epoch": 0.832233837409926, "grad_norm": 222.47935485839844, "learning_rate": 7.194591290654024e-11, "loss": 16.9923, "step": 2050 }, { "epoch": 0.8362935146655841, "grad_norm": 189.6218719482422, "learning_rate": 6.858502131888211e-11, "loss": 17.5893, "step": 2060 }, { "epoch": 0.8403531919212422, "grad_norm": 244.06753540039062, "learning_rate": 6.52987327509812e-11, "loss": 17.5454, "step": 2070 }, { "epoch": 0.8444128691769004, "grad_norm": 137.7332305908203, "learning_rate": 6.208761548874082e-11, "loss": 17.2953, "step": 2080 }, { "epoch": 0.8484725464325586, "grad_norm": 201.9289093017578, "learning_rate": 5.895222481896489e-11, "loss": 17.7196, "step": 2090 }, { "epoch": 0.8525322236882168, "grad_norm": 172.56558227539062, "learning_rate": 5.5893102933333277e-11, "loss": 17.0008, "step": 2100 }, { "epoch": 0.8565919009438749, "grad_norm": 173.23507690429688, "learning_rate": 5.291077883464307e-11, "loss": 16.3006, "step": 2110 }, { "epoch": 0.8606515781995331, "grad_norm": 200.89015197753906, "learning_rate": 5.0005768245330264e-11, "loss": 17.5656, "step": 2120 }, { "epoch": 0.8647112554551913, "grad_norm": 191.20590209960938, "learning_rate": 4.717857351828731e-11, "loss": 17.3456, "step": 2130 }, { "epoch": 0.8687709327108495, "grad_norm": 166.0770263671875, "learning_rate": 4.4429683549993106e-11, "loss": 17.2893, "step": 2140 }, { "epoch": 0.8728306099665076, "grad_norm": 172.1090850830078, "learning_rate": 4.175957369597039e-11, "loss": 16.464, "step": 2150 }, { "epoch": 0.8768902872221659, "grad_norm": 170.6510009765625, "learning_rate": 3.9168705688583555e-11, "loss": 17.662, "step": 2160 }, { "epoch": 0.880949964477824, "grad_norm": 201.04290771484375, "learning_rate": 3.665752755719332e-11, "loss": 17.4915, "step": 2170 }, { "epoch": 0.8850096417334822, "grad_norm": 194.7832794189453, "learning_rate": 3.422647355068076e-11, "loss": 18.2301, "step": 2180 }, { "epoch": 0.8890693189891403, "grad_norm": 181.28720092773438, "learning_rate": 3.187596406235421e-11, "loss": 17.7734, "step": 2190 }, { "epoch": 0.8931289962447986, "grad_norm": 152.69996643066406, "learning_rate": 2.9606405557251637e-11, "loss": 16.8411, "step": 2200 }, { "epoch": 0.8971886735004567, "grad_norm": 212.84933471679688, "learning_rate": 2.7418190501853014e-11, "loss": 17.3207, "step": 2210 }, { "epoch": 0.9012483507561149, "grad_norm": 144.8594207763672, "learning_rate": 2.5311697296211634e-11, "loss": 16.9442, "step": 2220 }, { "epoch": 0.905308028011773, "grad_norm": 232.24757385253906, "learning_rate": 2.328729020851961e-11, "loss": 18.1509, "step": 2230 }, { "epoch": 0.9093677052674313, "grad_norm": 180.50587463378906, "learning_rate": 2.134531931211542e-11, "loss": 16.2897, "step": 2240 }, { "epoch": 0.9134273825230894, "grad_norm": 176.4561004638672, "learning_rate": 1.9486120424947908e-11, "loss": 17.3459, "step": 2250 }, { "epoch": 0.9174870597787476, "grad_norm": 177.8277130126953, "learning_rate": 1.771001505150366e-11, "loss": 16.3936, "step": 2260 }, { "epoch": 0.9215467370344058, "grad_norm": 189.9925994873047, "learning_rate": 1.6017310327211155e-11, "loss": 17.3137, "step": 2270 }, { "epoch": 0.925606414290064, "grad_norm": 212.03208923339844, "learning_rate": 1.4408298965328472e-11, "loss": 17.907, "step": 2280 }, { "epoch": 0.9296660915457221, "grad_norm": 157.50892639160156, "learning_rate": 1.2883259206325493e-11, "loss": 16.9568, "step": 2290 }, { "epoch": 0.9337257688013803, "grad_norm": 184.62356567382812, "learning_rate": 1.1442454769769017e-11, "loss": 18.1454, "step": 2300 }, { "epoch": 0.9377854460570385, "grad_norm": 206.690185546875, "learning_rate": 1.0086134808718562e-11, "loss": 18.086, "step": 2310 }, { "epoch": 0.9418451233126967, "grad_norm": 171.5003662109375, "learning_rate": 8.814533866641106e-12, "loss": 18.0924, "step": 2320 }, { "epoch": 0.9459048005683548, "grad_norm": 154.81707763671875, "learning_rate": 7.627871836852652e-12, "loss": 16.5896, "step": 2330 }, { "epoch": 0.9499644778240129, "grad_norm": 193.352783203125, "learning_rate": 6.52635392449269e-12, "loss": 18.0086, "step": 2340 }, { "epoch": 0.9540241550796712, "grad_norm": 182.68508911132812, "learning_rate": 5.510170611038701e-12, "loss": 17.6251, "step": 2350 }, { "epoch": 0.9580838323353293, "grad_norm": 220.1875762939453, "learning_rate": 4.579497621367057e-12, "loss": 18.5577, "step": 2360 }, { "epoch": 0.9621435095909875, "grad_norm": 193.38424682617188, "learning_rate": 3.734495893365664e-12, "loss": 18.0829, "step": 2370 }, { "epoch": 0.9662031868466457, "grad_norm": 162.79934692382812, "learning_rate": 2.9753115501032213e-12, "loss": 17.6267, "step": 2380 }, { "epoch": 0.9702628641023039, "grad_norm": 185.84449768066406, "learning_rate": 2.3020758745610493e-12, "loss": 17.418, "step": 2390 }, { "epoch": 0.974322541357962, "grad_norm": 176.00831604003906, "learning_rate": 1.7149052869305794e-12, "loss": 17.5759, "step": 2400 }, { "epoch": 0.9783822186136202, "grad_norm": 163.9122772216797, "learning_rate": 1.2139013244812924e-12, "loss": 17.8926, "step": 2410 }, { "epoch": 0.9824418958692784, "grad_norm": 172.8951416015625, "learning_rate": 7.991506240022095e-13, "loss": 17.5553, "step": 2420 }, { "epoch": 0.9865015731249366, "grad_norm": 186.85015869140625, "learning_rate": 4.70724906820208e-13, "loss": 18.0582, "step": 2430 }, { "epoch": 0.9905612503805947, "grad_norm": 203.409423828125, "learning_rate": 2.286809663974987e-13, "loss": 18.464, "step": 2440 }, { "epoch": 0.9946209276362529, "grad_norm": 192.5825653076172, "learning_rate": 7.306065851042654e-14, "loss": 17.8112, "step": 2450 }, { "epoch": 0.9986806048919111, "grad_norm": 174.4022674560547, "learning_rate": 3.890894011593371e-15, "loss": 17.7037, "step": 2460 }, { "epoch": 0.9998985080686086, "step": 2463, "total_flos": 0.0, "train_loss": 17.218061584212457, "train_runtime": 5138.9594, "train_samples_per_second": 11.504, "train_steps_per_second": 0.479 } ], "logging_steps": 10, "max_steps": 2463, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }