{ "best_metric": 0.7931236608249025, "best_model_checkpoint": "./models/checkpoint-1500", "epoch": 1.0053619302949062, "eval_steps": 500, "global_step": 1500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003351206434316354, "grad_norm": 248.0, "learning_rate": 5e-06, "loss": 12.9737, "step": 5 }, { "epoch": 0.006702412868632708, "grad_norm": 205.0, "learning_rate": 1e-05, "loss": 12.2905, "step": 10 }, { "epoch": 0.010053619302949061, "grad_norm": 78.5, "learning_rate": 1.5e-05, "loss": 10.8747, "step": 15 }, { "epoch": 0.013404825737265416, "grad_norm": 45.25, "learning_rate": 2e-05, "loss": 9.7048, "step": 20 }, { "epoch": 0.01675603217158177, "grad_norm": 35.0, "learning_rate": 2.5e-05, "loss": 8.8715, "step": 25 }, { "epoch": 0.020107238605898123, "grad_norm": 30.0, "learning_rate": 3e-05, "loss": 8.1936, "step": 30 }, { "epoch": 0.023458445040214475, "grad_norm": 29.5, "learning_rate": 3.5e-05, "loss": 7.5312, "step": 35 }, { "epoch": 0.02680965147453083, "grad_norm": 35.75, "learning_rate": 4e-05, "loss": 6.9812, "step": 40 }, { "epoch": 0.030160857908847184, "grad_norm": 47.75, "learning_rate": 4.5e-05, "loss": 6.2436, "step": 45 }, { "epoch": 0.03351206434316354, "grad_norm": 25.125, "learning_rate": 5e-05, "loss": 5.7614, "step": 50 }, { "epoch": 0.03686327077747989, "grad_norm": 19.625, "learning_rate": 5.500000000000001e-05, "loss": 5.6177, "step": 55 }, { "epoch": 0.040214477211796246, "grad_norm": 13.5625, "learning_rate": 6e-05, "loss": 5.3004, "step": 60 }, { "epoch": 0.0435656836461126, "grad_norm": 21.375, "learning_rate": 6.500000000000001e-05, "loss": 4.9812, "step": 65 }, { "epoch": 0.04691689008042895, "grad_norm": 14.6875, "learning_rate": 7e-05, "loss": 4.827, "step": 70 }, { "epoch": 0.05026809651474531, "grad_norm": 15.6875, "learning_rate": 7.500000000000001e-05, "loss": 4.4671, "step": 75 }, { "epoch": 0.05361930294906166, "grad_norm": 13.4375, "learning_rate": 8e-05, "loss": 4.5079, "step": 80 }, { "epoch": 0.05697050938337802, "grad_norm": 15.8125, "learning_rate": 8.5e-05, "loss": 4.2763, "step": 85 }, { "epoch": 0.06032171581769437, "grad_norm": 18.625, "learning_rate": 9e-05, "loss": 4.1637, "step": 90 }, { "epoch": 0.06367292225201072, "grad_norm": 15.125, "learning_rate": 9.5e-05, "loss": 4.0087, "step": 95 }, { "epoch": 0.06702412868632708, "grad_norm": 12.875, "learning_rate": 0.0001, "loss": 3.9387, "step": 100 }, { "epoch": 0.07037533512064344, "grad_norm": 14.0, "learning_rate": 0.000105, "loss": 3.9187, "step": 105 }, { "epoch": 0.07372654155495978, "grad_norm": 12.0, "learning_rate": 0.00011000000000000002, "loss": 3.808, "step": 110 }, { "epoch": 0.07707774798927614, "grad_norm": 12.1875, "learning_rate": 0.00011499999999999999, "loss": 3.6024, "step": 115 }, { "epoch": 0.08042895442359249, "grad_norm": 11.4375, "learning_rate": 0.00012, "loss": 3.6723, "step": 120 }, { "epoch": 0.08378016085790885, "grad_norm": 11.5625, "learning_rate": 0.000125, "loss": 3.5476, "step": 125 }, { "epoch": 0.0871313672922252, "grad_norm": 11.0, "learning_rate": 0.00013000000000000002, "loss": 3.4838, "step": 130 }, { "epoch": 0.09048257372654156, "grad_norm": 10.375, "learning_rate": 0.00013500000000000003, "loss": 3.5531, "step": 135 }, { "epoch": 0.0938337801608579, "grad_norm": 11.75, "learning_rate": 0.00014, "loss": 3.4721, "step": 140 }, { "epoch": 0.09718498659517426, "grad_norm": 11.75, "learning_rate": 0.000145, "loss": 3.4215, "step": 145 }, { "epoch": 0.10053619302949061, "grad_norm": 11.5, "learning_rate": 0.00015000000000000001, "loss": 3.5449, "step": 150 }, { "epoch": 0.10388739946380697, "grad_norm": 19.0, "learning_rate": 0.000155, "loss": 3.3309, "step": 155 }, { "epoch": 0.10723860589812333, "grad_norm": 10.0, "learning_rate": 0.00016, "loss": 3.3066, "step": 160 }, { "epoch": 0.11058981233243968, "grad_norm": 13.25, "learning_rate": 0.000165, "loss": 3.3844, "step": 165 }, { "epoch": 0.11394101876675604, "grad_norm": 10.5625, "learning_rate": 0.00017, "loss": 3.2758, "step": 170 }, { "epoch": 0.11729222520107238, "grad_norm": 10.75, "learning_rate": 0.000175, "loss": 3.4125, "step": 175 }, { "epoch": 0.12064343163538874, "grad_norm": 10.0, "learning_rate": 0.00018, "loss": 3.0847, "step": 180 }, { "epoch": 0.1239946380697051, "grad_norm": 23.5, "learning_rate": 0.00018500000000000002, "loss": 3.1467, "step": 185 }, { "epoch": 0.12734584450402145, "grad_norm": 8.0625, "learning_rate": 0.00019, "loss": 3.1672, "step": 190 }, { "epoch": 0.1306970509383378, "grad_norm": 7.21875, "learning_rate": 0.000195, "loss": 3.2039, "step": 195 }, { "epoch": 0.13404825737265416, "grad_norm": 6.71875, "learning_rate": 0.0002, "loss": 3.2238, "step": 200 }, { "epoch": 0.13739946380697052, "grad_norm": 8.0625, "learning_rate": 0.00019999619230641713, "loss": 3.1889, "step": 205 }, { "epoch": 0.14075067024128687, "grad_norm": 13.4375, "learning_rate": 0.00019998476951563915, "loss": 3.1551, "step": 210 }, { "epoch": 0.14410187667560323, "grad_norm": 11.0, "learning_rate": 0.00019996573249755572, "loss": 3.1229, "step": 215 }, { "epoch": 0.14745308310991956, "grad_norm": 8.375, "learning_rate": 0.0001999390827019096, "loss": 3.0822, "step": 220 }, { "epoch": 0.15080428954423591, "grad_norm": 7.875, "learning_rate": 0.0001999048221581858, "loss": 2.9755, "step": 225 }, { "epoch": 0.15415549597855227, "grad_norm": 8.5, "learning_rate": 0.0001998629534754574, "loss": 3.0043, "step": 230 }, { "epoch": 0.15750670241286863, "grad_norm": 9.0625, "learning_rate": 0.0001998134798421867, "loss": 2.9992, "step": 235 }, { "epoch": 0.16085790884718498, "grad_norm": 13.4375, "learning_rate": 0.00019975640502598244, "loss": 3.0183, "step": 240 }, { "epoch": 0.16420911528150134, "grad_norm": 10.875, "learning_rate": 0.0001996917333733128, "loss": 3.004, "step": 245 }, { "epoch": 0.1675603217158177, "grad_norm": 7.75, "learning_rate": 0.00019961946980917456, "loss": 3.0863, "step": 250 }, { "epoch": 0.17091152815013405, "grad_norm": 7.71875, "learning_rate": 0.00019953961983671788, "loss": 2.7425, "step": 255 }, { "epoch": 0.1742627345844504, "grad_norm": 9.5625, "learning_rate": 0.00019945218953682734, "loss": 3.0072, "step": 260 }, { "epoch": 0.17761394101876676, "grad_norm": 7.96875, "learning_rate": 0.00019935718556765876, "loss": 2.9687, "step": 265 }, { "epoch": 0.18096514745308312, "grad_norm": 8.8125, "learning_rate": 0.00019925461516413223, "loss": 2.9539, "step": 270 }, { "epoch": 0.18431635388739948, "grad_norm": 7.46875, "learning_rate": 0.00019914448613738106, "loss": 2.8877, "step": 275 }, { "epoch": 0.1876675603217158, "grad_norm": 5.65625, "learning_rate": 0.00019902680687415705, "loss": 2.8264, "step": 280 }, { "epoch": 0.19101876675603216, "grad_norm": 6.375, "learning_rate": 0.0001989015863361917, "loss": 2.8201, "step": 285 }, { "epoch": 0.19436997319034852, "grad_norm": 6.09375, "learning_rate": 0.00019876883405951377, "loss": 2.8141, "step": 290 }, { "epoch": 0.19772117962466487, "grad_norm": 7.0, "learning_rate": 0.00019862856015372317, "loss": 2.9018, "step": 295 }, { "epoch": 0.20107238605898123, "grad_norm": 6.34375, "learning_rate": 0.00019848077530122083, "loss": 2.8597, "step": 300 }, { "epoch": 0.20442359249329758, "grad_norm": 8.9375, "learning_rate": 0.0001983254907563955, "loss": 2.8288, "step": 305 }, { "epoch": 0.20777479892761394, "grad_norm": 21.375, "learning_rate": 0.00019816271834476642, "loss": 2.8927, "step": 310 }, { "epoch": 0.2111260053619303, "grad_norm": 9.9375, "learning_rate": 0.00019799247046208297, "loss": 2.6922, "step": 315 }, { "epoch": 0.21447721179624665, "grad_norm": 6.21875, "learning_rate": 0.00019781476007338058, "loss": 2.9288, "step": 320 }, { "epoch": 0.217828418230563, "grad_norm": 4.84375, "learning_rate": 0.00019762960071199333, "loss": 2.7749, "step": 325 }, { "epoch": 0.22117962466487937, "grad_norm": 6.28125, "learning_rate": 0.00019743700647852354, "loss": 2.725, "step": 330 }, { "epoch": 0.22453083109919572, "grad_norm": 11.8125, "learning_rate": 0.00019723699203976766, "loss": 2.9425, "step": 335 }, { "epoch": 0.22788203753351208, "grad_norm": 6.9375, "learning_rate": 0.00019702957262759965, "loss": 2.744, "step": 340 }, { "epoch": 0.2312332439678284, "grad_norm": 5.0625, "learning_rate": 0.0001968147640378108, "loss": 2.6849, "step": 345 }, { "epoch": 0.23458445040214476, "grad_norm": 5.9375, "learning_rate": 0.00019659258262890683, "loss": 2.9359, "step": 350 }, { "epoch": 0.23793565683646112, "grad_norm": 8.0, "learning_rate": 0.0001963630453208623, "loss": 2.7998, "step": 355 }, { "epoch": 0.24128686327077747, "grad_norm": 4.4375, "learning_rate": 0.0001961261695938319, "loss": 2.6601, "step": 360 }, { "epoch": 0.24463806970509383, "grad_norm": 5.1875, "learning_rate": 0.0001958819734868193, "loss": 2.6936, "step": 365 }, { "epoch": 0.2479892761394102, "grad_norm": 5.25, "learning_rate": 0.00019563047559630357, "loss": 2.7705, "step": 370 }, { "epoch": 0.25134048257372654, "grad_norm": 5.96875, "learning_rate": 0.0001953716950748227, "loss": 2.8504, "step": 375 }, { "epoch": 0.2546916890080429, "grad_norm": 5.78125, "learning_rate": 0.00019510565162951537, "loss": 2.5577, "step": 380 }, { "epoch": 0.25804289544235925, "grad_norm": 5.5, "learning_rate": 0.00019483236552061994, "loss": 2.7717, "step": 385 }, { "epoch": 0.2613941018766756, "grad_norm": 4.8125, "learning_rate": 0.0001945518575599317, "loss": 2.6139, "step": 390 }, { "epoch": 0.26474530831099197, "grad_norm": 60.75, "learning_rate": 0.00019426414910921787, "loss": 2.7684, "step": 395 }, { "epoch": 0.2680965147453083, "grad_norm": 4.53125, "learning_rate": 0.00019396926207859084, "loss": 2.6009, "step": 400 }, { "epoch": 0.2714477211796247, "grad_norm": 5.28125, "learning_rate": 0.00019366721892483978, "loss": 2.6487, "step": 405 }, { "epoch": 0.27479892761394104, "grad_norm": 4.625, "learning_rate": 0.00019335804264972018, "loss": 2.5787, "step": 410 }, { "epoch": 0.2781501340482574, "grad_norm": 4.78125, "learning_rate": 0.00019304175679820247, "loss": 2.5201, "step": 415 }, { "epoch": 0.28150134048257375, "grad_norm": 4.5, "learning_rate": 0.00019271838545667876, "loss": 2.575, "step": 420 }, { "epoch": 0.2848525469168901, "grad_norm": 10.4375, "learning_rate": 0.0001923879532511287, "loss": 2.4937, "step": 425 }, { "epoch": 0.28820375335120646, "grad_norm": 14.5625, "learning_rate": 0.00019205048534524406, "loss": 2.6559, "step": 430 }, { "epoch": 0.2915549597855228, "grad_norm": 4.28125, "learning_rate": 0.0001917060074385124, "loss": 2.529, "step": 435 }, { "epoch": 0.2949061662198391, "grad_norm": 4.125, "learning_rate": 0.0001913545457642601, "loss": 2.6327, "step": 440 }, { "epoch": 0.2982573726541555, "grad_norm": 4.96875, "learning_rate": 0.00019099612708765434, "loss": 2.6676, "step": 445 }, { "epoch": 0.30160857908847183, "grad_norm": 4.84375, "learning_rate": 0.000190630778703665, "loss": 2.531, "step": 450 }, { "epoch": 0.3049597855227882, "grad_norm": 4.40625, "learning_rate": 0.00019025852843498607, "loss": 2.586, "step": 455 }, { "epoch": 0.30831099195710454, "grad_norm": 4.1875, "learning_rate": 0.0001898794046299167, "loss": 2.6193, "step": 460 }, { "epoch": 0.3116621983914209, "grad_norm": 4.375, "learning_rate": 0.00018949343616020252, "loss": 2.5805, "step": 465 }, { "epoch": 0.31501340482573725, "grad_norm": 4.1875, "learning_rate": 0.0001891006524188368, "loss": 2.6252, "step": 470 }, { "epoch": 0.3183646112600536, "grad_norm": 3.9375, "learning_rate": 0.00018870108331782217, "loss": 2.5582, "step": 475 }, { "epoch": 0.32171581769436997, "grad_norm": 4.90625, "learning_rate": 0.00018829475928589271, "loss": 2.6244, "step": 480 }, { "epoch": 0.3250670241286863, "grad_norm": 31.375, "learning_rate": 0.00018788171126619653, "loss": 2.651, "step": 485 }, { "epoch": 0.3284182305630027, "grad_norm": 4.5, "learning_rate": 0.00018746197071393958, "loss": 2.5153, "step": 490 }, { "epoch": 0.33176943699731903, "grad_norm": 4.15625, "learning_rate": 0.00018703556959398998, "loss": 2.6416, "step": 495 }, { "epoch": 0.3351206434316354, "grad_norm": 4.90625, "learning_rate": 0.00018660254037844388, "loss": 2.5965, "step": 500 }, { "epoch": 0.3351206434316354, "eval_128_ap": 0.6946773245581791, "eval_128_auc": 0.9680845837539295, "eval_128_loss": 2.086740016937256, "eval_128_runtime": 19.4102, "eval_128_samples_per_second": 20.608, "eval_128_steps_per_second": 5.152, "step": 500 }, { "epoch": 0.33847184986595175, "grad_norm": 4.4375, "learning_rate": 0.00018616291604415258, "loss": 2.435, "step": 505 }, { "epoch": 0.3418230563002681, "grad_norm": 4.875, "learning_rate": 0.00018571673007021123, "loss": 2.5466, "step": 510 }, { "epoch": 0.34517426273458446, "grad_norm": 3.875, "learning_rate": 0.00018526401643540922, "loss": 2.6, "step": 515 }, { "epoch": 0.3485254691689008, "grad_norm": 6.34375, "learning_rate": 0.0001848048096156426, "loss": 2.4788, "step": 520 }, { "epoch": 0.35187667560321717, "grad_norm": 4.0625, "learning_rate": 0.0001843391445812886, "loss": 2.4897, "step": 525 }, { "epoch": 0.3552278820375335, "grad_norm": 3.84375, "learning_rate": 0.00018386705679454242, "loss": 2.3744, "step": 530 }, { "epoch": 0.3585790884718499, "grad_norm": 3.5625, "learning_rate": 0.00018338858220671682, "loss": 2.5395, "step": 535 }, { "epoch": 0.36193029490616624, "grad_norm": 4.28125, "learning_rate": 0.00018290375725550417, "loss": 2.4539, "step": 540 }, { "epoch": 0.3652815013404826, "grad_norm": 4.03125, "learning_rate": 0.00018241261886220154, "loss": 2.4642, "step": 545 }, { "epoch": 0.36863270777479895, "grad_norm": 5.03125, "learning_rate": 0.0001819152044288992, "loss": 2.5463, "step": 550 }, { "epoch": 0.3719839142091153, "grad_norm": 3.921875, "learning_rate": 0.00018141155183563193, "loss": 2.4316, "step": 555 }, { "epoch": 0.3753351206434316, "grad_norm": 6.3125, "learning_rate": 0.00018090169943749476, "loss": 2.4111, "step": 560 }, { "epoch": 0.37868632707774796, "grad_norm": 4.84375, "learning_rate": 0.00018038568606172173, "loss": 2.5699, "step": 565 }, { "epoch": 0.3820375335120643, "grad_norm": 4.40625, "learning_rate": 0.00017986355100472928, "loss": 2.3959, "step": 570 }, { "epoch": 0.3853887399463807, "grad_norm": 4.34375, "learning_rate": 0.00017933533402912354, "loss": 2.4525, "step": 575 }, { "epoch": 0.38873994638069703, "grad_norm": 4.375, "learning_rate": 0.00017880107536067218, "loss": 2.5208, "step": 580 }, { "epoch": 0.3920911528150134, "grad_norm": 4.5625, "learning_rate": 0.0001782608156852414, "loss": 2.482, "step": 585 }, { "epoch": 0.39544235924932974, "grad_norm": 4.375, "learning_rate": 0.0001777145961456971, "loss": 2.4212, "step": 590 }, { "epoch": 0.3987935656836461, "grad_norm": 4.4375, "learning_rate": 0.00017716245833877201, "loss": 2.4238, "step": 595 }, { "epoch": 0.40214477211796246, "grad_norm": 4.90625, "learning_rate": 0.0001766044443118978, "loss": 2.4752, "step": 600 }, { "epoch": 0.4054959785522788, "grad_norm": 4.15625, "learning_rate": 0.0001760405965600031, "loss": 2.4592, "step": 605 }, { "epoch": 0.40884718498659517, "grad_norm": 4.0625, "learning_rate": 0.00017547095802227723, "loss": 2.3419, "step": 610 }, { "epoch": 0.4121983914209115, "grad_norm": 3.75, "learning_rate": 0.00017489557207890023, "loss": 2.4173, "step": 615 }, { "epoch": 0.4155495978552279, "grad_norm": 4.3125, "learning_rate": 0.00017431448254773944, "loss": 2.4782, "step": 620 }, { "epoch": 0.41890080428954424, "grad_norm": 4.96875, "learning_rate": 0.0001737277336810124, "loss": 2.4608, "step": 625 }, { "epoch": 0.4222520107238606, "grad_norm": 3.453125, "learning_rate": 0.00017313537016191706, "loss": 2.3687, "step": 630 }, { "epoch": 0.42560321715817695, "grad_norm": 4.0, "learning_rate": 0.00017253743710122878, "loss": 2.5249, "step": 635 }, { "epoch": 0.4289544235924933, "grad_norm": 3.65625, "learning_rate": 0.0001719339800338651, "loss": 2.3771, "step": 640 }, { "epoch": 0.43230563002680966, "grad_norm": 3.90625, "learning_rate": 0.00017132504491541818, "loss": 2.5041, "step": 645 }, { "epoch": 0.435656836461126, "grad_norm": 3.53125, "learning_rate": 0.00017071067811865476, "loss": 2.2476, "step": 650 }, { "epoch": 0.4390080428954424, "grad_norm": 4.3125, "learning_rate": 0.0001700909264299851, "loss": 2.4838, "step": 655 }, { "epoch": 0.44235924932975873, "grad_norm": 3.421875, "learning_rate": 0.00016946583704589973, "loss": 2.3025, "step": 660 }, { "epoch": 0.4457104557640751, "grad_norm": 6.125, "learning_rate": 0.0001688354575693754, "loss": 2.3728, "step": 665 }, { "epoch": 0.44906166219839144, "grad_norm": 3.578125, "learning_rate": 0.00016819983600624986, "loss": 2.3742, "step": 670 }, { "epoch": 0.4524128686327078, "grad_norm": 3.8125, "learning_rate": 0.00016755902076156604, "loss": 2.5019, "step": 675 }, { "epoch": 0.45576407506702415, "grad_norm": 6.09375, "learning_rate": 0.00016691306063588583, "loss": 2.4063, "step": 680 }, { "epoch": 0.45911528150134046, "grad_norm": 3.84375, "learning_rate": 0.00016626200482157378, "loss": 2.3548, "step": 685 }, { "epoch": 0.4624664879356568, "grad_norm": 4.375, "learning_rate": 0.00016560590289905073, "loss": 2.3674, "step": 690 }, { "epoch": 0.46581769436997317, "grad_norm": 3.859375, "learning_rate": 0.00016494480483301836, "loss": 2.3071, "step": 695 }, { "epoch": 0.4691689008042895, "grad_norm": 3.71875, "learning_rate": 0.00016427876096865394, "loss": 2.3699, "step": 700 }, { "epoch": 0.4725201072386059, "grad_norm": 3.859375, "learning_rate": 0.0001636078220277764, "loss": 2.2788, "step": 705 }, { "epoch": 0.47587131367292224, "grad_norm": 3.546875, "learning_rate": 0.00016293203910498376, "loss": 2.2637, "step": 710 }, { "epoch": 0.4792225201072386, "grad_norm": 3.65625, "learning_rate": 0.00016225146366376198, "loss": 2.3791, "step": 715 }, { "epoch": 0.48257372654155495, "grad_norm": 3.46875, "learning_rate": 0.00016156614753256584, "loss": 2.2704, "step": 720 }, { "epoch": 0.4859249329758713, "grad_norm": 6.21875, "learning_rate": 0.00016087614290087208, "loss": 2.3598, "step": 725 }, { "epoch": 0.48927613941018766, "grad_norm": 3.671875, "learning_rate": 0.00016018150231520483, "loss": 2.4334, "step": 730 }, { "epoch": 0.492627345844504, "grad_norm": 3.65625, "learning_rate": 0.00015948227867513415, "loss": 2.434, "step": 735 }, { "epoch": 0.4959785522788204, "grad_norm": 3.640625, "learning_rate": 0.00015877852522924732, "loss": 2.3569, "step": 740 }, { "epoch": 0.49932975871313673, "grad_norm": 3.6875, "learning_rate": 0.00015807029557109398, "loss": 2.3909, "step": 745 }, { "epoch": 0.5026809651474531, "grad_norm": 3.765625, "learning_rate": 0.0001573576436351046, "loss": 2.3255, "step": 750 }, { "epoch": 0.5060321715817694, "grad_norm": 3.421875, "learning_rate": 0.00015664062369248328, "loss": 2.2162, "step": 755 }, { "epoch": 0.5093833780160858, "grad_norm": 4.4375, "learning_rate": 0.0001559192903470747, "loss": 2.2311, "step": 760 }, { "epoch": 0.5127345844504021, "grad_norm": 12.0625, "learning_rate": 0.0001551936985312058, "loss": 2.2553, "step": 765 }, { "epoch": 0.5160857908847185, "grad_norm": 3.90625, "learning_rate": 0.0001544639035015027, "loss": 2.3287, "step": 770 }, { "epoch": 0.5194369973190348, "grad_norm": 3.484375, "learning_rate": 0.0001537299608346824, "loss": 2.3142, "step": 775 }, { "epoch": 0.5227882037533512, "grad_norm": 3.375, "learning_rate": 0.0001529919264233205, "loss": 2.3647, "step": 780 }, { "epoch": 0.5261394101876675, "grad_norm": 4.1875, "learning_rate": 0.0001522498564715949, "loss": 2.2755, "step": 785 }, { "epoch": 0.5294906166219839, "grad_norm": 3.84375, "learning_rate": 0.00015150380749100545, "loss": 2.1969, "step": 790 }, { "epoch": 0.5328418230563002, "grad_norm": 3.46875, "learning_rate": 0.00015075383629607042, "loss": 2.3177, "step": 795 }, { "epoch": 0.5361930294906166, "grad_norm": 3.875, "learning_rate": 0.00015000000000000001, "loss": 2.2585, "step": 800 }, { "epoch": 0.539544235924933, "grad_norm": 3.8125, "learning_rate": 0.00014924235601034672, "loss": 2.3202, "step": 805 }, { "epoch": 0.5428954423592494, "grad_norm": 3.65625, "learning_rate": 0.00014848096202463372, "loss": 2.2391, "step": 810 }, { "epoch": 0.5462466487935657, "grad_norm": 4.15625, "learning_rate": 0.00014771587602596084, "loss": 2.1343, "step": 815 }, { "epoch": 0.5495978552278821, "grad_norm": 3.203125, "learning_rate": 0.00014694715627858908, "loss": 2.2725, "step": 820 }, { "epoch": 0.5529490616621984, "grad_norm": 3.4375, "learning_rate": 0.00014617486132350343, "loss": 2.1359, "step": 825 }, { "epoch": 0.5563002680965148, "grad_norm": 3.4375, "learning_rate": 0.00014539904997395468, "loss": 2.081, "step": 830 }, { "epoch": 0.5596514745308311, "grad_norm": 3.15625, "learning_rate": 0.00014461978131098088, "loss": 2.1051, "step": 835 }, { "epoch": 0.5630026809651475, "grad_norm": 3.328125, "learning_rate": 0.00014383711467890774, "loss": 2.1229, "step": 840 }, { "epoch": 0.5663538873994638, "grad_norm": 4.09375, "learning_rate": 0.00014305110968082952, "loss": 2.2683, "step": 845 }, { "epoch": 0.5697050938337802, "grad_norm": 3.484375, "learning_rate": 0.00014226182617406996, "loss": 2.1467, "step": 850 }, { "epoch": 0.5730563002680965, "grad_norm": 3.640625, "learning_rate": 0.00014146932426562392, "loss": 2.1038, "step": 855 }, { "epoch": 0.5764075067024129, "grad_norm": 3.390625, "learning_rate": 0.00014067366430758004, "loss": 2.3725, "step": 860 }, { "epoch": 0.5797587131367292, "grad_norm": 5.125, "learning_rate": 0.00013987490689252463, "loss": 2.1452, "step": 865 }, { "epoch": 0.5831099195710456, "grad_norm": 3.421875, "learning_rate": 0.00013907311284892736, "loss": 2.1279, "step": 870 }, { "epoch": 0.5864611260053619, "grad_norm": 3.265625, "learning_rate": 0.000138268343236509, "loss": 2.2028, "step": 875 }, { "epoch": 0.5898123324396782, "grad_norm": 3.546875, "learning_rate": 0.00013746065934159123, "loss": 2.2621, "step": 880 }, { "epoch": 0.5931635388739946, "grad_norm": 3.40625, "learning_rate": 0.00013665012267242974, "loss": 2.3162, "step": 885 }, { "epoch": 0.596514745308311, "grad_norm": 3.3125, "learning_rate": 0.00013583679495453, "loss": 2.2635, "step": 890 }, { "epoch": 0.5998659517426274, "grad_norm": 3.65625, "learning_rate": 0.00013502073812594675, "loss": 2.2185, "step": 895 }, { "epoch": 0.6032171581769437, "grad_norm": 3.40625, "learning_rate": 0.00013420201433256689, "loss": 2.1357, "step": 900 }, { "epoch": 0.6065683646112601, "grad_norm": 3.875, "learning_rate": 0.0001333806859233771, "loss": 2.1906, "step": 905 }, { "epoch": 0.6099195710455764, "grad_norm": 3.578125, "learning_rate": 0.00013255681544571568, "loss": 2.1371, "step": 910 }, { "epoch": 0.6132707774798928, "grad_norm": 3.328125, "learning_rate": 0.00013173046564050924, "loss": 2.1377, "step": 915 }, { "epoch": 0.6166219839142091, "grad_norm": 3.078125, "learning_rate": 0.00013090169943749476, "loss": 2.1863, "step": 920 }, { "epoch": 0.6199731903485255, "grad_norm": 3.5625, "learning_rate": 0.00013007057995042732, "loss": 2.1153, "step": 925 }, { "epoch": 0.6233243967828418, "grad_norm": 3.125, "learning_rate": 0.00012923717047227368, "loss": 2.1994, "step": 930 }, { "epoch": 0.6266756032171582, "grad_norm": 3.203125, "learning_rate": 0.00012840153447039228, "loss": 2.205, "step": 935 }, { "epoch": 0.6300268096514745, "grad_norm": 3.46875, "learning_rate": 0.0001275637355816999, "loss": 2.1964, "step": 940 }, { "epoch": 0.6333780160857909, "grad_norm": 3.625, "learning_rate": 0.00012672383760782568, "loss": 2.1978, "step": 945 }, { "epoch": 0.6367292225201072, "grad_norm": 3.171875, "learning_rate": 0.00012588190451025207, "loss": 2.1862, "step": 950 }, { "epoch": 0.6400804289544236, "grad_norm": 3.296875, "learning_rate": 0.00012503800040544416, "loss": 2.1544, "step": 955 }, { "epoch": 0.6434316353887399, "grad_norm": 3.46875, "learning_rate": 0.00012419218955996676, "loss": 2.1247, "step": 960 }, { "epoch": 0.6467828418230563, "grad_norm": 3.0625, "learning_rate": 0.00012334453638559057, "loss": 2.132, "step": 965 }, { "epoch": 0.6501340482573726, "grad_norm": 3.5, "learning_rate": 0.0001224951054343865, "loss": 2.0192, "step": 970 }, { "epoch": 0.653485254691689, "grad_norm": 3.359375, "learning_rate": 0.00012164396139381029, "loss": 2.0863, "step": 975 }, { "epoch": 0.6568364611260054, "grad_norm": 3.234375, "learning_rate": 0.00012079116908177593, "loss": 2.162, "step": 980 }, { "epoch": 0.6601876675603218, "grad_norm": 3.4375, "learning_rate": 0.00011993679344171973, "loss": 2.2546, "step": 985 }, { "epoch": 0.6635388739946381, "grad_norm": 3.640625, "learning_rate": 0.00011908089953765449, "loss": 2.1244, "step": 990 }, { "epoch": 0.6668900804289544, "grad_norm": 3.515625, "learning_rate": 0.00011822355254921478, "loss": 2.1339, "step": 995 }, { "epoch": 0.6702412868632708, "grad_norm": 5.46875, "learning_rate": 0.00011736481776669306, "loss": 2.1744, "step": 1000 }, { "epoch": 0.6702412868632708, "eval_128_ap": 0.7636418814117207, "eval_128_auc": 0.9774185615872945, "eval_128_loss": 1.7800103425979614, "eval_128_runtime": 21.8273, "eval_128_samples_per_second": 18.326, "eval_128_steps_per_second": 4.581, "step": 1000 }, { "epoch": 0.6735924932975871, "grad_norm": 3.359375, "learning_rate": 0.00011650476058606777, "loss": 1.9784, "step": 1005 }, { "epoch": 0.6769436997319035, "grad_norm": 3.0625, "learning_rate": 0.0001156434465040231, "loss": 1.9891, "step": 1010 }, { "epoch": 0.6802949061662198, "grad_norm": 3.8125, "learning_rate": 0.00011478094111296109, "loss": 2.0137, "step": 1015 }, { "epoch": 0.6836461126005362, "grad_norm": 3.5625, "learning_rate": 0.00011391731009600654, "loss": 2.0556, "step": 1020 }, { "epoch": 0.6869973190348525, "grad_norm": 3.140625, "learning_rate": 0.00011305261922200519, "loss": 2.0577, "step": 1025 }, { "epoch": 0.6903485254691689, "grad_norm": 3.1875, "learning_rate": 0.00011218693434051475, "loss": 2.0269, "step": 1030 }, { "epoch": 0.6936997319034852, "grad_norm": 3.28125, "learning_rate": 0.0001113203213767907, "loss": 2.0982, "step": 1035 }, { "epoch": 0.6970509383378016, "grad_norm": 3.28125, "learning_rate": 0.00011045284632676536, "loss": 2.1156, "step": 1040 }, { "epoch": 0.7004021447721179, "grad_norm": 3.140625, "learning_rate": 0.00010958457525202241, "loss": 2.0988, "step": 1045 }, { "epoch": 0.7037533512064343, "grad_norm": 3.40625, "learning_rate": 0.00010871557427476583, "loss": 2.0687, "step": 1050 }, { "epoch": 0.7071045576407506, "grad_norm": 3.359375, "learning_rate": 0.0001078459095727845, "loss": 2.0488, "step": 1055 }, { "epoch": 0.710455764075067, "grad_norm": 3.390625, "learning_rate": 0.00010697564737441252, "loss": 2.1562, "step": 1060 }, { "epoch": 0.7138069705093834, "grad_norm": 2.96875, "learning_rate": 0.00010610485395348571, "loss": 2.0119, "step": 1065 }, { "epoch": 0.7171581769436998, "grad_norm": 3.328125, "learning_rate": 0.0001052335956242944, "loss": 2.0686, "step": 1070 }, { "epoch": 0.7205093833780161, "grad_norm": 3.65625, "learning_rate": 0.00010436193873653361, "loss": 2.0053, "step": 1075 }, { "epoch": 0.7238605898123325, "grad_norm": 3.15625, "learning_rate": 0.00010348994967025012, "loss": 2.0279, "step": 1080 }, { "epoch": 0.7272117962466488, "grad_norm": 3.3125, "learning_rate": 0.00010261769483078733, "loss": 2.2239, "step": 1085 }, { "epoch": 0.7305630026809652, "grad_norm": 3.1875, "learning_rate": 0.00010174524064372837, "loss": 2.0663, "step": 1090 }, { "epoch": 0.7339142091152815, "grad_norm": 3.25, "learning_rate": 0.0001008726535498374, "loss": 2.1292, "step": 1095 }, { "epoch": 0.7372654155495979, "grad_norm": 2.984375, "learning_rate": 0.0001, "loss": 2.0677, "step": 1100 }, { "epoch": 0.7406166219839142, "grad_norm": 3.109375, "learning_rate": 9.912734645016263e-05, "loss": 1.9551, "step": 1105 }, { "epoch": 0.7439678284182306, "grad_norm": 4.34375, "learning_rate": 9.825475935627165e-05, "loss": 2.0803, "step": 1110 }, { "epoch": 0.7473190348525469, "grad_norm": 3.25, "learning_rate": 9.73823051692127e-05, "loss": 2.114, "step": 1115 }, { "epoch": 0.7506702412868632, "grad_norm": 3.0625, "learning_rate": 9.651005032974994e-05, "loss": 2.0298, "step": 1120 }, { "epoch": 0.7540214477211796, "grad_norm": 3.203125, "learning_rate": 9.563806126346642e-05, "loss": 2.15, "step": 1125 }, { "epoch": 0.7573726541554959, "grad_norm": 3.359375, "learning_rate": 9.476640437570562e-05, "loss": 2.0435, "step": 1130 }, { "epoch": 0.7607238605898123, "grad_norm": 3.140625, "learning_rate": 9.38951460465143e-05, "loss": 2.0872, "step": 1135 }, { "epoch": 0.7640750670241286, "grad_norm": 3.421875, "learning_rate": 9.302435262558747e-05, "loss": 1.981, "step": 1140 }, { "epoch": 0.767426273458445, "grad_norm": 2.796875, "learning_rate": 9.215409042721552e-05, "loss": 1.9386, "step": 1145 }, { "epoch": 0.7707774798927614, "grad_norm": 3.34375, "learning_rate": 9.128442572523417e-05, "loss": 2.0754, "step": 1150 }, { "epoch": 0.7741286863270778, "grad_norm": 3.234375, "learning_rate": 9.04154247479776e-05, "loss": 2.0129, "step": 1155 }, { "epoch": 0.7774798927613941, "grad_norm": 3.328125, "learning_rate": 8.954715367323468e-05, "loss": 2.099, "step": 1160 }, { "epoch": 0.7808310991957105, "grad_norm": 3.171875, "learning_rate": 8.867967862320934e-05, "loss": 2.055, "step": 1165 }, { "epoch": 0.7841823056300268, "grad_norm": 3.109375, "learning_rate": 8.781306565948528e-05, "loss": 1.9264, "step": 1170 }, { "epoch": 0.7875335120643432, "grad_norm": 3.359375, "learning_rate": 8.694738077799488e-05, "loss": 1.9382, "step": 1175 }, { "epoch": 0.7908847184986595, "grad_norm": 3.5625, "learning_rate": 8.608268990399349e-05, "loss": 1.9913, "step": 1180 }, { "epoch": 0.7942359249329759, "grad_norm": 3.390625, "learning_rate": 8.521905888703893e-05, "loss": 2.2202, "step": 1185 }, { "epoch": 0.7975871313672922, "grad_norm": 3.296875, "learning_rate": 8.435655349597689e-05, "loss": 1.9226, "step": 1190 }, { "epoch": 0.8009383378016086, "grad_norm": 3.375, "learning_rate": 8.349523941393224e-05, "loss": 1.9842, "step": 1195 }, { "epoch": 0.8042895442359249, "grad_norm": 3.109375, "learning_rate": 8.263518223330697e-05, "loss": 2.0414, "step": 1200 }, { "epoch": 0.8076407506702413, "grad_norm": 3.375, "learning_rate": 8.177644745078526e-05, "loss": 1.9747, "step": 1205 }, { "epoch": 0.8109919571045576, "grad_norm": 3.1875, "learning_rate": 8.091910046234552e-05, "loss": 2.1483, "step": 1210 }, { "epoch": 0.814343163538874, "grad_norm": 3.046875, "learning_rate": 8.00632065582803e-05, "loss": 2.003, "step": 1215 }, { "epoch": 0.8176943699731903, "grad_norm": 3.4375, "learning_rate": 7.920883091822408e-05, "loss": 1.9795, "step": 1220 }, { "epoch": 0.8210455764075067, "grad_norm": 4.46875, "learning_rate": 7.835603860618972e-05, "loss": 1.9079, "step": 1225 }, { "epoch": 0.824396782841823, "grad_norm": 3.046875, "learning_rate": 7.750489456561352e-05, "loss": 1.9086, "step": 1230 }, { "epoch": 0.8277479892761395, "grad_norm": 3.1875, "learning_rate": 7.66554636144095e-05, "loss": 2.138, "step": 1235 }, { "epoch": 0.8310991957104558, "grad_norm": 3.125, "learning_rate": 7.580781044003324e-05, "loss": 1.9724, "step": 1240 }, { "epoch": 0.8344504021447721, "grad_norm": 3.265625, "learning_rate": 7.496199959455584e-05, "loss": 2.0067, "step": 1245 }, { "epoch": 0.8378016085790885, "grad_norm": 3.359375, "learning_rate": 7.411809548974792e-05, "loss": 1.9761, "step": 1250 }, { "epoch": 0.8411528150134048, "grad_norm": 3.265625, "learning_rate": 7.327616239217431e-05, "loss": 1.9118, "step": 1255 }, { "epoch": 0.8445040214477212, "grad_norm": 2.96875, "learning_rate": 7.243626441830009e-05, "loss": 1.992, "step": 1260 }, { "epoch": 0.8478552278820375, "grad_norm": 3.109375, "learning_rate": 7.159846552960774e-05, "loss": 1.9095, "step": 1265 }, { "epoch": 0.8512064343163539, "grad_norm": 3.0625, "learning_rate": 7.076282952772633e-05, "loss": 1.9637, "step": 1270 }, { "epoch": 0.8545576407506702, "grad_norm": 3.265625, "learning_rate": 6.992942004957271e-05, "loss": 1.9976, "step": 1275 }, { "epoch": 0.8579088471849866, "grad_norm": 3.203125, "learning_rate": 6.909830056250527e-05, "loss": 2.0146, "step": 1280 }, { "epoch": 0.8612600536193029, "grad_norm": 4.03125, "learning_rate": 6.826953435949081e-05, "loss": 1.9493, "step": 1285 }, { "epoch": 0.8646112600536193, "grad_norm": 3.25, "learning_rate": 6.744318455428436e-05, "loss": 2.0348, "step": 1290 }, { "epoch": 0.8679624664879356, "grad_norm": 3.09375, "learning_rate": 6.661931407662292e-05, "loss": 1.9574, "step": 1295 }, { "epoch": 0.871313672922252, "grad_norm": 3.234375, "learning_rate": 6.579798566743314e-05, "loss": 2.023, "step": 1300 }, { "epoch": 0.8746648793565683, "grad_norm": 3.109375, "learning_rate": 6.497926187405326e-05, "loss": 1.9266, "step": 1305 }, { "epoch": 0.8780160857908847, "grad_norm": 3.078125, "learning_rate": 6.416320504546997e-05, "loss": 1.9451, "step": 1310 }, { "epoch": 0.881367292225201, "grad_norm": 3.296875, "learning_rate": 6.334987732757029e-05, "loss": 2.0478, "step": 1315 }, { "epoch": 0.8847184986595175, "grad_norm": 3.15625, "learning_rate": 6.25393406584088e-05, "loss": 2.0049, "step": 1320 }, { "epoch": 0.8880697050938338, "grad_norm": 3.15625, "learning_rate": 6.173165676349103e-05, "loss": 1.9374, "step": 1325 }, { "epoch": 0.8914209115281502, "grad_norm": 3.140625, "learning_rate": 6.092688715107264e-05, "loss": 1.9352, "step": 1330 }, { "epoch": 0.8947721179624665, "grad_norm": 3.203125, "learning_rate": 6.012509310747538e-05, "loss": 1.8953, "step": 1335 }, { "epoch": 0.8981233243967829, "grad_norm": 3.25, "learning_rate": 5.9326335692419995e-05, "loss": 2.0176, "step": 1340 }, { "epoch": 0.9014745308310992, "grad_norm": 3.203125, "learning_rate": 5.853067573437612e-05, "loss": 1.9532, "step": 1345 }, { "epoch": 0.9048257372654156, "grad_norm": 3.03125, "learning_rate": 5.773817382593008e-05, "loss": 1.9968, "step": 1350 }, { "epoch": 0.9081769436997319, "grad_norm": 3.0625, "learning_rate": 5.694889031917047e-05, "loss": 2.0223, "step": 1355 }, { "epoch": 0.9115281501340483, "grad_norm": 3.0625, "learning_rate": 5.616288532109225e-05, "loss": 1.8986, "step": 1360 }, { "epoch": 0.9148793565683646, "grad_norm": 3.109375, "learning_rate": 5.5380218689019125e-05, "loss": 1.8947, "step": 1365 }, { "epoch": 0.9182305630026809, "grad_norm": 2.9375, "learning_rate": 5.4600950026045326e-05, "loss": 2.0307, "step": 1370 }, { "epoch": 0.9215817694369973, "grad_norm": 2.859375, "learning_rate": 5.3825138676496624e-05, "loss": 1.8391, "step": 1375 }, { "epoch": 0.9249329758713136, "grad_norm": 3.078125, "learning_rate": 5.305284372141095e-05, "loss": 1.9281, "step": 1380 }, { "epoch": 0.92828418230563, "grad_norm": 3.03125, "learning_rate": 5.2284123974039154e-05, "loss": 2.0512, "step": 1385 }, { "epoch": 0.9316353887399463, "grad_norm": 3.171875, "learning_rate": 5.15190379753663e-05, "loss": 1.9567, "step": 1390 }, { "epoch": 0.9349865951742627, "grad_norm": 3.125, "learning_rate": 5.07576439896533e-05, "loss": 1.864, "step": 1395 }, { "epoch": 0.938337801608579, "grad_norm": 3.296875, "learning_rate": 5.000000000000002e-05, "loss": 1.9016, "step": 1400 }, { "epoch": 0.9416890080428955, "grad_norm": 3.28125, "learning_rate": 4.924616370392961e-05, "loss": 2.0128, "step": 1405 }, { "epoch": 0.9450402144772118, "grad_norm": 2.8125, "learning_rate": 4.8496192508994576e-05, "loss": 2.0419, "step": 1410 }, { "epoch": 0.9483914209115282, "grad_norm": 3.03125, "learning_rate": 4.7750143528405126e-05, "loss": 1.964, "step": 1415 }, { "epoch": 0.9517426273458445, "grad_norm": 3.0625, "learning_rate": 4.700807357667952e-05, "loss": 1.9684, "step": 1420 }, { "epoch": 0.9550938337801609, "grad_norm": 3.265625, "learning_rate": 4.6270039165317605e-05, "loss": 1.9713, "step": 1425 }, { "epoch": 0.9584450402144772, "grad_norm": 3.015625, "learning_rate": 4.553609649849728e-05, "loss": 1.8728, "step": 1430 }, { "epoch": 0.9617962466487936, "grad_norm": 3.28125, "learning_rate": 4.480630146879419e-05, "loss": 1.8794, "step": 1435 }, { "epoch": 0.9651474530831099, "grad_norm": 3.296875, "learning_rate": 4.4080709652925336e-05, "loss": 2.017, "step": 1440 }, { "epoch": 0.9684986595174263, "grad_norm": 3.140625, "learning_rate": 4.335937630751674e-05, "loss": 1.9458, "step": 1445 }, { "epoch": 0.9718498659517426, "grad_norm": 3.09375, "learning_rate": 4.264235636489542e-05, "loss": 2.0505, "step": 1450 }, { "epoch": 0.975201072386059, "grad_norm": 3.21875, "learning_rate": 4.1929704428906026e-05, "loss": 1.9101, "step": 1455 }, { "epoch": 0.9785522788203753, "grad_norm": 3.078125, "learning_rate": 4.12214747707527e-05, "loss": 1.8192, "step": 1460 }, { "epoch": 0.9819034852546917, "grad_norm": 3.0, "learning_rate": 4.0517721324865884e-05, "loss": 1.9514, "step": 1465 }, { "epoch": 0.985254691689008, "grad_norm": 3.3125, "learning_rate": 3.981849768479517e-05, "loss": 1.9101, "step": 1470 }, { "epoch": 0.9886058981233244, "grad_norm": 2.96875, "learning_rate": 3.9123857099127936e-05, "loss": 1.8583, "step": 1475 }, { "epoch": 0.9919571045576407, "grad_norm": 3.078125, "learning_rate": 3.843385246743417e-05, "loss": 1.8693, "step": 1480 }, { "epoch": 0.9953083109919572, "grad_norm": 3.109375, "learning_rate": 3.774853633623806e-05, "loss": 1.952, "step": 1485 }, { "epoch": 0.9986595174262735, "grad_norm": 2.9375, "learning_rate": 3.7067960895016275e-05, "loss": 1.9502, "step": 1490 }, { "epoch": 1.0020107238605898, "grad_norm": 4.1875, "learning_rate": 3.6392177972223594e-05, "loss": 1.7212, "step": 1495 }, { "epoch": 1.0053619302949062, "grad_norm": 2.96875, "learning_rate": 3.5721239031346066e-05, "loss": 1.6225, "step": 1500 }, { "epoch": 1.0053619302949062, "eval_128_ap": 0.7931236608249025, "eval_128_auc": 0.9804633885794023, "eval_128_loss": 1.6620492935180664, "eval_128_runtime": 20.588, "eval_128_samples_per_second": 19.429, "eval_128_steps_per_second": 4.857, "step": 1500 } ], "logging_steps": 5, "max_steps": 2000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4517613999095808.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }