{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.751640112464855, "eval_steps": 250, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01874414245548266, "grad_norm": 1.0379021167755127, "learning_rate": 0.0007999941425412586, "loss": 0.6752, "step": 10 }, { "epoch": 0.03748828491096532, "grad_norm": 0.779513955116272, "learning_rate": 0.000799988285082517, "loss": 0.3775, "step": 20 }, { "epoch": 0.056232427366447985, "grad_norm": 0.7819342017173767, "learning_rate": 0.0007999824276237754, "loss": 0.3341, "step": 30 }, { "epoch": 0.07497656982193064, "grad_norm": 0.8172865509986877, "learning_rate": 0.0007999765701650339, "loss": 0.3026, "step": 40 }, { "epoch": 0.09372071227741331, "grad_norm": 0.6789480447769165, "learning_rate": 0.0007999707127062924, "loss": 0.3091, "step": 50 }, { "epoch": 0.11246485473289597, "grad_norm": 0.6779003143310547, "learning_rate": 0.0007999648552475509, "loss": 0.2934, "step": 60 }, { "epoch": 0.13120899718837864, "grad_norm": 0.5085116028785706, "learning_rate": 0.0007999589977888094, "loss": 0.2856, "step": 70 }, { "epoch": 0.14995313964386128, "grad_norm": 0.5592414140701294, "learning_rate": 0.0007999531403300678, "loss": 0.2736, "step": 80 }, { "epoch": 0.16869728209934395, "grad_norm": 0.5976341366767883, "learning_rate": 0.0007999472828713263, "loss": 0.2938, "step": 90 }, { "epoch": 0.18744142455482662, "grad_norm": 0.5462539196014404, "learning_rate": 0.0007999414254125848, "loss": 0.2597, "step": 100 }, { "epoch": 0.20618556701030927, "grad_norm": 0.5744641423225403, "learning_rate": 0.0007999355679538432, "loss": 0.2784, "step": 110 }, { "epoch": 0.22492970946579194, "grad_norm": 0.6287326812744141, "learning_rate": 0.0007999297104951018, "loss": 0.2616, "step": 120 }, { "epoch": 0.2436738519212746, "grad_norm": 0.5516992211341858, "learning_rate": 0.0007999238530363602, "loss": 0.2668, "step": 130 }, { "epoch": 0.2624179943767573, "grad_norm": 0.5527953505516052, "learning_rate": 0.0007999179955776188, "loss": 0.2623, "step": 140 }, { "epoch": 0.28116213683223995, "grad_norm": 0.5433118343353271, "learning_rate": 0.0007999121381188772, "loss": 0.2692, "step": 150 }, { "epoch": 0.29990627928772257, "grad_norm": 0.5417677760124207, "learning_rate": 0.0007999062806601356, "loss": 0.2435, "step": 160 }, { "epoch": 0.31865042174320524, "grad_norm": 0.523895800113678, "learning_rate": 0.0007999004232013942, "loss": 0.2459, "step": 170 }, { "epoch": 0.3373945641986879, "grad_norm": 0.7117316126823425, "learning_rate": 0.0007998945657426526, "loss": 0.2638, "step": 180 }, { "epoch": 0.3561387066541706, "grad_norm": 0.777367353439331, "learning_rate": 0.0007998887082839111, "loss": 0.2423, "step": 190 }, { "epoch": 0.37488284910965325, "grad_norm": 0.6237531304359436, "learning_rate": 0.0007998828508251696, "loss": 0.2446, "step": 200 }, { "epoch": 0.3936269915651359, "grad_norm": 0.47088104486465454, "learning_rate": 0.000799876993366428, "loss": 0.2556, "step": 210 }, { "epoch": 0.41237113402061853, "grad_norm": 0.48523184657096863, "learning_rate": 0.0007998711359076864, "loss": 0.2361, "step": 220 }, { "epoch": 0.4311152764761012, "grad_norm": 0.5119248628616333, "learning_rate": 0.000799865278448945, "loss": 0.2401, "step": 230 }, { "epoch": 0.4498594189315839, "grad_norm": 0.4833837151527405, "learning_rate": 0.0007998594209902034, "loss": 0.2412, "step": 240 }, { "epoch": 0.46860356138706655, "grad_norm": 0.5014116764068604, "learning_rate": 0.000799853563531462, "loss": 0.2475, "step": 250 }, { "epoch": 0.46860356138706655, "eval_loss": 0.11557532846927643, "eval_pearson_cosine": 0.7298070192337036, "eval_pearson_dot": 0.7093910574913025, "eval_pearson_euclidean": 0.7365932464599609, "eval_pearson_manhattan": 0.7382453680038452, "eval_runtime": 27.2701, "eval_samples_per_second": 55.005, "eval_spearman_cosine": 0.7577597198343433, "eval_spearman_dot": 0.7151991550847255, "eval_spearman_euclidean": 0.7434473510612767, "eval_spearman_manhattan": 0.747354805702794, "eval_steps_per_second": 6.894, "step": 250 }, { "epoch": 0.4873477038425492, "grad_norm": 0.4523856043815613, "learning_rate": 0.0007998477060727204, "loss": 0.2261, "step": 260 }, { "epoch": 0.5060918462980318, "grad_norm": 0.5232961177825928, "learning_rate": 0.0007998418486139788, "loss": 0.2473, "step": 270 }, { "epoch": 0.5248359887535146, "grad_norm": 0.5113686323165894, "learning_rate": 0.0007998359911552374, "loss": 0.2354, "step": 280 }, { "epoch": 0.5435801312089972, "grad_norm": 0.458387166261673, "learning_rate": 0.0007998301336964958, "loss": 0.2568, "step": 290 }, { "epoch": 0.5623242736644799, "grad_norm": 0.45359304547309875, "learning_rate": 0.0007998242762377543, "loss": 0.2415, "step": 300 }, { "epoch": 0.5810684161199625, "grad_norm": 0.46073561906814575, "learning_rate": 0.0007998184187790128, "loss": 0.2291, "step": 310 }, { "epoch": 0.5998125585754451, "grad_norm": 0.4503585696220398, "learning_rate": 0.0007998125613202712, "loss": 0.2368, "step": 320 }, { "epoch": 0.6185567010309279, "grad_norm": 0.4221174716949463, "learning_rate": 0.0007998067038615297, "loss": 0.2322, "step": 330 }, { "epoch": 0.6373008434864105, "grad_norm": 0.42522430419921875, "learning_rate": 0.0007998008464027882, "loss": 0.241, "step": 340 }, { "epoch": 0.6560449859418932, "grad_norm": 0.47986653447151184, "learning_rate": 0.0007997949889440467, "loss": 0.2252, "step": 350 }, { "epoch": 0.6747891283973758, "grad_norm": 0.6221345067024231, "learning_rate": 0.0007997891314853051, "loss": 0.2418, "step": 360 }, { "epoch": 0.6935332708528584, "grad_norm": 0.4737911820411682, "learning_rate": 0.0007997832740265637, "loss": 0.2232, "step": 370 }, { "epoch": 0.7122774133083412, "grad_norm": 0.47973355650901794, "learning_rate": 0.000799777416567822, "loss": 0.227, "step": 380 }, { "epoch": 0.7310215557638238, "grad_norm": 0.4451119005680084, "learning_rate": 0.0007997715591090806, "loss": 0.2206, "step": 390 }, { "epoch": 0.7497656982193065, "grad_norm": 0.4816949963569641, "learning_rate": 0.0007997657016503391, "loss": 0.221, "step": 400 }, { "epoch": 0.7685098406747891, "grad_norm": 0.44739213585853577, "learning_rate": 0.0007997598441915975, "loss": 0.226, "step": 410 }, { "epoch": 0.7872539831302718, "grad_norm": 0.4036339521408081, "learning_rate": 0.0007997539867328561, "loss": 0.2359, "step": 420 }, { "epoch": 0.8059981255857545, "grad_norm": 0.4639866054058075, "learning_rate": 0.0007997481292741144, "loss": 0.2251, "step": 430 }, { "epoch": 0.8247422680412371, "grad_norm": 0.4569236636161804, "learning_rate": 0.0007997422718153729, "loss": 0.2337, "step": 440 }, { "epoch": 0.8434864104967198, "grad_norm": 0.37712669372558594, "learning_rate": 0.0007997364143566314, "loss": 0.2009, "step": 450 }, { "epoch": 0.8622305529522024, "grad_norm": 0.3660425543785095, "learning_rate": 0.0007997305568978899, "loss": 0.2217, "step": 460 }, { "epoch": 0.8809746954076851, "grad_norm": 0.37786588072776794, "learning_rate": 0.0007997246994391483, "loss": 0.2256, "step": 470 }, { "epoch": 0.8997188378631678, "grad_norm": 0.34985071420669556, "learning_rate": 0.0007997188419804069, "loss": 0.2137, "step": 480 }, { "epoch": 0.9184629803186504, "grad_norm": 0.5390796661376953, "learning_rate": 0.0007997129845216653, "loss": 0.2164, "step": 490 }, { "epoch": 0.9372071227741331, "grad_norm": 0.45559704303741455, "learning_rate": 0.0007997071270629237, "loss": 0.2267, "step": 500 }, { "epoch": 0.9372071227741331, "eval_loss": 0.120799720287323, "eval_pearson_cosine": 0.739181399345398, "eval_pearson_dot": 0.7062755823135376, "eval_pearson_euclidean": 0.7443870306015015, "eval_pearson_manhattan": 0.7475869655609131, "eval_runtime": 27.6267, "eval_samples_per_second": 54.295, "eval_spearman_cosine": 0.7615568395698803, "eval_spearman_dot": 0.7105088137905408, "eval_spearman_euclidean": 0.7483334786946333, "eval_spearman_manhattan": 0.753415014125672, "eval_steps_per_second": 6.805, "step": 500 }, { "epoch": 0.9559512652296157, "grad_norm": 0.4264763593673706, "learning_rate": 0.0007997012696041823, "loss": 0.1925, "step": 510 }, { "epoch": 0.9746954076850984, "grad_norm": 0.3580343425273895, "learning_rate": 0.0007996954121454407, "loss": 0.2208, "step": 520 }, { "epoch": 0.993439550140581, "grad_norm": 0.3899773359298706, "learning_rate": 0.0007996895546866993, "loss": 0.2219, "step": 530 }, { "epoch": 1.013120899718838, "grad_norm": 0.34510987997055054, "learning_rate": 0.0007996836972279577, "loss": 0.1681, "step": 540 }, { "epoch": 1.0318650421743205, "grad_norm": 0.52265465259552, "learning_rate": 0.0007996778397692161, "loss": 0.1328, "step": 550 }, { "epoch": 1.0506091846298031, "grad_norm": 0.39699018001556396, "learning_rate": 0.0007996719823104747, "loss": 0.146, "step": 560 }, { "epoch": 1.069353327085286, "grad_norm": 0.3806276023387909, "learning_rate": 0.0007996661248517331, "loss": 0.1394, "step": 570 }, { "epoch": 1.0880974695407686, "grad_norm": 0.38312238454818726, "learning_rate": 0.0007996602673929916, "loss": 0.1296, "step": 580 }, { "epoch": 1.1068416119962512, "grad_norm": 0.40978288650512695, "learning_rate": 0.0007996544099342501, "loss": 0.1328, "step": 590 }, { "epoch": 1.1255857544517338, "grad_norm": 0.36600926518440247, "learning_rate": 0.0007996485524755086, "loss": 0.1403, "step": 600 }, { "epoch": 1.1443298969072164, "grad_norm": 0.44099515676498413, "learning_rate": 0.0007996426950167669, "loss": 0.133, "step": 610 }, { "epoch": 1.1630740393626993, "grad_norm": 0.3496938645839691, "learning_rate": 0.0007996368375580255, "loss": 0.1334, "step": 620 }, { "epoch": 1.1818181818181819, "grad_norm": 0.3754808306694031, "learning_rate": 0.0007996309800992839, "loss": 0.1417, "step": 630 }, { "epoch": 1.2005623242736645, "grad_norm": 0.45285704731941223, "learning_rate": 0.0007996251226405425, "loss": 0.1525, "step": 640 }, { "epoch": 1.219306466729147, "grad_norm": 0.4499760866165161, "learning_rate": 0.000799619265181801, "loss": 0.1432, "step": 650 }, { "epoch": 1.2380506091846297, "grad_norm": 0.3310897648334503, "learning_rate": 0.0007996134077230593, "loss": 0.1392, "step": 660 }, { "epoch": 1.2567947516401126, "grad_norm": 0.32931098341941833, "learning_rate": 0.0007996075502643179, "loss": 0.1484, "step": 670 }, { "epoch": 1.2755388940955952, "grad_norm": 0.32448434829711914, "learning_rate": 0.0007996016928055763, "loss": 0.1464, "step": 680 }, { "epoch": 1.2942830365510778, "grad_norm": 0.38759011030197144, "learning_rate": 0.0007995958353468348, "loss": 0.1412, "step": 690 }, { "epoch": 1.3130271790065604, "grad_norm": 0.29216083884239197, "learning_rate": 0.0007995899778880933, "loss": 0.1381, "step": 700 }, { "epoch": 1.331771321462043, "grad_norm": 0.36174365878105164, "learning_rate": 0.0007995841204293518, "loss": 0.1429, "step": 710 }, { "epoch": 1.3505154639175259, "grad_norm": 0.3533223569393158, "learning_rate": 0.0007995782629706102, "loss": 0.1349, "step": 720 }, { "epoch": 1.3692596063730085, "grad_norm": 0.34500110149383545, "learning_rate": 0.0007995724055118687, "loss": 0.1445, "step": 730 }, { "epoch": 1.388003748828491, "grad_norm": 0.3347356617450714, "learning_rate": 0.0007995665480531272, "loss": 0.1508, "step": 740 }, { "epoch": 1.4067478912839737, "grad_norm": 0.5160906910896301, "learning_rate": 0.0007995606905943857, "loss": 0.156, "step": 750 }, { "epoch": 1.4067478912839737, "eval_loss": 0.07170082628726959, "eval_pearson_cosine": 0.7533469200134277, "eval_pearson_dot": 0.7396403551101685, "eval_pearson_euclidean": 0.7499958276748657, "eval_pearson_manhattan": 0.7558424472808838, "eval_runtime": 27.4077, "eval_samples_per_second": 54.729, "eval_spearman_cosine": 0.7671397694369135, "eval_spearman_dot": 0.7444267819763823, "eval_spearman_euclidean": 0.753108908424924, "eval_spearman_manhattan": 0.7608183984789815, "eval_steps_per_second": 6.859, "step": 750 }, { "epoch": 1.4254920337394563, "grad_norm": 0.3751141428947449, "learning_rate": 0.0007995548331356442, "loss": 0.1606, "step": 760 }, { "epoch": 1.4442361761949392, "grad_norm": 0.3998653292655945, "learning_rate": 0.0007995489756769026, "loss": 0.1447, "step": 770 }, { "epoch": 1.4629803186504218, "grad_norm": 0.32710304856300354, "learning_rate": 0.0007995431182181611, "loss": 0.1381, "step": 780 }, { "epoch": 1.4817244611059044, "grad_norm": 0.3845181167125702, "learning_rate": 0.0007995372607594195, "loss": 0.1488, "step": 790 }, { "epoch": 1.5004686035613872, "grad_norm": 0.39582550525665283, "learning_rate": 0.000799531403300678, "loss": 0.1546, "step": 800 }, { "epoch": 1.5192127460168696, "grad_norm": 0.38061007857322693, "learning_rate": 0.0007995255458419366, "loss": 0.1403, "step": 810 }, { "epoch": 1.5379568884723525, "grad_norm": 0.4833431839942932, "learning_rate": 0.000799519688383195, "loss": 0.1582, "step": 820 }, { "epoch": 1.556701030927835, "grad_norm": 0.36174824833869934, "learning_rate": 0.0007995138309244534, "loss": 0.1561, "step": 830 }, { "epoch": 1.5754451733833177, "grad_norm": 0.4403337836265564, "learning_rate": 0.0007995079734657119, "loss": 0.1472, "step": 840 }, { "epoch": 1.5941893158388005, "grad_norm": 0.384498655796051, "learning_rate": 0.0007995021160069704, "loss": 0.1458, "step": 850 }, { "epoch": 1.612933458294283, "grad_norm": 0.29897651076316833, "learning_rate": 0.0007994962585482289, "loss": 0.1528, "step": 860 }, { "epoch": 1.6316776007497658, "grad_norm": 0.3865436613559723, "learning_rate": 0.0007994904010894874, "loss": 0.1497, "step": 870 }, { "epoch": 1.6504217432052484, "grad_norm": 0.34619590640068054, "learning_rate": 0.0007994845436307458, "loss": 0.1425, "step": 880 }, { "epoch": 1.669165885660731, "grad_norm": 0.2863396108150482, "learning_rate": 0.0007994786861720043, "loss": 0.1441, "step": 890 }, { "epoch": 1.6879100281162138, "grad_norm": 0.371105432510376, "learning_rate": 0.0007994728287132628, "loss": 0.1437, "step": 900 }, { "epoch": 1.7066541705716962, "grad_norm": 0.3657528758049011, "learning_rate": 0.0007994669712545212, "loss": 0.1435, "step": 910 }, { "epoch": 1.725398313027179, "grad_norm": 0.333408385515213, "learning_rate": 0.0007994611137957798, "loss": 0.1322, "step": 920 }, { "epoch": 1.7441424554826617, "grad_norm": 0.34960120916366577, "learning_rate": 0.0007994552563370382, "loss": 0.1551, "step": 930 }, { "epoch": 1.7628865979381443, "grad_norm": 0.30177751183509827, "learning_rate": 0.0007994493988782967, "loss": 0.1641, "step": 940 }, { "epoch": 1.7816307403936271, "grad_norm": 0.39110997319221497, "learning_rate": 0.0007994435414195552, "loss": 0.1523, "step": 950 }, { "epoch": 1.8003748828491095, "grad_norm": 0.30561545491218567, "learning_rate": 0.0007994376839608136, "loss": 0.1516, "step": 960 }, { "epoch": 1.8191190253045924, "grad_norm": 0.32364317774772644, "learning_rate": 0.0007994318265020722, "loss": 0.145, "step": 970 }, { "epoch": 1.837863167760075, "grad_norm": 0.36380302906036377, "learning_rate": 0.0007994259690433306, "loss": 0.1576, "step": 980 }, { "epoch": 1.8566073102155576, "grad_norm": 0.3005361258983612, "learning_rate": 0.0007994201115845891, "loss": 0.1535, "step": 990 }, { "epoch": 1.8753514526710404, "grad_norm": 0.33928126096725464, "learning_rate": 0.0007994142541258476, "loss": 0.1741, "step": 1000 }, { "epoch": 1.8753514526710404, "eval_loss": 0.08202449977397919, "eval_pearson_cosine": 0.7594348192214966, "eval_pearson_dot": 0.7289378643035889, "eval_pearson_euclidean": 0.7542859315872192, "eval_pearson_manhattan": 0.7619277238845825, "eval_runtime": 27.4437, "eval_samples_per_second": 54.657, "eval_spearman_cosine": 0.7679965558172381, "eval_spearman_dot": 0.7298723962118674, "eval_spearman_euclidean": 0.7535528712822376, "eval_spearman_manhattan": 0.7621231744319878, "eval_steps_per_second": 6.85, "step": 1000 }, { "epoch": 1.8940955951265228, "grad_norm": 0.3324965834617615, "learning_rate": 0.000799408396667106, "loss": 0.1491, "step": 1010 }, { "epoch": 1.9128397375820057, "grad_norm": 0.3112243711948395, "learning_rate": 0.0007994025392083644, "loss": 0.1622, "step": 1020 }, { "epoch": 1.9315838800374883, "grad_norm": 0.3381972014904022, "learning_rate": 0.000799396681749623, "loss": 0.1462, "step": 1030 }, { "epoch": 1.9503280224929709, "grad_norm": 0.3424859642982483, "learning_rate": 0.0007993908242908814, "loss": 0.1651, "step": 1040 }, { "epoch": 1.9690721649484537, "grad_norm": 0.42161494493484497, "learning_rate": 0.0007993849668321399, "loss": 0.1521, "step": 1050 }, { "epoch": 1.9878163074039361, "grad_norm": 0.3541307747364044, "learning_rate": 0.0007993791093733984, "loss": 0.162, "step": 1060 }, { "epoch": 2.007497656982193, "grad_norm": 0.22963856160640717, "learning_rate": 0.0007993732519146568, "loss": 0.1297, "step": 1070 }, { "epoch": 2.026241799437676, "grad_norm": 0.28242990374565125, "learning_rate": 0.0007993673944559154, "loss": 0.0773, "step": 1080 }, { "epoch": 2.044985941893158, "grad_norm": 0.3516603112220764, "learning_rate": 0.0007993615369971738, "loss": 0.0799, "step": 1090 }, { "epoch": 2.063730084348641, "grad_norm": 0.3558428883552551, "learning_rate": 0.0007993556795384323, "loss": 0.0885, "step": 1100 }, { "epoch": 2.082474226804124, "grad_norm": 0.3211170732975006, "learning_rate": 0.0007993498220796908, "loss": 0.0825, "step": 1110 }, { "epoch": 2.1012183692596063, "grad_norm": 0.20844395458698273, "learning_rate": 0.0007993439646209492, "loss": 0.0763, "step": 1120 }, { "epoch": 2.119962511715089, "grad_norm": 0.3156029284000397, "learning_rate": 0.0007993381071622077, "loss": 0.0797, "step": 1130 }, { "epoch": 2.138706654170572, "grad_norm": 0.3986193835735321, "learning_rate": 0.0007993322497034662, "loss": 0.0852, "step": 1140 }, { "epoch": 2.1574507966260543, "grad_norm": 0.18681703507900238, "learning_rate": 0.0007993263922447247, "loss": 0.0779, "step": 1150 }, { "epoch": 2.176194939081537, "grad_norm": 0.2365262657403946, "learning_rate": 0.0007993205347859831, "loss": 0.0833, "step": 1160 }, { "epoch": 2.1949390815370196, "grad_norm": 0.25459378957748413, "learning_rate": 0.0007993146773272417, "loss": 0.0761, "step": 1170 }, { "epoch": 2.2136832239925024, "grad_norm": 0.39024218916893005, "learning_rate": 0.0007993088198685, "loss": 0.0873, "step": 1180 }, { "epoch": 2.2324273664479852, "grad_norm": 0.3662407100200653, "learning_rate": 0.0007993029624097585, "loss": 0.0842, "step": 1190 }, { "epoch": 2.2511715089034676, "grad_norm": 0.30686551332473755, "learning_rate": 0.0007992971049510171, "loss": 0.0845, "step": 1200 }, { "epoch": 2.2699156513589505, "grad_norm": 0.29860755801200867, "learning_rate": 0.0007992912474922755, "loss": 0.0806, "step": 1210 }, { "epoch": 2.288659793814433, "grad_norm": 0.272029310464859, "learning_rate": 0.0007992853900335341, "loss": 0.0849, "step": 1220 }, { "epoch": 2.3074039362699157, "grad_norm": 0.23034346103668213, "learning_rate": 0.0007992795325747924, "loss": 0.0873, "step": 1230 }, { "epoch": 2.3261480787253985, "grad_norm": 0.38400229811668396, "learning_rate": 0.0007992736751160509, "loss": 0.0854, "step": 1240 }, { "epoch": 2.344892221180881, "grad_norm": 0.2619285583496094, "learning_rate": 0.0007992678176573094, "loss": 0.0854, "step": 1250 }, { "epoch": 2.344892221180881, "eval_loss": 0.0549059733748436, "eval_pearson_cosine": 0.7565033435821533, "eval_pearson_dot": 0.7438405752182007, "eval_pearson_euclidean": 0.7398021221160889, "eval_pearson_manhattan": 0.7514023780822754, "eval_runtime": 27.2363, "eval_samples_per_second": 55.074, "eval_spearman_cosine": 0.7657686934808458, "eval_spearman_dot": 0.7450125999969373, "eval_spearman_euclidean": 0.7411997174627442, "eval_spearman_manhattan": 0.754436544283217, "eval_steps_per_second": 6.903, "step": 1250 }, { "epoch": 2.3636363636363638, "grad_norm": 0.2573038935661316, "learning_rate": 0.0007992619601985679, "loss": 0.0812, "step": 1260 }, { "epoch": 2.382380506091846, "grad_norm": 0.2684009373188019, "learning_rate": 0.0007992561027398263, "loss": 0.0833, "step": 1270 }, { "epoch": 2.401124648547329, "grad_norm": 0.2773861289024353, "learning_rate": 0.0007992502452810849, "loss": 0.0902, "step": 1280 }, { "epoch": 2.419868791002812, "grad_norm": 0.3180435001850128, "learning_rate": 0.0007992443878223433, "loss": 0.0882, "step": 1290 }, { "epoch": 2.438612933458294, "grad_norm": 0.2758583426475525, "learning_rate": 0.0007992385303636017, "loss": 0.0815, "step": 1300 }, { "epoch": 2.457357075913777, "grad_norm": 0.3327929973602295, "learning_rate": 0.0007992326729048603, "loss": 0.0949, "step": 1310 }, { "epoch": 2.4761012183692594, "grad_norm": 0.31645268201828003, "learning_rate": 0.0007992268154461187, "loss": 0.0942, "step": 1320 }, { "epoch": 2.4948453608247423, "grad_norm": 0.2587279975414276, "learning_rate": 0.0007992209579873773, "loss": 0.0889, "step": 1330 }, { "epoch": 2.513589503280225, "grad_norm": 0.29799187183380127, "learning_rate": 0.0007992151005286357, "loss": 0.1027, "step": 1340 }, { "epoch": 2.5323336457357075, "grad_norm": 0.3042343258857727, "learning_rate": 0.0007992092430698941, "loss": 0.0947, "step": 1350 }, { "epoch": 2.5510777881911904, "grad_norm": 0.36439308524131775, "learning_rate": 0.0007992033856111527, "loss": 0.0887, "step": 1360 }, { "epoch": 2.5698219306466727, "grad_norm": 0.24675941467285156, "learning_rate": 0.0007991975281524111, "loss": 0.0893, "step": 1370 }, { "epoch": 2.5885660731021556, "grad_norm": 0.3232560157775879, "learning_rate": 0.0007991916706936696, "loss": 0.0949, "step": 1380 }, { "epoch": 2.6073102155576384, "grad_norm": 0.3095908463001251, "learning_rate": 0.0007991858132349281, "loss": 0.0893, "step": 1390 }, { "epoch": 2.626054358013121, "grad_norm": 0.24996769428253174, "learning_rate": 0.0007991799557761866, "loss": 0.0918, "step": 1400 }, { "epoch": 2.6447985004686037, "grad_norm": 0.3013332486152649, "learning_rate": 0.0007991740983174449, "loss": 0.0965, "step": 1410 }, { "epoch": 2.663542642924086, "grad_norm": 0.43422290682792664, "learning_rate": 0.0007991682408587035, "loss": 0.1144, "step": 1420 }, { "epoch": 2.682286785379569, "grad_norm": 0.3462458848953247, "learning_rate": 0.0007991623833999619, "loss": 0.1068, "step": 1430 }, { "epoch": 2.7010309278350517, "grad_norm": 0.2752937078475952, "learning_rate": 0.0007991565259412205, "loss": 0.1048, "step": 1440 }, { "epoch": 2.719775070290534, "grad_norm": 0.33038660883903503, "learning_rate": 0.000799150668482479, "loss": 0.1055, "step": 1450 }, { "epoch": 2.738519212746017, "grad_norm": 0.28442054986953735, "learning_rate": 0.0007991448110237373, "loss": 0.1053, "step": 1460 }, { "epoch": 2.7572633552014993, "grad_norm": 0.25279343128204346, "learning_rate": 0.0007991389535649959, "loss": 0.109, "step": 1470 }, { "epoch": 2.776007497656982, "grad_norm": 0.3681808114051819, "learning_rate": 0.0007991330961062543, "loss": 0.1092, "step": 1480 }, { "epoch": 2.794751640112465, "grad_norm": 0.3884279429912567, "learning_rate": 0.0007991272386475128, "loss": 0.1105, "step": 1490 }, { "epoch": 2.8134957825679474, "grad_norm": 0.3542380928993225, "learning_rate": 0.0007991213811887713, "loss": 0.109, "step": 1500 }, { "epoch": 2.8134957825679474, "eval_loss": 0.06194353476166725, "eval_pearson_cosine": 0.7544945478439331, "eval_pearson_dot": 0.7297648787498474, "eval_pearson_euclidean": 0.7457708120346069, "eval_pearson_manhattan": 0.7537869215011597, "eval_runtime": 27.28, "eval_samples_per_second": 54.985, "eval_spearman_cosine": 0.7677406665753612, "eval_spearman_dot": 0.7355031880736892, "eval_spearman_euclidean": 0.752266788615453, "eval_spearman_manhattan": 0.7620929193607933, "eval_steps_per_second": 6.892, "step": 1500 }, { "epoch": 2.8322399250234302, "grad_norm": 0.28738659620285034, "learning_rate": 0.0007991155237300298, "loss": 0.1043, "step": 1510 }, { "epoch": 2.8509840674789126, "grad_norm": 0.39117714762687683, "learning_rate": 0.0007991096662712882, "loss": 0.0993, "step": 1520 }, { "epoch": 2.8697282099343955, "grad_norm": 0.3144415616989136, "learning_rate": 0.0007991038088125467, "loss": 0.1145, "step": 1530 }, { "epoch": 2.8884723523898783, "grad_norm": 0.28154823184013367, "learning_rate": 0.0007990979513538052, "loss": 0.1128, "step": 1540 }, { "epoch": 2.9072164948453607, "grad_norm": 0.3766768276691437, "learning_rate": 0.0007990920938950637, "loss": 0.1033, "step": 1550 }, { "epoch": 2.9259606373008435, "grad_norm": 0.38604792952537537, "learning_rate": 0.0007990862364363222, "loss": 0.1044, "step": 1560 }, { "epoch": 2.944704779756326, "grad_norm": 0.36833906173706055, "learning_rate": 0.0007990803789775806, "loss": 0.1159, "step": 1570 }, { "epoch": 2.963448922211809, "grad_norm": 0.3357650935649872, "learning_rate": 0.0007990745215188391, "loss": 0.1185, "step": 1580 }, { "epoch": 2.9821930646672916, "grad_norm": 0.30260348320007324, "learning_rate": 0.0007990686640600976, "loss": 0.1167, "step": 1590 }, { "epoch": 3.0018744142455485, "grad_norm": 0.28110650181770325, "learning_rate": 0.000799062806601356, "loss": 0.1115, "step": 1600 }, { "epoch": 3.020618556701031, "grad_norm": 0.32038745284080505, "learning_rate": 0.0007990569491426146, "loss": 0.0637, "step": 1610 }, { "epoch": 3.0393626991565137, "grad_norm": 0.29342755675315857, "learning_rate": 0.000799051091683873, "loss": 0.0687, "step": 1620 }, { "epoch": 3.058106841611996, "grad_norm": 0.33964619040489197, "learning_rate": 0.0007990452342251314, "loss": 0.0611, "step": 1630 }, { "epoch": 3.076850984067479, "grad_norm": 0.23580531775951385, "learning_rate": 0.0007990393767663899, "loss": 0.0635, "step": 1640 }, { "epoch": 3.0955951265229618, "grad_norm": 0.2617776393890381, "learning_rate": 0.0007990335193076484, "loss": 0.0709, "step": 1650 }, { "epoch": 3.114339268978444, "grad_norm": 0.25627410411834717, "learning_rate": 0.0007990276618489068, "loss": 0.0682, "step": 1660 }, { "epoch": 3.133083411433927, "grad_norm": 0.21987001597881317, "learning_rate": 0.0007990218043901654, "loss": 0.06, "step": 1670 }, { "epoch": 3.1518275538894094, "grad_norm": 0.2657093405723572, "learning_rate": 0.0007990159469314238, "loss": 0.0712, "step": 1680 }, { "epoch": 3.170571696344892, "grad_norm": 0.23929661512374878, "learning_rate": 0.0007990100894726823, "loss": 0.0566, "step": 1690 }, { "epoch": 3.189315838800375, "grad_norm": 0.23572145402431488, "learning_rate": 0.0007990042320139408, "loss": 0.0571, "step": 1700 }, { "epoch": 3.2080599812558575, "grad_norm": 0.26287132501602173, "learning_rate": 0.0007989983745551992, "loss": 0.067, "step": 1710 }, { "epoch": 3.2268041237113403, "grad_norm": 0.24504464864730835, "learning_rate": 0.0007989925170964578, "loss": 0.0637, "step": 1720 }, { "epoch": 3.2455482661668227, "grad_norm": 0.17006747424602509, "learning_rate": 0.0007989866596377162, "loss": 0.0552, "step": 1730 }, { "epoch": 3.2642924086223055, "grad_norm": 0.2752683460712433, "learning_rate": 0.0007989808021789747, "loss": 0.0639, "step": 1740 }, { "epoch": 3.2830365510777884, "grad_norm": 0.2681417465209961, "learning_rate": 0.0007989749447202332, "loss": 0.0705, "step": 1750 }, { "epoch": 3.2830365510777884, "eval_loss": 0.0486464686691761, "eval_pearson_cosine": 0.7632350921630859, "eval_pearson_dot": 0.7505504488945007, "eval_pearson_euclidean": 0.7458865642547607, "eval_pearson_manhattan": 0.7597954273223877, "eval_runtime": 27.3673, "eval_samples_per_second": 54.81, "eval_spearman_cosine": 0.7679814031707208, "eval_spearman_dot": 0.7517654374212466, "eval_spearman_euclidean": 0.7467275015139031, "eval_spearman_manhattan": 0.7607208640788498, "eval_steps_per_second": 6.87, "step": 1750 }, { "epoch": 3.3017806935332707, "grad_norm": 0.24346262216567993, "learning_rate": 0.0007989690872614916, "loss": 0.0658, "step": 1760 }, { "epoch": 3.3205248359887536, "grad_norm": 0.24957306683063507, "learning_rate": 0.0007989632298027502, "loss": 0.0643, "step": 1770 }, { "epoch": 3.3392689784442364, "grad_norm": 0.24416255950927734, "learning_rate": 0.0007989573723440086, "loss": 0.0626, "step": 1780 }, { "epoch": 3.358013120899719, "grad_norm": 0.2224712073802948, "learning_rate": 0.0007989515148852671, "loss": 0.0634, "step": 1790 }, { "epoch": 3.3767572633552017, "grad_norm": 0.27588558197021484, "learning_rate": 0.0007989456574265256, "loss": 0.0644, "step": 1800 }, { "epoch": 3.395501405810684, "grad_norm": 0.26377061009407043, "learning_rate": 0.000798939799967784, "loss": 0.0585, "step": 1810 }, { "epoch": 3.414245548266167, "grad_norm": 0.23178541660308838, "learning_rate": 0.0007989339425090424, "loss": 0.0588, "step": 1820 }, { "epoch": 3.4329896907216497, "grad_norm": 0.1893617808818817, "learning_rate": 0.000798928085050301, "loss": 0.0649, "step": 1830 }, { "epoch": 3.451733833177132, "grad_norm": 0.23445335030555725, "learning_rate": 0.0007989222275915595, "loss": 0.0629, "step": 1840 }, { "epoch": 3.470477975632615, "grad_norm": 0.457109659910202, "learning_rate": 0.0007989163701328179, "loss": 0.0646, "step": 1850 }, { "epoch": 3.4892221180880973, "grad_norm": 0.2316947728395462, "learning_rate": 0.0007989105126740764, "loss": 0.0677, "step": 1860 }, { "epoch": 3.50796626054358, "grad_norm": 0.26950669288635254, "learning_rate": 0.0007989046552153348, "loss": 0.0732, "step": 1870 }, { "epoch": 3.526710402999063, "grad_norm": 0.25258171558380127, "learning_rate": 0.0007988987977565933, "loss": 0.0635, "step": 1880 }, { "epoch": 3.5454545454545454, "grad_norm": 0.2282831370830536, "learning_rate": 0.0007988929402978518, "loss": 0.0766, "step": 1890 }, { "epoch": 3.5641986879100283, "grad_norm": 0.3049706220626831, "learning_rate": 0.0007988870828391103, "loss": 0.0766, "step": 1900 }, { "epoch": 3.5829428303655106, "grad_norm": 0.21556228399276733, "learning_rate": 0.0007988812253803688, "loss": 0.0694, "step": 1910 }, { "epoch": 3.6016869728209935, "grad_norm": 0.2859863340854645, "learning_rate": 0.0007988753679216272, "loss": 0.0665, "step": 1920 }, { "epoch": 3.6204311152764763, "grad_norm": 0.22522784769535065, "learning_rate": 0.0007988695104628857, "loss": 0.073, "step": 1930 }, { "epoch": 3.6391752577319587, "grad_norm": 0.3301334083080292, "learning_rate": 0.0007988636530041442, "loss": 0.0745, "step": 1940 }, { "epoch": 3.6579194001874415, "grad_norm": 0.21438319981098175, "learning_rate": 0.0007988577955454027, "loss": 0.0713, "step": 1950 }, { "epoch": 3.676663542642924, "grad_norm": 0.3207626938819885, "learning_rate": 0.0007988519380866611, "loss": 0.0759, "step": 1960 }, { "epoch": 3.695407685098407, "grad_norm": 0.25493231415748596, "learning_rate": 0.0007988460806279197, "loss": 0.0722, "step": 1970 }, { "epoch": 3.7141518275538896, "grad_norm": 0.2732018530368805, "learning_rate": 0.0007988402231691781, "loss": 0.0773, "step": 1980 }, { "epoch": 3.732895970009372, "grad_norm": 0.19611899554729462, "learning_rate": 0.0007988343657104365, "loss": 0.0773, "step": 1990 }, { "epoch": 3.751640112464855, "grad_norm": 0.2664394676685333, "learning_rate": 0.0007988285082516951, "loss": 0.072, "step": 2000 }, { "epoch": 3.751640112464855, "eval_loss": 0.05059043690562248, "eval_pearson_cosine": 0.7549334764480591, "eval_pearson_dot": 0.7364022731781006, "eval_pearson_euclidean": 0.7430644035339355, "eval_pearson_manhattan": 0.7528964281082153, "eval_runtime": 27.2774, "eval_samples_per_second": 54.991, "eval_spearman_cosine": 0.7612361982335023, "eval_spearman_dot": 0.7370856746295986, "eval_spearman_euclidean": 0.7449844586260276, "eval_spearman_manhattan": 0.7551494271561938, "eval_steps_per_second": 6.892, "step": 2000 } ], "logging_steps": 10, "max_steps": 5330, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }