diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" new file mode 100644--- /dev/null +++ "b/last-checkpoint/trainer_state.json" @@ -0,0 +1,5381 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.280224929709466, + "eval_steps": 250, + "global_step": 7000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.004686035613870665, + "grad_norm": 0.2652921974658966, + "learning_rate": 9.999414245548266e-06, + "loss": 3.4545, + "step": 10 + }, + { + "epoch": 0.00937207122774133, + "grad_norm": 0.9133270382881165, + "learning_rate": 9.998828491096533e-06, + "loss": 3.4005, + "step": 20 + }, + { + "epoch": 0.014058106841611996, + "grad_norm": 1.7206453084945679, + "learning_rate": 9.9982427366448e-06, + "loss": 3.2559, + "step": 30 + }, + { + "epoch": 0.01874414245548266, + "grad_norm": 1.3775238990783691, + "learning_rate": 9.997656982193065e-06, + "loss": 3.0152, + "step": 40 + }, + { + "epoch": 0.023430178069353328, + "grad_norm": 1.5043880939483643, + "learning_rate": 9.997071227741332e-06, + "loss": 2.904, + "step": 50 + }, + { + "epoch": 0.028116213683223992, + "grad_norm": 1.3289176225662231, + "learning_rate": 9.996485473289597e-06, + "loss": 2.7989, + "step": 60 + }, + { + "epoch": 0.03280224929709466, + "grad_norm": 1.5171725749969482, + "learning_rate": 9.995899718837864e-06, + "loss": 2.7384, + "step": 70 + }, + { + "epoch": 0.03748828491096532, + "grad_norm": 2.6390421390533447, + "learning_rate": 9.99531396438613e-06, + "loss": 2.6937, + "step": 80 + }, + { + "epoch": 0.04217432052483599, + "grad_norm": 2.382201671600342, + "learning_rate": 9.994728209934396e-06, + "loss": 2.5994, + "step": 90 + }, + { + "epoch": 0.046860356138706656, + "grad_norm": 3.3837928771972656, + "learning_rate": 9.994142455482663e-06, + "loss": 2.5344, + "step": 100 + }, + { + "epoch": 0.05154639175257732, + "grad_norm": 3.612391710281372, + "learning_rate": 9.993556701030928e-06, + "loss": 2.4489, + "step": 110 + }, + { + "epoch": 0.056232427366447985, + "grad_norm": 4.58568811416626, + "learning_rate": 9.992970946579195e-06, + "loss": 2.2835, + "step": 120 + }, + { + "epoch": 0.06091846298031865, + "grad_norm": 4.746246814727783, + "learning_rate": 9.99238519212746e-06, + "loss": 2.2496, + "step": 130 + }, + { + "epoch": 0.06560449859418932, + "grad_norm": 3.7526397705078125, + "learning_rate": 9.991799437675727e-06, + "loss": 2.0891, + "step": 140 + }, + { + "epoch": 0.07029053420805999, + "grad_norm": 3.860639810562134, + "learning_rate": 9.991213683223994e-06, + "loss": 1.988, + "step": 150 + }, + { + "epoch": 0.07497656982193064, + "grad_norm": 4.522952079772949, + "learning_rate": 9.99062792877226e-06, + "loss": 1.7019, + "step": 160 + }, + { + "epoch": 0.07966260543580131, + "grad_norm": 4.140243053436279, + "learning_rate": 9.990042174320525e-06, + "loss": 1.5405, + "step": 170 + }, + { + "epoch": 0.08434864104967198, + "grad_norm": 4.114628791809082, + "learning_rate": 9.989456419868792e-06, + "loss": 1.324, + "step": 180 + }, + { + "epoch": 0.08903467666354264, + "grad_norm": 4.97472620010376, + "learning_rate": 9.988870665417057e-06, + "loss": 1.2381, + "step": 190 + }, + { + "epoch": 0.09372071227741331, + "grad_norm": 4.743481636047363, + "learning_rate": 9.988284910965324e-06, + "loss": 1.1454, + "step": 200 + }, + { + "epoch": 0.09840674789128398, + "grad_norm": 4.455767631530762, + "learning_rate": 9.98769915651359e-06, + "loss": 1.1716, + "step": 210 + }, + { + "epoch": 0.10309278350515463, + "grad_norm": 4.199008941650391, + "learning_rate": 9.987113402061856e-06, + "loss": 1.1001, + "step": 220 + }, + { + "epoch": 0.1077788191190253, + "grad_norm": 4.948373794555664, + "learning_rate": 9.986527647610123e-06, + "loss": 1.0628, + "step": 230 + }, + { + "epoch": 0.11246485473289597, + "grad_norm": 4.091549396514893, + "learning_rate": 9.985941893158388e-06, + "loss": 0.9783, + "step": 240 + }, + { + "epoch": 0.11715089034676664, + "grad_norm": 3.7287604808807373, + "learning_rate": 9.985356138706655e-06, + "loss": 1.0084, + "step": 250 + }, + { + "epoch": 0.11715089034676664, + "eval_loss": 0.15787309408187866, + "eval_pearson_cosine": 0.6837765056882574, + "eval_pearson_dot": 0.3479516218854144, + "eval_pearson_euclidean": 0.662064077308699, + "eval_pearson_manhattan": 0.6615032406288108, + "eval_runtime": 39.6668, + "eval_samples_per_second": 37.815, + "eval_spearman_cosine": 0.6994144273095011, + "eval_spearman_dot": 0.34419950655316756, + "eval_spearman_euclidean": 0.6693727884988351, + "eval_spearman_manhattan": 0.66929953399832, + "eval_steps_per_second": 37.815, + "step": 250 + }, + { + "epoch": 0.1218369259606373, + "grad_norm": 4.118359565734863, + "learning_rate": 9.984770384254922e-06, + "loss": 1.0095, + "step": 260 + }, + { + "epoch": 0.12652296157450796, + "grad_norm": 4.312327861785889, + "learning_rate": 9.984184629803187e-06, + "loss": 0.8829, + "step": 270 + }, + { + "epoch": 0.13120899718837864, + "grad_norm": 4.199273586273193, + "learning_rate": 9.983598875351454e-06, + "loss": 0.9286, + "step": 280 + }, + { + "epoch": 0.1358950328022493, + "grad_norm": 4.146886348724365, + "learning_rate": 9.98301312089972e-06, + "loss": 0.935, + "step": 290 + }, + { + "epoch": 0.14058106841611998, + "grad_norm": 4.277599811553955, + "learning_rate": 9.982427366447985e-06, + "loss": 0.9089, + "step": 300 + }, + { + "epoch": 0.14526710402999063, + "grad_norm": 4.278558731079102, + "learning_rate": 9.981841611996253e-06, + "loss": 0.8515, + "step": 310 + }, + { + "epoch": 0.14995313964386128, + "grad_norm": 4.297761917114258, + "learning_rate": 9.981255857544518e-06, + "loss": 0.9178, + "step": 320 + }, + { + "epoch": 0.15463917525773196, + "grad_norm": 4.595494747161865, + "learning_rate": 9.980670103092784e-06, + "loss": 0.8835, + "step": 330 + }, + { + "epoch": 0.15932521087160262, + "grad_norm": 3.567511558532715, + "learning_rate": 9.98008434864105e-06, + "loss": 0.8519, + "step": 340 + }, + { + "epoch": 0.1640112464854733, + "grad_norm": 4.046640872955322, + "learning_rate": 9.979498594189316e-06, + "loss": 0.8736, + "step": 350 + }, + { + "epoch": 0.16869728209934395, + "grad_norm": 4.551926136016846, + "learning_rate": 9.978912839737583e-06, + "loss": 0.8348, + "step": 360 + }, + { + "epoch": 0.1733833177132146, + "grad_norm": 4.367603778839111, + "learning_rate": 9.97832708528585e-06, + "loss": 0.7652, + "step": 370 + }, + { + "epoch": 0.1780693533270853, + "grad_norm": 4.336862564086914, + "learning_rate": 9.977741330834115e-06, + "loss": 0.8035, + "step": 380 + }, + { + "epoch": 0.18275538894095594, + "grad_norm": 3.89697265625, + "learning_rate": 9.977155576382382e-06, + "loss": 0.7218, + "step": 390 + }, + { + "epoch": 0.18744142455482662, + "grad_norm": 4.190406799316406, + "learning_rate": 9.976569821930647e-06, + "loss": 0.8163, + "step": 400 + }, + { + "epoch": 0.19212746016869728, + "grad_norm": 4.282820224761963, + "learning_rate": 9.975984067478914e-06, + "loss": 0.7987, + "step": 410 + }, + { + "epoch": 0.19681349578256796, + "grad_norm": 3.7369320392608643, + "learning_rate": 9.97539831302718e-06, + "loss": 0.8085, + "step": 420 + }, + { + "epoch": 0.2014995313964386, + "grad_norm": 4.697251319885254, + "learning_rate": 9.974812558575446e-06, + "loss": 0.82, + "step": 430 + }, + { + "epoch": 0.20618556701030927, + "grad_norm": 3.3654944896698, + "learning_rate": 9.974226804123713e-06, + "loss": 0.7689, + "step": 440 + }, + { + "epoch": 0.21087160262417995, + "grad_norm": 4.78535270690918, + "learning_rate": 9.973641049671978e-06, + "loss": 0.704, + "step": 450 + }, + { + "epoch": 0.2155576382380506, + "grad_norm": 3.661980628967285, + "learning_rate": 9.973055295220243e-06, + "loss": 0.7202, + "step": 460 + }, + { + "epoch": 0.22024367385192128, + "grad_norm": 3.4837870597839355, + "learning_rate": 9.97246954076851e-06, + "loss": 0.7008, + "step": 470 + }, + { + "epoch": 0.22492970946579194, + "grad_norm": 4.407717704772949, + "learning_rate": 9.971883786316777e-06, + "loss": 0.7182, + "step": 480 + }, + { + "epoch": 0.2296157450796626, + "grad_norm": 3.2862961292266846, + "learning_rate": 9.971298031865042e-06, + "loss": 0.7351, + "step": 490 + }, + { + "epoch": 0.23430178069353327, + "grad_norm": 3.018371343612671, + "learning_rate": 9.97071227741331e-06, + "loss": 0.7072, + "step": 500 + }, + { + "epoch": 0.23430178069353327, + "eval_loss": 0.13636364042758942, + "eval_pearson_cosine": 0.7226338282187044, + "eval_pearson_dot": 0.40017542467791145, + "eval_pearson_euclidean": 0.7214409735618545, + "eval_pearson_manhattan": 0.7207106754958978, + "eval_runtime": 38.9491, + "eval_samples_per_second": 38.512, + "eval_spearman_cosine": 0.737511621673308, + "eval_spearman_dot": 0.39096188952446115, + "eval_spearman_euclidean": 0.7270501587145487, + "eval_spearman_manhattan": 0.7263223218978012, + "eval_steps_per_second": 38.512, + "step": 500 + }, + { + "epoch": 0.23898781630740393, + "grad_norm": 4.259694576263428, + "learning_rate": 9.970126522961575e-06, + "loss": 0.7365, + "step": 510 + }, + { + "epoch": 0.2436738519212746, + "grad_norm": 4.129746437072754, + "learning_rate": 9.969540768509842e-06, + "loss": 0.7005, + "step": 520 + }, + { + "epoch": 0.24835988753514526, + "grad_norm": 3.5990495681762695, + "learning_rate": 9.968955014058108e-06, + "loss": 0.6892, + "step": 530 + }, + { + "epoch": 0.2530459231490159, + "grad_norm": 4.118338584899902, + "learning_rate": 9.968369259606374e-06, + "loss": 0.7235, + "step": 540 + }, + { + "epoch": 0.25773195876288657, + "grad_norm": 3.2775630950927734, + "learning_rate": 9.96778350515464e-06, + "loss": 0.6567, + "step": 550 + }, + { + "epoch": 0.2624179943767573, + "grad_norm": 3.7363176345825195, + "learning_rate": 9.967197750702906e-06, + "loss": 0.6724, + "step": 560 + }, + { + "epoch": 0.26710402999062793, + "grad_norm": 3.317964792251587, + "learning_rate": 9.966611996251173e-06, + "loss": 0.6603, + "step": 570 + }, + { + "epoch": 0.2717900656044986, + "grad_norm": 4.1027703285217285, + "learning_rate": 9.966026241799438e-06, + "loss": 0.6384, + "step": 580 + }, + { + "epoch": 0.27647610121836924, + "grad_norm": 3.434147357940674, + "learning_rate": 9.965440487347705e-06, + "loss": 0.6555, + "step": 590 + }, + { + "epoch": 0.28116213683223995, + "grad_norm": 3.4407315254211426, + "learning_rate": 9.964854732895972e-06, + "loss": 0.6695, + "step": 600 + }, + { + "epoch": 0.2858481724461106, + "grad_norm": 3.800232172012329, + "learning_rate": 9.964268978444237e-06, + "loss": 0.6727, + "step": 610 + }, + { + "epoch": 0.29053420805998126, + "grad_norm": 3.191505193710327, + "learning_rate": 9.963683223992502e-06, + "loss": 0.6513, + "step": 620 + }, + { + "epoch": 0.2952202436738519, + "grad_norm": 4.152797222137451, + "learning_rate": 9.963097469540769e-06, + "loss": 0.7115, + "step": 630 + }, + { + "epoch": 0.29990627928772257, + "grad_norm": 3.339895248413086, + "learning_rate": 9.962511715089036e-06, + "loss": 0.6515, + "step": 640 + }, + { + "epoch": 0.3045923149015933, + "grad_norm": 3.2941200733184814, + "learning_rate": 9.961925960637301e-06, + "loss": 0.6024, + "step": 650 + }, + { + "epoch": 0.30927835051546393, + "grad_norm": 3.0853567123413086, + "learning_rate": 9.961340206185568e-06, + "loss": 0.6179, + "step": 660 + }, + { + "epoch": 0.3139643861293346, + "grad_norm": 3.827177047729492, + "learning_rate": 9.960754451733833e-06, + "loss": 0.6678, + "step": 670 + }, + { + "epoch": 0.31865042174320524, + "grad_norm": 3.8214621543884277, + "learning_rate": 9.9601686972821e-06, + "loss": 0.6248, + "step": 680 + }, + { + "epoch": 0.3233364573570759, + "grad_norm": 4.431512832641602, + "learning_rate": 9.959582942830366e-06, + "loss": 0.6562, + "step": 690 + }, + { + "epoch": 0.3280224929709466, + "grad_norm": 3.046006679534912, + "learning_rate": 9.958997188378632e-06, + "loss": 0.5865, + "step": 700 + }, + { + "epoch": 0.33270852858481725, + "grad_norm": 3.6602745056152344, + "learning_rate": 9.9584114339269e-06, + "loss": 0.611, + "step": 710 + }, + { + "epoch": 0.3373945641986879, + "grad_norm": 3.815204381942749, + "learning_rate": 9.957825679475165e-06, + "loss": 0.619, + "step": 720 + }, + { + "epoch": 0.34208059981255856, + "grad_norm": 3.5464768409729004, + "learning_rate": 9.957239925023431e-06, + "loss": 0.563, + "step": 730 + }, + { + "epoch": 0.3467666354264292, + "grad_norm": 3.254077911376953, + "learning_rate": 9.956654170571697e-06, + "loss": 0.6541, + "step": 740 + }, + { + "epoch": 0.3514526710402999, + "grad_norm": 3.465792655944824, + "learning_rate": 9.956068416119962e-06, + "loss": 0.6207, + "step": 750 + }, + { + "epoch": 0.3514526710402999, + "eval_loss": 0.11937286704778671, + "eval_pearson_cosine": 0.7370582778898296, + "eval_pearson_dot": 0.4517357649062319, + "eval_pearson_euclidean": 0.7300011163349307, + "eval_pearson_manhattan": 0.7294921991619532, + "eval_runtime": 40.2019, + "eval_samples_per_second": 37.312, + "eval_spearman_cosine": 0.7509292468775599, + "eval_spearman_dot": 0.44621372678317905, + "eval_spearman_euclidean": 0.7400885968162902, + "eval_spearman_manhattan": 0.7397623919578791, + "eval_steps_per_second": 37.312, + "step": 750 + }, + { + "epoch": 0.3561387066541706, + "grad_norm": 3.754749298095703, + "learning_rate": 9.95548266166823e-06, + "loss": 0.5565, + "step": 760 + }, + { + "epoch": 0.36082474226804123, + "grad_norm": 4.001673698425293, + "learning_rate": 9.954896907216496e-06, + "loss": 0.6017, + "step": 770 + }, + { + "epoch": 0.3655107778819119, + "grad_norm": 3.2223927974700928, + "learning_rate": 9.954311152764761e-06, + "loss": 0.6053, + "step": 780 + }, + { + "epoch": 0.3701968134957826, + "grad_norm": 3.135162353515625, + "learning_rate": 9.953725398313028e-06, + "loss": 0.5703, + "step": 790 + }, + { + "epoch": 0.37488284910965325, + "grad_norm": 3.6593549251556396, + "learning_rate": 9.953139643861293e-06, + "loss": 0.5683, + "step": 800 + }, + { + "epoch": 0.3795688847235239, + "grad_norm": 3.2421703338623047, + "learning_rate": 9.95255388940956e-06, + "loss": 0.6306, + "step": 810 + }, + { + "epoch": 0.38425492033739456, + "grad_norm": 3.2221267223358154, + "learning_rate": 9.951968134957827e-06, + "loss": 0.5942, + "step": 820 + }, + { + "epoch": 0.3889409559512652, + "grad_norm": 3.55708646774292, + "learning_rate": 9.951382380506092e-06, + "loss": 0.5695, + "step": 830 + }, + { + "epoch": 0.3936269915651359, + "grad_norm": 3.310382843017578, + "learning_rate": 9.950796626054359e-06, + "loss": 0.6249, + "step": 840 + }, + { + "epoch": 0.3983130271790066, + "grad_norm": 3.126347303390503, + "learning_rate": 9.950210871602624e-06, + "loss": 0.5515, + "step": 850 + }, + { + "epoch": 0.4029990627928772, + "grad_norm": 3.691459894180298, + "learning_rate": 9.949625117150891e-06, + "loss": 0.6074, + "step": 860 + }, + { + "epoch": 0.4076850984067479, + "grad_norm": 3.8297834396362305, + "learning_rate": 9.949039362699158e-06, + "loss": 0.6538, + "step": 870 + }, + { + "epoch": 0.41237113402061853, + "grad_norm": 3.097637414932251, + "learning_rate": 9.948453608247423e-06, + "loss": 0.5832, + "step": 880 + }, + { + "epoch": 0.41705716963448924, + "grad_norm": 4.003639221191406, + "learning_rate": 9.94786785379569e-06, + "loss": 0.5407, + "step": 890 + }, + { + "epoch": 0.4217432052483599, + "grad_norm": 3.648155689239502, + "learning_rate": 9.947282099343956e-06, + "loss": 0.5298, + "step": 900 + }, + { + "epoch": 0.42642924086223055, + "grad_norm": 3.071843147277832, + "learning_rate": 9.94669634489222e-06, + "loss": 0.4969, + "step": 910 + }, + { + "epoch": 0.4311152764761012, + "grad_norm": 3.8248066902160645, + "learning_rate": 9.94611059044049e-06, + "loss": 0.5894, + "step": 920 + }, + { + "epoch": 0.43580131208997186, + "grad_norm": 3.3905293941497803, + "learning_rate": 9.945524835988755e-06, + "loss": 0.5924, + "step": 930 + }, + { + "epoch": 0.44048734770384257, + "grad_norm": 3.8499722480773926, + "learning_rate": 9.94493908153702e-06, + "loss": 0.5636, + "step": 940 + }, + { + "epoch": 0.4451733833177132, + "grad_norm": 3.613633155822754, + "learning_rate": 9.944353327085287e-06, + "loss": 0.5631, + "step": 950 + }, + { + "epoch": 0.4498594189315839, + "grad_norm": 2.895521879196167, + "learning_rate": 9.943767572633552e-06, + "loss": 0.5791, + "step": 960 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 3.783475160598755, + "learning_rate": 9.943181818181819e-06, + "loss": 0.583, + "step": 970 + }, + { + "epoch": 0.4592314901593252, + "grad_norm": 3.0767059326171875, + "learning_rate": 9.942596063730086e-06, + "loss": 0.5625, + "step": 980 + }, + { + "epoch": 0.4639175257731959, + "grad_norm": 3.0430691242218018, + "learning_rate": 9.942010309278351e-06, + "loss": 0.6107, + "step": 990 + }, + { + "epoch": 0.46860356138706655, + "grad_norm": 3.4637937545776367, + "learning_rate": 9.941424554826618e-06, + "loss": 0.5767, + "step": 1000 + }, + { + "epoch": 0.46860356138706655, + "eval_loss": 0.11473686993122101, + "eval_pearson_cosine": 0.7508291149999735, + "eval_pearson_dot": 0.5170270771997139, + "eval_pearson_euclidean": 0.7400425527843311, + "eval_pearson_manhattan": 0.7394957204847508, + "eval_runtime": 40.3307, + "eval_samples_per_second": 37.192, + "eval_spearman_cosine": 0.7636438911546342, + "eval_spearman_dot": 0.5181197481922067, + "eval_spearman_euclidean": 0.7511102060810355, + "eval_spearman_manhattan": 0.750245305109835, + "eval_steps_per_second": 37.192, + "step": 1000 + }, + { + "epoch": 0.4732895970009372, + "grad_norm": 3.62758207321167, + "learning_rate": 9.940838800374883e-06, + "loss": 0.5533, + "step": 1010 + }, + { + "epoch": 0.47797563261480785, + "grad_norm": 2.8225605487823486, + "learning_rate": 9.94025304592315e-06, + "loss": 0.562, + "step": 1020 + }, + { + "epoch": 0.48266166822867856, + "grad_norm": 3.6451101303100586, + "learning_rate": 9.939667291471417e-06, + "loss": 0.5854, + "step": 1030 + }, + { + "epoch": 0.4873477038425492, + "grad_norm": 3.448962450027466, + "learning_rate": 9.939081537019682e-06, + "loss": 0.6032, + "step": 1040 + }, + { + "epoch": 0.49203373945641987, + "grad_norm": 3.0601136684417725, + "learning_rate": 9.938495782567949e-06, + "loss": 0.5636, + "step": 1050 + }, + { + "epoch": 0.4967197750702905, + "grad_norm": 3.154737710952759, + "learning_rate": 9.937910028116214e-06, + "loss": 0.5563, + "step": 1060 + }, + { + "epoch": 0.5014058106841612, + "grad_norm": 3.4425132274627686, + "learning_rate": 9.93732427366448e-06, + "loss": 0.5956, + "step": 1070 + }, + { + "epoch": 0.5060918462980318, + "grad_norm": 3.6232352256774902, + "learning_rate": 9.936738519212746e-06, + "loss": 0.5745, + "step": 1080 + }, + { + "epoch": 0.5107778819119025, + "grad_norm": 3.701143980026245, + "learning_rate": 9.936152764761013e-06, + "loss": 0.583, + "step": 1090 + }, + { + "epoch": 0.5154639175257731, + "grad_norm": 3.6037821769714355, + "learning_rate": 9.935567010309279e-06, + "loss": 0.5367, + "step": 1100 + }, + { + "epoch": 0.5201499531396439, + "grad_norm": 3.333034038543701, + "learning_rate": 9.934981255857546e-06, + "loss": 0.5675, + "step": 1110 + }, + { + "epoch": 0.5248359887535146, + "grad_norm": 3.335798501968384, + "learning_rate": 9.93439550140581e-06, + "loss": 0.582, + "step": 1120 + }, + { + "epoch": 0.5295220243673852, + "grad_norm": 2.90682315826416, + "learning_rate": 9.933809746954078e-06, + "loss": 0.5866, + "step": 1130 + }, + { + "epoch": 0.5342080599812559, + "grad_norm": 3.268754482269287, + "learning_rate": 9.933223992502345e-06, + "loss": 0.5832, + "step": 1140 + }, + { + "epoch": 0.5388940955951266, + "grad_norm": 3.0026419162750244, + "learning_rate": 9.93263823805061e-06, + "loss": 0.5303, + "step": 1150 + }, + { + "epoch": 0.5435801312089972, + "grad_norm": 3.3386096954345703, + "learning_rate": 9.932052483598877e-06, + "loss": 0.525, + "step": 1160 + }, + { + "epoch": 0.5482661668228679, + "grad_norm": 4.069448947906494, + "learning_rate": 9.931466729147142e-06, + "loss": 0.5738, + "step": 1170 + }, + { + "epoch": 0.5529522024367385, + "grad_norm": 2.9254677295684814, + "learning_rate": 9.930880974695409e-06, + "loss": 0.5529, + "step": 1180 + }, + { + "epoch": 0.5576382380506092, + "grad_norm": 3.216367721557617, + "learning_rate": 9.930295220243674e-06, + "loss": 0.5646, + "step": 1190 + }, + { + "epoch": 0.5623242736644799, + "grad_norm": 3.7063798904418945, + "learning_rate": 9.929709465791941e-06, + "loss": 0.5793, + "step": 1200 + }, + { + "epoch": 0.5670103092783505, + "grad_norm": 3.1322641372680664, + "learning_rate": 9.929123711340208e-06, + "loss": 0.5415, + "step": 1210 + }, + { + "epoch": 0.5716963448922212, + "grad_norm": 2.747403621673584, + "learning_rate": 9.928537956888473e-06, + "loss": 0.5468, + "step": 1220 + }, + { + "epoch": 0.5763823805060918, + "grad_norm": 3.0426528453826904, + "learning_rate": 9.927952202436738e-06, + "loss": 0.5755, + "step": 1230 + }, + { + "epoch": 0.5810684161199625, + "grad_norm": 2.9256389141082764, + "learning_rate": 9.927366447985005e-06, + "loss": 0.5237, + "step": 1240 + }, + { + "epoch": 0.5857544517338332, + "grad_norm": 3.5383565425872803, + "learning_rate": 9.92678069353327e-06, + "loss": 0.5026, + "step": 1250 + }, + { + "epoch": 0.5857544517338332, + "eval_loss": 0.10466309636831284, + "eval_pearson_cosine": 0.7506514546227478, + "eval_pearson_dot": 0.5487479231049726, + "eval_pearson_euclidean": 0.7459287854123886, + "eval_pearson_manhattan": 0.7455307169513361, + "eval_runtime": 39.6161, + "eval_samples_per_second": 37.863, + "eval_spearman_cosine": 0.7634978803745991, + "eval_spearman_dot": 0.5530948044816919, + "eval_spearman_euclidean": 0.7564382757079575, + "eval_spearman_manhattan": 0.7557623143729565, + "eval_steps_per_second": 37.863, + "step": 1250 + }, + { + "epoch": 0.5904404873477038, + "grad_norm": 4.084426403045654, + "learning_rate": 9.926194939081537e-06, + "loss": 0.5576, + "step": 1260 + }, + { + "epoch": 0.5951265229615745, + "grad_norm": 3.5379457473754883, + "learning_rate": 9.925609184629804e-06, + "loss": 0.547, + "step": 1270 + }, + { + "epoch": 0.5998125585754451, + "grad_norm": 2.8720273971557617, + "learning_rate": 9.92502343017807e-06, + "loss": 0.5385, + "step": 1280 + }, + { + "epoch": 0.6044985941893158, + "grad_norm": 2.877453565597534, + "learning_rate": 9.924437675726336e-06, + "loss": 0.5114, + "step": 1290 + }, + { + "epoch": 0.6091846298031866, + "grad_norm": 3.771329164505005, + "learning_rate": 9.923851921274602e-06, + "loss": 0.5133, + "step": 1300 + }, + { + "epoch": 0.6138706654170571, + "grad_norm": 2.411574125289917, + "learning_rate": 9.923266166822869e-06, + "loss": 0.5207, + "step": 1310 + }, + { + "epoch": 0.6185567010309279, + "grad_norm": 3.733808755874634, + "learning_rate": 9.922680412371136e-06, + "loss": 0.5043, + "step": 1320 + }, + { + "epoch": 0.6232427366447985, + "grad_norm": 2.5523786544799805, + "learning_rate": 9.9220946579194e-06, + "loss": 0.5142, + "step": 1330 + }, + { + "epoch": 0.6279287722586692, + "grad_norm": 2.743069887161255, + "learning_rate": 9.921508903467668e-06, + "loss": 0.5184, + "step": 1340 + }, + { + "epoch": 0.6326148078725399, + "grad_norm": 3.086749792098999, + "learning_rate": 9.920923149015933e-06, + "loss": 0.5135, + "step": 1350 + }, + { + "epoch": 0.6373008434864105, + "grad_norm": 3.3106496334075928, + "learning_rate": 9.9203373945642e-06, + "loss": 0.5107, + "step": 1360 + }, + { + "epoch": 0.6419868791002812, + "grad_norm": 3.2946176528930664, + "learning_rate": 9.919751640112467e-06, + "loss": 0.5112, + "step": 1370 + }, + { + "epoch": 0.6466729147141518, + "grad_norm": 3.7036850452423096, + "learning_rate": 9.919165885660732e-06, + "loss": 0.5112, + "step": 1380 + }, + { + "epoch": 0.6513589503280225, + "grad_norm": 3.8145229816436768, + "learning_rate": 9.918580131208997e-06, + "loss": 0.555, + "step": 1390 + }, + { + "epoch": 0.6560449859418932, + "grad_norm": 3.3321220874786377, + "learning_rate": 9.917994376757264e-06, + "loss": 0.4917, + "step": 1400 + }, + { + "epoch": 0.6607310215557638, + "grad_norm": 2.517955780029297, + "learning_rate": 9.91740862230553e-06, + "loss": 0.527, + "step": 1410 + }, + { + "epoch": 0.6654170571696345, + "grad_norm": 3.0029234886169434, + "learning_rate": 9.916822867853796e-06, + "loss": 0.5192, + "step": 1420 + }, + { + "epoch": 0.6701030927835051, + "grad_norm": 2.7767951488494873, + "learning_rate": 9.916237113402063e-06, + "loss": 0.5526, + "step": 1430 + }, + { + "epoch": 0.6747891283973758, + "grad_norm": 3.2969696521759033, + "learning_rate": 9.915651358950328e-06, + "loss": 0.5444, + "step": 1440 + }, + { + "epoch": 0.6794751640112465, + "grad_norm": 3.046128034591675, + "learning_rate": 9.915065604498595e-06, + "loss": 0.479, + "step": 1450 + }, + { + "epoch": 0.6841611996251171, + "grad_norm": 3.239370822906494, + "learning_rate": 9.91447985004686e-06, + "loss": 0.4626, + "step": 1460 + }, + { + "epoch": 0.6888472352389878, + "grad_norm": 3.3885092735290527, + "learning_rate": 9.913894095595127e-06, + "loss": 0.5115, + "step": 1470 + }, + { + "epoch": 0.6935332708528584, + "grad_norm": 3.279419422149658, + "learning_rate": 9.913308341143394e-06, + "loss": 0.5054, + "step": 1480 + }, + { + "epoch": 0.6982193064667291, + "grad_norm": 3.451601028442383, + "learning_rate": 9.91272258669166e-06, + "loss": 0.5124, + "step": 1490 + }, + { + "epoch": 0.7029053420805998, + "grad_norm": 2.9452903270721436, + "learning_rate": 9.912136832239926e-06, + "loss": 0.5192, + "step": 1500 + }, + { + "epoch": 0.7029053420805998, + "eval_loss": 0.11661049723625183, + "eval_pearson_cosine": 0.7522095786209633, + "eval_pearson_dot": 0.5055151019348045, + "eval_pearson_euclidean": 0.7488737825239156, + "eval_pearson_manhattan": 0.7487340190517244, + "eval_runtime": 39.6015, + "eval_samples_per_second": 37.877, + "eval_spearman_cosine": 0.7673251411045983, + "eval_spearman_dot": 0.5052800577910852, + "eval_spearman_euclidean": 0.7594003970324229, + "eval_spearman_manhattan": 0.7590967348946761, + "eval_steps_per_second": 37.877, + "step": 1500 + }, + { + "epoch": 0.7075913776944704, + "grad_norm": 3.0128912925720215, + "learning_rate": 9.911551077788192e-06, + "loss": 0.4896, + "step": 1510 + }, + { + "epoch": 0.7122774133083412, + "grad_norm": 2.955695152282715, + "learning_rate": 9.910965323336459e-06, + "loss": 0.5161, + "step": 1520 + }, + { + "epoch": 0.7169634489222118, + "grad_norm": 2.8971164226531982, + "learning_rate": 9.910379568884726e-06, + "loss": 0.4868, + "step": 1530 + }, + { + "epoch": 0.7216494845360825, + "grad_norm": 2.9848475456237793, + "learning_rate": 9.90979381443299e-06, + "loss": 0.4919, + "step": 1540 + }, + { + "epoch": 0.7263355201499532, + "grad_norm": 3.1218748092651367, + "learning_rate": 9.909208059981256e-06, + "loss": 0.4954, + "step": 1550 + }, + { + "epoch": 0.7310215557638238, + "grad_norm": 3.0474631786346436, + "learning_rate": 9.908622305529523e-06, + "loss": 0.5007, + "step": 1560 + }, + { + "epoch": 0.7357075913776945, + "grad_norm": 2.8136134147644043, + "learning_rate": 9.908036551077788e-06, + "loss": 0.4998, + "step": 1570 + }, + { + "epoch": 0.7403936269915652, + "grad_norm": 2.537595272064209, + "learning_rate": 9.907450796626055e-06, + "loss": 0.4759, + "step": 1580 + }, + { + "epoch": 0.7450796626054358, + "grad_norm": 3.2701878547668457, + "learning_rate": 9.906865042174322e-06, + "loss": 0.5106, + "step": 1590 + }, + { + "epoch": 0.7497656982193065, + "grad_norm": 3.289370536804199, + "learning_rate": 9.906279287722587e-06, + "loss": 0.513, + "step": 1600 + }, + { + "epoch": 0.7544517338331771, + "grad_norm": 3.6762607097625732, + "learning_rate": 9.905693533270854e-06, + "loss": 0.4854, + "step": 1610 + }, + { + "epoch": 0.7591377694470478, + "grad_norm": 3.083775281906128, + "learning_rate": 9.90510777881912e-06, + "loss": 0.4831, + "step": 1620 + }, + { + "epoch": 0.7638238050609185, + "grad_norm": 3.9659175872802734, + "learning_rate": 9.904522024367386e-06, + "loss": 0.4634, + "step": 1630 + }, + { + "epoch": 0.7685098406747891, + "grad_norm": 3.014983654022217, + "learning_rate": 9.903936269915653e-06, + "loss": 0.4979, + "step": 1640 + }, + { + "epoch": 0.7731958762886598, + "grad_norm": 2.3927905559539795, + "learning_rate": 9.903350515463918e-06, + "loss": 0.528, + "step": 1650 + }, + { + "epoch": 0.7778819119025304, + "grad_norm": 3.1232478618621826, + "learning_rate": 9.902764761012185e-06, + "loss": 0.5355, + "step": 1660 + }, + { + "epoch": 0.7825679475164011, + "grad_norm": 3.5926144123077393, + "learning_rate": 9.90217900656045e-06, + "loss": 0.4849, + "step": 1670 + }, + { + "epoch": 0.7872539831302718, + "grad_norm": 3.9513533115386963, + "learning_rate": 9.901593252108717e-06, + "loss": 0.505, + "step": 1680 + }, + { + "epoch": 0.7919400187441424, + "grad_norm": 2.9392502307891846, + "learning_rate": 9.901007497656983e-06, + "loss": 0.5358, + "step": 1690 + }, + { + "epoch": 0.7966260543580131, + "grad_norm": 2.7503395080566406, + "learning_rate": 9.90042174320525e-06, + "loss": 0.5267, + "step": 1700 + }, + { + "epoch": 0.8013120899718837, + "grad_norm": 2.9122848510742188, + "learning_rate": 9.899835988753515e-06, + "loss": 0.508, + "step": 1710 + }, + { + "epoch": 0.8059981255857545, + "grad_norm": 3.3603107929229736, + "learning_rate": 9.899250234301782e-06, + "loss": 0.4904, + "step": 1720 + }, + { + "epoch": 0.8106841611996252, + "grad_norm": 2.9802725315093994, + "learning_rate": 9.898664479850047e-06, + "loss": 0.5019, + "step": 1730 + }, + { + "epoch": 0.8153701968134958, + "grad_norm": 3.4380977153778076, + "learning_rate": 9.898078725398314e-06, + "loss": 0.4894, + "step": 1740 + }, + { + "epoch": 0.8200562324273665, + "grad_norm": 3.106285572052002, + "learning_rate": 9.897492970946579e-06, + "loss": 0.5046, + "step": 1750 + }, + { + "epoch": 0.8200562324273665, + "eval_loss": 0.11102449893951416, + "eval_pearson_cosine": 0.7554957932917077, + "eval_pearson_dot": 0.5302646386582808, + "eval_pearson_euclidean": 0.7581196621481752, + "eval_pearson_manhattan": 0.7582295870999474, + "eval_runtime": 39.7307, + "eval_samples_per_second": 37.754, + "eval_spearman_cosine": 0.7675167675754492, + "eval_spearman_dot": 0.5391446720291554, + "eval_spearman_euclidean": 0.767156398991665, + "eval_spearman_manhattan": 0.7675093547587875, + "eval_steps_per_second": 37.754, + "step": 1750 + }, + { + "epoch": 0.8247422680412371, + "grad_norm": 3.348632574081421, + "learning_rate": 9.896907216494846e-06, + "loss": 0.4758, + "step": 1760 + }, + { + "epoch": 0.8294283036551078, + "grad_norm": 3.240816831588745, + "learning_rate": 9.896321462043113e-06, + "loss": 0.4631, + "step": 1770 + }, + { + "epoch": 0.8341143392689785, + "grad_norm": 3.1542975902557373, + "learning_rate": 9.895735707591378e-06, + "loss": 0.492, + "step": 1780 + }, + { + "epoch": 0.8388003748828491, + "grad_norm": 2.405806064605713, + "learning_rate": 9.895149953139645e-06, + "loss": 0.5083, + "step": 1790 + }, + { + "epoch": 0.8434864104967198, + "grad_norm": 3.3465261459350586, + "learning_rate": 9.89456419868791e-06, + "loss": 0.4179, + "step": 1800 + }, + { + "epoch": 0.8481724461105904, + "grad_norm": 3.382632255554199, + "learning_rate": 9.893978444236177e-06, + "loss": 0.4982, + "step": 1810 + }, + { + "epoch": 0.8528584817244611, + "grad_norm": 2.5863983631134033, + "learning_rate": 9.893392689784444e-06, + "loss": 0.5203, + "step": 1820 + }, + { + "epoch": 0.8575445173383318, + "grad_norm": 2.830101251602173, + "learning_rate": 9.89280693533271e-06, + "loss": 0.4997, + "step": 1830 + }, + { + "epoch": 0.8622305529522024, + "grad_norm": 3.096226453781128, + "learning_rate": 9.892221180880976e-06, + "loss": 0.493, + "step": 1840 + }, + { + "epoch": 0.8669165885660731, + "grad_norm": 3.1559505462646484, + "learning_rate": 9.891635426429241e-06, + "loss": 0.4968, + "step": 1850 + }, + { + "epoch": 0.8716026241799437, + "grad_norm": 3.1153781414031982, + "learning_rate": 9.891049671977507e-06, + "loss": 0.5037, + "step": 1860 + }, + { + "epoch": 0.8762886597938144, + "grad_norm": 3.222080945968628, + "learning_rate": 9.890463917525774e-06, + "loss": 0.538, + "step": 1870 + }, + { + "epoch": 0.8809746954076851, + "grad_norm": 2.716965436935425, + "learning_rate": 9.88987816307404e-06, + "loss": 0.4525, + "step": 1880 + }, + { + "epoch": 0.8856607310215557, + "grad_norm": 3.6638436317443848, + "learning_rate": 9.889292408622306e-06, + "loss": 0.4959, + "step": 1890 + }, + { + "epoch": 0.8903467666354264, + "grad_norm": 2.795483350753784, + "learning_rate": 9.888706654170573e-06, + "loss": 0.5156, + "step": 1900 + }, + { + "epoch": 0.895032802249297, + "grad_norm": 3.4826247692108154, + "learning_rate": 9.888120899718838e-06, + "loss": 0.5136, + "step": 1910 + }, + { + "epoch": 0.8997188378631678, + "grad_norm": 3.1624643802642822, + "learning_rate": 9.887535145267105e-06, + "loss": 0.4488, + "step": 1920 + }, + { + "epoch": 0.9044048734770385, + "grad_norm": 3.1892471313476562, + "learning_rate": 9.886949390815372e-06, + "loss": 0.4611, + "step": 1930 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 3.075989007949829, + "learning_rate": 9.886363636363637e-06, + "loss": 0.4376, + "step": 1940 + }, + { + "epoch": 0.9137769447047798, + "grad_norm": 3.629077672958374, + "learning_rate": 9.885777881911904e-06, + "loss": 0.5039, + "step": 1950 + }, + { + "epoch": 0.9184629803186504, + "grad_norm": 3.424208402633667, + "learning_rate": 9.885192127460169e-06, + "loss": 0.5169, + "step": 1960 + }, + { + "epoch": 0.9231490159325211, + "grad_norm": 2.9203758239746094, + "learning_rate": 9.884606373008436e-06, + "loss": 0.4944, + "step": 1970 + }, + { + "epoch": 0.9278350515463918, + "grad_norm": 3.0997154712677, + "learning_rate": 9.884020618556703e-06, + "loss": 0.4586, + "step": 1980 + }, + { + "epoch": 0.9325210871602624, + "grad_norm": 2.3105781078338623, + "learning_rate": 9.883434864104968e-06, + "loss": 0.5109, + "step": 1990 + }, + { + "epoch": 0.9372071227741331, + "grad_norm": 3.348557472229004, + "learning_rate": 9.882849109653235e-06, + "loss": 0.5055, + "step": 2000 + }, + { + "epoch": 0.9372071227741331, + "eval_loss": 0.10619169473648071, + "eval_pearson_cosine": 0.7545903634746969, + "eval_pearson_dot": 0.5637526434800861, + "eval_pearson_euclidean": 0.7502010625678288, + "eval_pearson_manhattan": 0.7501144315992683, + "eval_runtime": 39.4727, + "eval_samples_per_second": 38.001, + "eval_spearman_cosine": 0.7726348299355251, + "eval_spearman_dot": 0.571016162108862, + "eval_spearman_euclidean": 0.7650905718991688, + "eval_spearman_manhattan": 0.7650111430170369, + "eval_steps_per_second": 38.001, + "step": 2000 + }, + { + "epoch": 0.9418931583880038, + "grad_norm": 2.4951772689819336, + "learning_rate": 9.8822633552015e-06, + "loss": 0.4542, + "step": 2010 + }, + { + "epoch": 0.9465791940018744, + "grad_norm": 2.8216359615325928, + "learning_rate": 9.881677600749765e-06, + "loss": 0.4823, + "step": 2020 + }, + { + "epoch": 0.9512652296157451, + "grad_norm": 3.510946035385132, + "learning_rate": 9.881091846298032e-06, + "loss": 0.464, + "step": 2030 + }, + { + "epoch": 0.9559512652296157, + "grad_norm": 3.307612419128418, + "learning_rate": 9.8805060918463e-06, + "loss": 0.4641, + "step": 2040 + }, + { + "epoch": 0.9606373008434864, + "grad_norm": 3.4705116748809814, + "learning_rate": 9.879920337394564e-06, + "loss": 0.4643, + "step": 2050 + }, + { + "epoch": 0.9653233364573571, + "grad_norm": 3.5456154346466064, + "learning_rate": 9.879334582942831e-06, + "loss": 0.4673, + "step": 2060 + }, + { + "epoch": 0.9700093720712277, + "grad_norm": 3.027689218521118, + "learning_rate": 9.878748828491097e-06, + "loss": 0.4997, + "step": 2070 + }, + { + "epoch": 0.9746954076850984, + "grad_norm": 3.0385146141052246, + "learning_rate": 9.878163074039364e-06, + "loss": 0.4941, + "step": 2080 + }, + { + "epoch": 0.979381443298969, + "grad_norm": 2.4981963634490967, + "learning_rate": 9.87757731958763e-06, + "loss": 0.4553, + "step": 2090 + }, + { + "epoch": 0.9840674789128397, + "grad_norm": 2.5605556964874268, + "learning_rate": 9.876991565135896e-06, + "loss": 0.4515, + "step": 2100 + }, + { + "epoch": 0.9887535145267105, + "grad_norm": 3.263235330581665, + "learning_rate": 9.876405810684163e-06, + "loss": 0.5033, + "step": 2110 + }, + { + "epoch": 0.993439550140581, + "grad_norm": 3.3904616832733154, + "learning_rate": 9.875820056232428e-06, + "loss": 0.4285, + "step": 2120 + }, + { + "epoch": 0.9981255857544518, + "grad_norm": 3.5745186805725098, + "learning_rate": 9.875234301780695e-06, + "loss": 0.4793, + "step": 2130 + }, + { + "epoch": 1.0028116213683225, + "grad_norm": 2.888436794281006, + "learning_rate": 9.874648547328962e-06, + "loss": 0.4291, + "step": 2140 + }, + { + "epoch": 1.007497656982193, + "grad_norm": 2.378056526184082, + "learning_rate": 9.874062792877227e-06, + "loss": 0.3839, + "step": 2150 + }, + { + "epoch": 1.0121836925960637, + "grad_norm": 3.4162449836730957, + "learning_rate": 9.873477038425494e-06, + "loss": 0.3953, + "step": 2160 + }, + { + "epoch": 1.0168697282099344, + "grad_norm": 3.428281545639038, + "learning_rate": 9.872891283973759e-06, + "loss": 0.3927, + "step": 2170 + }, + { + "epoch": 1.021555763823805, + "grad_norm": 2.3197989463806152, + "learning_rate": 9.872305529522024e-06, + "loss": 0.3879, + "step": 2180 + }, + { + "epoch": 1.0262417994376758, + "grad_norm": 2.783487558364868, + "learning_rate": 9.871719775070291e-06, + "loss": 0.3992, + "step": 2190 + }, + { + "epoch": 1.0309278350515463, + "grad_norm": 3.5884554386138916, + "learning_rate": 9.871134020618558e-06, + "loss": 0.3921, + "step": 2200 + }, + { + "epoch": 1.035613870665417, + "grad_norm": 3.041579008102417, + "learning_rate": 9.870548266166823e-06, + "loss": 0.4212, + "step": 2210 + }, + { + "epoch": 1.0402999062792877, + "grad_norm": 3.096118688583374, + "learning_rate": 9.86996251171509e-06, + "loss": 0.388, + "step": 2220 + }, + { + "epoch": 1.0449859418931584, + "grad_norm": 3.3495426177978516, + "learning_rate": 9.869376757263355e-06, + "loss": 0.4283, + "step": 2230 + }, + { + "epoch": 1.0496719775070291, + "grad_norm": 3.1030259132385254, + "learning_rate": 9.868791002811622e-06, + "loss": 0.3841, + "step": 2240 + }, + { + "epoch": 1.0543580131208996, + "grad_norm": 2.785508155822754, + "learning_rate": 9.868205248359888e-06, + "loss": 0.4177, + "step": 2250 + }, + { + "epoch": 1.0543580131208996, + "eval_loss": 0.09417638927698135, + "eval_pearson_cosine": 0.7576901427376015, + "eval_pearson_dot": 0.5576948875195811, + "eval_pearson_euclidean": 0.750992670186065, + "eval_pearson_manhattan": 0.7511405973753114, + "eval_runtime": 39.4437, + "eval_samples_per_second": 38.029, + "eval_spearman_cosine": 0.7708766062647188, + "eval_spearman_dot": 0.5635255554893936, + "eval_spearman_euclidean": 0.7633148873618476, + "eval_spearman_manhattan": 0.7634983981685618, + "eval_steps_per_second": 38.029, + "step": 2250 + }, + { + "epoch": 1.0590440487347703, + "grad_norm": 2.5779619216918945, + "learning_rate": 9.867619493908154e-06, + "loss": 0.4064, + "step": 2260 + }, + { + "epoch": 1.063730084348641, + "grad_norm": 3.4719700813293457, + "learning_rate": 9.867033739456421e-06, + "loss": 0.424, + "step": 2270 + }, + { + "epoch": 1.0684161199625117, + "grad_norm": 2.8868348598480225, + "learning_rate": 9.866447985004687e-06, + "loss": 0.3964, + "step": 2280 + }, + { + "epoch": 1.0731021555763824, + "grad_norm": 3.0720372200012207, + "learning_rate": 9.865862230552954e-06, + "loss": 0.4319, + "step": 2290 + }, + { + "epoch": 1.077788191190253, + "grad_norm": 2.9588916301727295, + "learning_rate": 9.865276476101219e-06, + "loss": 0.4045, + "step": 2300 + }, + { + "epoch": 1.0824742268041236, + "grad_norm": 2.9125168323516846, + "learning_rate": 9.864690721649486e-06, + "loss": 0.4147, + "step": 2310 + }, + { + "epoch": 1.0871602624179943, + "grad_norm": 3.1080446243286133, + "learning_rate": 9.864104967197753e-06, + "loss": 0.3792, + "step": 2320 + }, + { + "epoch": 1.091846298031865, + "grad_norm": 3.130577802658081, + "learning_rate": 9.863519212746018e-06, + "loss": 0.3995, + "step": 2330 + }, + { + "epoch": 1.0965323336457358, + "grad_norm": 3.0566961765289307, + "learning_rate": 9.862933458294283e-06, + "loss": 0.4044, + "step": 2340 + }, + { + "epoch": 1.1012183692596063, + "grad_norm": 3.1854050159454346, + "learning_rate": 9.86234770384255e-06, + "loss": 0.3703, + "step": 2350 + }, + { + "epoch": 1.105904404873477, + "grad_norm": 3.034507989883423, + "learning_rate": 9.861761949390815e-06, + "loss": 0.4062, + "step": 2360 + }, + { + "epoch": 1.1105904404873477, + "grad_norm": 3.1277084350585938, + "learning_rate": 9.861176194939082e-06, + "loss": 0.3733, + "step": 2370 + }, + { + "epoch": 1.1152764761012184, + "grad_norm": 2.593057155609131, + "learning_rate": 9.860590440487349e-06, + "loss": 0.4395, + "step": 2380 + }, + { + "epoch": 1.119962511715089, + "grad_norm": 3.079909086227417, + "learning_rate": 9.860004686035614e-06, + "loss": 0.3849, + "step": 2390 + }, + { + "epoch": 1.1246485473289598, + "grad_norm": 3.3298873901367188, + "learning_rate": 9.859418931583881e-06, + "loss": 0.3753, + "step": 2400 + }, + { + "epoch": 1.1293345829428303, + "grad_norm": 3.211998224258423, + "learning_rate": 9.858833177132146e-06, + "loss": 0.4062, + "step": 2410 + }, + { + "epoch": 1.134020618556701, + "grad_norm": 3.5661370754241943, + "learning_rate": 9.858247422680413e-06, + "loss": 0.3734, + "step": 2420 + }, + { + "epoch": 1.1387066541705717, + "grad_norm": 3.004592180252075, + "learning_rate": 9.85766166822868e-06, + "loss": 0.3847, + "step": 2430 + }, + { + "epoch": 1.1433926897844424, + "grad_norm": 3.070695400238037, + "learning_rate": 9.857075913776945e-06, + "loss": 0.3951, + "step": 2440 + }, + { + "epoch": 1.148078725398313, + "grad_norm": 2.348797559738159, + "learning_rate": 9.856490159325212e-06, + "loss": 0.4213, + "step": 2450 + }, + { + "epoch": 1.1527647610121836, + "grad_norm": 2.997864246368408, + "learning_rate": 9.855904404873478e-06, + "loss": 0.39, + "step": 2460 + }, + { + "epoch": 1.1574507966260543, + "grad_norm": 2.844785451889038, + "learning_rate": 9.855318650421743e-06, + "loss": 0.3889, + "step": 2470 + }, + { + "epoch": 1.162136832239925, + "grad_norm": 2.3173370361328125, + "learning_rate": 9.854732895970011e-06, + "loss": 0.387, + "step": 2480 + }, + { + "epoch": 1.1668228678537957, + "grad_norm": 2.7600715160369873, + "learning_rate": 9.854147141518277e-06, + "loss": 0.3932, + "step": 2490 + }, + { + "epoch": 1.1715089034676662, + "grad_norm": 2.941807270050049, + "learning_rate": 9.853561387066542e-06, + "loss": 0.4136, + "step": 2500 + }, + { + "epoch": 1.1715089034676662, + "eval_loss": 0.09154797345399857, + "eval_pearson_cosine": 0.7611853083767528, + "eval_pearson_dot": 0.5553938407700798, + "eval_pearson_euclidean": 0.7585919353320634, + "eval_pearson_manhattan": 0.7584339332224772, + "eval_runtime": 39.5654, + "eval_samples_per_second": 37.912, + "eval_spearman_cosine": 0.7727273086490055, + "eval_spearman_dot": 0.5594934939736275, + "eval_spearman_euclidean": 0.7696220909317546, + "eval_spearman_manhattan": 0.7696255697874876, + "eval_steps_per_second": 37.912, + "step": 2500 + }, + { + "epoch": 1.176194939081537, + "grad_norm": 2.902358293533325, + "learning_rate": 9.852975632614809e-06, + "loss": 0.38, + "step": 2510 + }, + { + "epoch": 1.1808809746954076, + "grad_norm": 2.7850334644317627, + "learning_rate": 9.852389878163074e-06, + "loss": 0.3899, + "step": 2520 + }, + { + "epoch": 1.1855670103092784, + "grad_norm": 3.1563310623168945, + "learning_rate": 9.851804123711341e-06, + "loss": 0.3597, + "step": 2530 + }, + { + "epoch": 1.190253045923149, + "grad_norm": 2.9025473594665527, + "learning_rate": 9.851218369259608e-06, + "loss": 0.376, + "step": 2540 + }, + { + "epoch": 1.1949390815370198, + "grad_norm": 3.4233460426330566, + "learning_rate": 9.850632614807873e-06, + "loss": 0.4129, + "step": 2550 + }, + { + "epoch": 1.1996251171508903, + "grad_norm": 2.8708131313323975, + "learning_rate": 9.85004686035614e-06, + "loss": 0.3979, + "step": 2560 + }, + { + "epoch": 1.204311152764761, + "grad_norm": 2.335909843444824, + "learning_rate": 9.849461105904405e-06, + "loss": 0.3962, + "step": 2570 + }, + { + "epoch": 1.2089971883786317, + "grad_norm": 3.7836697101593018, + "learning_rate": 9.848875351452672e-06, + "loss": 0.4116, + "step": 2580 + }, + { + "epoch": 1.2136832239925024, + "grad_norm": 3.2370989322662354, + "learning_rate": 9.848289597000939e-06, + "loss": 0.3827, + "step": 2590 + }, + { + "epoch": 1.218369259606373, + "grad_norm": 3.3212838172912598, + "learning_rate": 9.847703842549204e-06, + "loss": 0.4082, + "step": 2600 + }, + { + "epoch": 1.2230552952202436, + "grad_norm": 3.7373929023742676, + "learning_rate": 9.847118088097471e-06, + "loss": 0.4193, + "step": 2610 + }, + { + "epoch": 1.2277413308341143, + "grad_norm": 2.658543825149536, + "learning_rate": 9.846532333645736e-06, + "loss": 0.3539, + "step": 2620 + }, + { + "epoch": 1.232427366447985, + "grad_norm": 3.5882198810577393, + "learning_rate": 9.845946579194002e-06, + "loss": 0.3966, + "step": 2630 + }, + { + "epoch": 1.2371134020618557, + "grad_norm": 3.1050803661346436, + "learning_rate": 9.84536082474227e-06, + "loss": 0.4653, + "step": 2640 + }, + { + "epoch": 1.2417994376757264, + "grad_norm": 2.8855292797088623, + "learning_rate": 9.844775070290535e-06, + "loss": 0.3762, + "step": 2650 + }, + { + "epoch": 1.246485473289597, + "grad_norm": 3.595693588256836, + "learning_rate": 9.8441893158388e-06, + "loss": 0.4139, + "step": 2660 + }, + { + "epoch": 1.2511715089034676, + "grad_norm": 3.307594060897827, + "learning_rate": 9.843603561387068e-06, + "loss": 0.3821, + "step": 2670 + }, + { + "epoch": 1.2558575445173383, + "grad_norm": 2.86323618888855, + "learning_rate": 9.843017806935333e-06, + "loss": 0.4311, + "step": 2680 + }, + { + "epoch": 1.260543580131209, + "grad_norm": 3.1539525985717773, + "learning_rate": 9.8424320524836e-06, + "loss": 0.381, + "step": 2690 + }, + { + "epoch": 1.2652296157450795, + "grad_norm": 2.8784306049346924, + "learning_rate": 9.841846298031867e-06, + "loss": 0.4353, + "step": 2700 + }, + { + "epoch": 1.2699156513589505, + "grad_norm": 2.426830291748047, + "learning_rate": 9.841260543580132e-06, + "loss": 0.387, + "step": 2710 + }, + { + "epoch": 1.274601686972821, + "grad_norm": 2.350255012512207, + "learning_rate": 9.840674789128399e-06, + "loss": 0.3409, + "step": 2720 + }, + { + "epoch": 1.2792877225866917, + "grad_norm": 2.589404344558716, + "learning_rate": 9.840089034676664e-06, + "loss": 0.3665, + "step": 2730 + }, + { + "epoch": 1.2839737582005624, + "grad_norm": 2.6439192295074463, + "learning_rate": 9.839503280224931e-06, + "loss": 0.4118, + "step": 2740 + }, + { + "epoch": 1.2886597938144329, + "grad_norm": 3.120351791381836, + "learning_rate": 9.838917525773196e-06, + "loss": 0.4425, + "step": 2750 + }, + { + "epoch": 1.2886597938144329, + "eval_loss": 0.09277115762233734, + "eval_pearson_cosine": 0.7605294387552419, + "eval_pearson_dot": 0.5497643159360166, + "eval_pearson_euclidean": 0.7462760938099731, + "eval_pearson_manhattan": 0.7461382274954786, + "eval_runtime": 39.1884, + "eval_samples_per_second": 38.277, + "eval_spearman_cosine": 0.7726058957512391, + "eval_spearman_dot": 0.551187587856229, + "eval_spearman_euclidean": 0.7591610142998378, + "eval_spearman_manhattan": 0.7591112847270914, + "eval_steps_per_second": 38.277, + "step": 2750 + }, + { + "epoch": 1.2933458294283038, + "grad_norm": 2.9216575622558594, + "learning_rate": 9.838331771321463e-06, + "loss": 0.376, + "step": 2760 + }, + { + "epoch": 1.2980318650421743, + "grad_norm": 2.9521021842956543, + "learning_rate": 9.83774601686973e-06, + "loss": 0.3597, + "step": 2770 + }, + { + "epoch": 1.302717900656045, + "grad_norm": 3.4117023944854736, + "learning_rate": 9.837160262417995e-06, + "loss": 0.3833, + "step": 2780 + }, + { + "epoch": 1.3074039362699157, + "grad_norm": 3.0038156509399414, + "learning_rate": 9.83657450796626e-06, + "loss": 0.3996, + "step": 2790 + }, + { + "epoch": 1.3120899718837864, + "grad_norm": 2.500950813293457, + "learning_rate": 9.835988753514527e-06, + "loss": 0.3619, + "step": 2800 + }, + { + "epoch": 1.316776007497657, + "grad_norm": 2.954556465148926, + "learning_rate": 9.835402999062794e-06, + "loss": 0.4053, + "step": 2810 + }, + { + "epoch": 1.3214620431115276, + "grad_norm": 2.647721290588379, + "learning_rate": 9.83481724461106e-06, + "loss": 0.367, + "step": 2820 + }, + { + "epoch": 1.3261480787253983, + "grad_norm": 3.3207929134368896, + "learning_rate": 9.834231490159326e-06, + "loss": 0.3857, + "step": 2830 + }, + { + "epoch": 1.330834114339269, + "grad_norm": 2.4351987838745117, + "learning_rate": 9.833645735707592e-06, + "loss": 0.4087, + "step": 2840 + }, + { + "epoch": 1.3355201499531397, + "grad_norm": 2.845399856567383, + "learning_rate": 9.833059981255859e-06, + "loss": 0.3628, + "step": 2850 + }, + { + "epoch": 1.3402061855670104, + "grad_norm": 2.342569589614868, + "learning_rate": 9.832474226804124e-06, + "loss": 0.3656, + "step": 2860 + }, + { + "epoch": 1.344892221180881, + "grad_norm": 3.218336582183838, + "learning_rate": 9.83188847235239e-06, + "loss": 0.3445, + "step": 2870 + }, + { + "epoch": 1.3495782567947516, + "grad_norm": 3.147611141204834, + "learning_rate": 9.831302717900658e-06, + "loss": 0.3766, + "step": 2880 + }, + { + "epoch": 1.3542642924086223, + "grad_norm": 2.8554821014404297, + "learning_rate": 9.830716963448923e-06, + "loss": 0.3719, + "step": 2890 + }, + { + "epoch": 1.358950328022493, + "grad_norm": 3.0164778232574463, + "learning_rate": 9.83013120899719e-06, + "loss": 0.3625, + "step": 2900 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 2.705883741378784, + "learning_rate": 9.829545454545455e-06, + "loss": 0.4185, + "step": 2910 + }, + { + "epoch": 1.3683223992502342, + "grad_norm": 2.8652660846710205, + "learning_rate": 9.828959700093722e-06, + "loss": 0.3563, + "step": 2920 + }, + { + "epoch": 1.373008434864105, + "grad_norm": 2.8929593563079834, + "learning_rate": 9.828373945641989e-06, + "loss": 0.3769, + "step": 2930 + }, + { + "epoch": 1.3776944704779757, + "grad_norm": 2.8356854915618896, + "learning_rate": 9.827788191190254e-06, + "loss": 0.3796, + "step": 2940 + }, + { + "epoch": 1.3823805060918464, + "grad_norm": 2.890596866607666, + "learning_rate": 9.82720243673852e-06, + "loss": 0.4299, + "step": 2950 + }, + { + "epoch": 1.387066541705717, + "grad_norm": 3.0824170112609863, + "learning_rate": 9.826616682286786e-06, + "loss": 0.3807, + "step": 2960 + }, + { + "epoch": 1.3917525773195876, + "grad_norm": 2.549909830093384, + "learning_rate": 9.826030927835051e-06, + "loss": 0.392, + "step": 2970 + }, + { + "epoch": 1.3964386129334583, + "grad_norm": 2.3966541290283203, + "learning_rate": 9.825445173383318e-06, + "loss": 0.3743, + "step": 2980 + }, + { + "epoch": 1.401124648547329, + "grad_norm": 2.7320258617401123, + "learning_rate": 9.824859418931585e-06, + "loss": 0.3664, + "step": 2990 + }, + { + "epoch": 1.4058106841611997, + "grad_norm": 1.8689631223678589, + "learning_rate": 9.82427366447985e-06, + "loss": 0.3708, + "step": 3000 + }, + { + "epoch": 1.4058106841611997, + "eval_loss": 0.08185213804244995, + "eval_pearson_cosine": 0.7670467055300527, + "eval_pearson_dot": 0.5834479149623917, + "eval_pearson_euclidean": 0.7481055315337883, + "eval_pearson_manhattan": 0.7477726537908893, + "eval_runtime": 39.1851, + "eval_samples_per_second": 38.28, + "eval_spearman_cosine": 0.7783408589060774, + "eval_spearman_dot": 0.5847220399435686, + "eval_spearman_euclidean": 0.7636547096676689, + "eval_spearman_manhattan": 0.7633506525118475, + "eval_steps_per_second": 38.28, + "step": 3000 + }, + { + "epoch": 1.4104967197750704, + "grad_norm": 3.3057873249053955, + "learning_rate": 9.823687910028117e-06, + "loss": 0.3923, + "step": 3010 + }, + { + "epoch": 1.415182755388941, + "grad_norm": 2.955125331878662, + "learning_rate": 9.823102155576383e-06, + "loss": 0.3788, + "step": 3020 + }, + { + "epoch": 1.4198687910028116, + "grad_norm": 2.7104790210723877, + "learning_rate": 9.82251640112465e-06, + "loss": 0.4158, + "step": 3030 + }, + { + "epoch": 1.4245548266166823, + "grad_norm": 2.648211717605591, + "learning_rate": 9.821930646672916e-06, + "loss": 0.3619, + "step": 3040 + }, + { + "epoch": 1.429240862230553, + "grad_norm": 2.5092930793762207, + "learning_rate": 9.821344892221182e-06, + "loss": 0.3995, + "step": 3050 + }, + { + "epoch": 1.4339268978444237, + "grad_norm": 3.207554340362549, + "learning_rate": 9.820759137769448e-06, + "loss": 0.3602, + "step": 3060 + }, + { + "epoch": 1.4386129334582942, + "grad_norm": 3.2078282833099365, + "learning_rate": 9.820173383317714e-06, + "loss": 0.3644, + "step": 3070 + }, + { + "epoch": 1.443298969072165, + "grad_norm": 2.8371787071228027, + "learning_rate": 9.819587628865979e-06, + "loss": 0.368, + "step": 3080 + }, + { + "epoch": 1.4479850046860356, + "grad_norm": 2.8893673419952393, + "learning_rate": 9.819001874414248e-06, + "loss": 0.3611, + "step": 3090 + }, + { + "epoch": 1.4526710402999063, + "grad_norm": 3.0875115394592285, + "learning_rate": 9.818416119962513e-06, + "loss": 0.3809, + "step": 3100 + }, + { + "epoch": 1.457357075913777, + "grad_norm": 3.2046687602996826, + "learning_rate": 9.817830365510778e-06, + "loss": 0.4254, + "step": 3110 + }, + { + "epoch": 1.4620431115276475, + "grad_norm": 2.6490259170532227, + "learning_rate": 9.817244611059045e-06, + "loss": 0.3708, + "step": 3120 + }, + { + "epoch": 1.4667291471415183, + "grad_norm": 2.928335189819336, + "learning_rate": 9.81665885660731e-06, + "loss": 0.4054, + "step": 3130 + }, + { + "epoch": 1.471415182755389, + "grad_norm": 3.058704137802124, + "learning_rate": 9.816073102155577e-06, + "loss": 0.3795, + "step": 3140 + }, + { + "epoch": 1.4761012183692597, + "grad_norm": 3.042874336242676, + "learning_rate": 9.815487347703844e-06, + "loss": 0.349, + "step": 3150 + }, + { + "epoch": 1.4807872539831304, + "grad_norm": 3.230562210083008, + "learning_rate": 9.81490159325211e-06, + "loss": 0.3883, + "step": 3160 + }, + { + "epoch": 1.4854732895970009, + "grad_norm": 2.735044240951538, + "learning_rate": 9.814315838800376e-06, + "loss": 0.3573, + "step": 3170 + }, + { + "epoch": 1.4901593252108716, + "grad_norm": 3.3735740184783936, + "learning_rate": 9.813730084348641e-06, + "loss": 0.3969, + "step": 3180 + }, + { + "epoch": 1.4948453608247423, + "grad_norm": 2.9641244411468506, + "learning_rate": 9.813144329896908e-06, + "loss": 0.4103, + "step": 3190 + }, + { + "epoch": 1.499531396438613, + "grad_norm": 2.666172742843628, + "learning_rate": 9.812558575445175e-06, + "loss": 0.3739, + "step": 3200 + }, + { + "epoch": 1.5042174320524837, + "grad_norm": 3.061447858810425, + "learning_rate": 9.81197282099344e-06, + "loss": 0.3759, + "step": 3210 + }, + { + "epoch": 1.5089034676663542, + "grad_norm": 2.6585240364074707, + "learning_rate": 9.811387066541707e-06, + "loss": 0.3521, + "step": 3220 + }, + { + "epoch": 1.513589503280225, + "grad_norm": 3.168820858001709, + "learning_rate": 9.810801312089973e-06, + "loss": 0.3681, + "step": 3230 + }, + { + "epoch": 1.5182755388940956, + "grad_norm": 3.069749355316162, + "learning_rate": 9.810215557638238e-06, + "loss": 0.3696, + "step": 3240 + }, + { + "epoch": 1.522961574507966, + "grad_norm": 3.588362693786621, + "learning_rate": 9.809629803186505e-06, + "loss": 0.3934, + "step": 3250 + }, + { + "epoch": 1.522961574507966, + "eval_loss": 0.08483820408582687, + "eval_pearson_cosine": 0.7708958105015995, + "eval_pearson_dot": 0.5654852409557236, + "eval_pearson_euclidean": 0.7542229262005833, + "eval_pearson_manhattan": 0.7539007508537807, + "eval_runtime": 40.1586, + "eval_samples_per_second": 37.352, + "eval_spearman_cosine": 0.7814200034454707, + "eval_spearman_dot": 0.5667704812278335, + "eval_spearman_euclidean": 0.7689048877801009, + "eval_spearman_manhattan": 0.7692340606121537, + "eval_steps_per_second": 37.352, + "step": 3250 + }, + { + "epoch": 1.527647610121837, + "grad_norm": 2.5335776805877686, + "learning_rate": 9.809044048734772e-06, + "loss": 0.404, + "step": 3260 + }, + { + "epoch": 1.5323336457357075, + "grad_norm": 3.6495397090911865, + "learning_rate": 9.808458294283037e-06, + "loss": 0.4017, + "step": 3270 + }, + { + "epoch": 1.5370196813495782, + "grad_norm": 3.3083150386810303, + "learning_rate": 9.807872539831304e-06, + "loss": 0.3839, + "step": 3280 + }, + { + "epoch": 1.541705716963449, + "grad_norm": 2.9375736713409424, + "learning_rate": 9.807286785379569e-06, + "loss": 0.4067, + "step": 3290 + }, + { + "epoch": 1.5463917525773194, + "grad_norm": 2.8293914794921875, + "learning_rate": 9.806701030927836e-06, + "loss": 0.4506, + "step": 3300 + }, + { + "epoch": 1.5510777881911904, + "grad_norm": 2.9611403942108154, + "learning_rate": 9.806115276476103e-06, + "loss": 0.3703, + "step": 3310 + }, + { + "epoch": 1.5557638238050608, + "grad_norm": 2.821136713027954, + "learning_rate": 9.805529522024368e-06, + "loss": 0.3756, + "step": 3320 + }, + { + "epoch": 1.5604498594189316, + "grad_norm": 2.8910715579986572, + "learning_rate": 9.804943767572635e-06, + "loss": 0.3911, + "step": 3330 + }, + { + "epoch": 1.5651358950328023, + "grad_norm": 3.2398719787597656, + "learning_rate": 9.8043580131209e-06, + "loss": 0.4014, + "step": 3340 + }, + { + "epoch": 1.569821930646673, + "grad_norm": 2.7479453086853027, + "learning_rate": 9.803772258669167e-06, + "loss": 0.3854, + "step": 3350 + }, + { + "epoch": 1.5745079662605437, + "grad_norm": 3.2347192764282227, + "learning_rate": 9.803186504217432e-06, + "loss": 0.3913, + "step": 3360 + }, + { + "epoch": 1.5791940018744142, + "grad_norm": 2.865525245666504, + "learning_rate": 9.8026007497657e-06, + "loss": 0.3685, + "step": 3370 + }, + { + "epoch": 1.5838800374882849, + "grad_norm": 3.2018258571624756, + "learning_rate": 9.802014995313966e-06, + "loss": 0.3376, + "step": 3380 + }, + { + "epoch": 1.5885660731021556, + "grad_norm": 2.9291999340057373, + "learning_rate": 9.801429240862231e-06, + "loss": 0.3673, + "step": 3390 + }, + { + "epoch": 1.5932521087160263, + "grad_norm": 2.761934757232666, + "learning_rate": 9.800843486410497e-06, + "loss": 0.3885, + "step": 3400 + }, + { + "epoch": 1.597938144329897, + "grad_norm": 2.509103775024414, + "learning_rate": 9.800257731958763e-06, + "loss": 0.3523, + "step": 3410 + }, + { + "epoch": 1.6026241799437675, + "grad_norm": 2.377322196960449, + "learning_rate": 9.79967197750703e-06, + "loss": 0.3791, + "step": 3420 + }, + { + "epoch": 1.6073102155576382, + "grad_norm": 3.3458712100982666, + "learning_rate": 9.799086223055296e-06, + "loss": 0.3897, + "step": 3430 + }, + { + "epoch": 1.611996251171509, + "grad_norm": 2.7032053470611572, + "learning_rate": 9.798500468603563e-06, + "loss": 0.339, + "step": 3440 + }, + { + "epoch": 1.6166822867853796, + "grad_norm": 3.09551739692688, + "learning_rate": 9.797914714151828e-06, + "loss": 0.384, + "step": 3450 + }, + { + "epoch": 1.6213683223992503, + "grad_norm": 3.2145328521728516, + "learning_rate": 9.797328959700095e-06, + "loss": 0.3961, + "step": 3460 + }, + { + "epoch": 1.6260543580131208, + "grad_norm": 3.6237032413482666, + "learning_rate": 9.79674320524836e-06, + "loss": 0.348, + "step": 3470 + }, + { + "epoch": 1.6307403936269915, + "grad_norm": 2.706770420074463, + "learning_rate": 9.796157450796627e-06, + "loss": 0.3281, + "step": 3480 + }, + { + "epoch": 1.6354264292408622, + "grad_norm": 3.071535348892212, + "learning_rate": 9.795571696344894e-06, + "loss": 0.3645, + "step": 3490 + }, + { + "epoch": 1.640112464854733, + "grad_norm": 2.757957696914673, + "learning_rate": 9.794985941893159e-06, + "loss": 0.3203, + "step": 3500 + }, + { + "epoch": 1.640112464854733, + "eval_loss": 0.07808861136436462, + "eval_pearson_cosine": 0.7705724911864849, + "eval_pearson_dot": 0.5870542264131444, + "eval_pearson_euclidean": 0.7531019492466129, + "eval_pearson_manhattan": 0.7529286527249113, + "eval_runtime": 39.5374, + "eval_samples_per_second": 37.939, + "eval_spearman_cosine": 0.781003234465689, + "eval_spearman_dot": 0.5891471444366018, + "eval_spearman_euclidean": 0.7690560764997629, + "eval_spearman_manhattan": 0.7689244287192701, + "eval_steps_per_second": 37.939, + "step": 3500 + }, + { + "epoch": 1.6447985004686037, + "grad_norm": 2.8622870445251465, + "learning_rate": 9.794400187441426e-06, + "loss": 0.361, + "step": 3510 + }, + { + "epoch": 1.6494845360824741, + "grad_norm": 3.1786670684814453, + "learning_rate": 9.793814432989691e-06, + "loss": 0.376, + "step": 3520 + }, + { + "epoch": 1.6541705716963448, + "grad_norm": 2.9098026752471924, + "learning_rate": 9.793228678537958e-06, + "loss": 0.383, + "step": 3530 + }, + { + "epoch": 1.6588566073102156, + "grad_norm": 3.0556936264038086, + "learning_rate": 9.792642924086225e-06, + "loss": 0.3966, + "step": 3540 + }, + { + "epoch": 1.6635426429240863, + "grad_norm": 2.920565128326416, + "learning_rate": 9.79205716963449e-06, + "loss": 0.3531, + "step": 3550 + }, + { + "epoch": 1.668228678537957, + "grad_norm": 2.669887065887451, + "learning_rate": 9.791471415182755e-06, + "loss": 0.3784, + "step": 3560 + }, + { + "epoch": 1.6729147141518275, + "grad_norm": 2.7581350803375244, + "learning_rate": 9.790885660731022e-06, + "loss": 0.3562, + "step": 3570 + }, + { + "epoch": 1.6776007497656982, + "grad_norm": 3.3164896965026855, + "learning_rate": 9.790299906279287e-06, + "loss": 0.363, + "step": 3580 + }, + { + "epoch": 1.6822867853795689, + "grad_norm": 3.1747140884399414, + "learning_rate": 9.789714151827554e-06, + "loss": 0.3816, + "step": 3590 + }, + { + "epoch": 1.6869728209934396, + "grad_norm": 2.896433115005493, + "learning_rate": 9.789128397375821e-06, + "loss": 0.3727, + "step": 3600 + }, + { + "epoch": 1.6916588566073103, + "grad_norm": 2.8748035430908203, + "learning_rate": 9.788542642924087e-06, + "loss": 0.3501, + "step": 3610 + }, + { + "epoch": 1.6963448922211808, + "grad_norm": 3.0647246837615967, + "learning_rate": 9.787956888472353e-06, + "loss": 0.3742, + "step": 3620 + }, + { + "epoch": 1.7010309278350515, + "grad_norm": 2.368314504623413, + "learning_rate": 9.787371134020619e-06, + "loss": 0.3509, + "step": 3630 + }, + { + "epoch": 1.7057169634489222, + "grad_norm": 2.563969135284424, + "learning_rate": 9.786785379568886e-06, + "loss": 0.3205, + "step": 3640 + }, + { + "epoch": 1.710402999062793, + "grad_norm": 3.2098007202148438, + "learning_rate": 9.786199625117153e-06, + "loss": 0.3607, + "step": 3650 + }, + { + "epoch": 1.7150890346766636, + "grad_norm": 3.1060285568237305, + "learning_rate": 9.785613870665418e-06, + "loss": 0.3159, + "step": 3660 + }, + { + "epoch": 1.7197750702905341, + "grad_norm": 2.471100330352783, + "learning_rate": 9.785028116213685e-06, + "loss": 0.375, + "step": 3670 + }, + { + "epoch": 1.7244611059044048, + "grad_norm": 2.991903305053711, + "learning_rate": 9.78444236176195e-06, + "loss": 0.3944, + "step": 3680 + }, + { + "epoch": 1.7291471415182755, + "grad_norm": 2.6646485328674316, + "learning_rate": 9.783856607310215e-06, + "loss": 0.4026, + "step": 3690 + }, + { + "epoch": 1.7338331771321462, + "grad_norm": 2.392664670944214, + "learning_rate": 9.783270852858484e-06, + "loss": 0.3606, + "step": 3700 + }, + { + "epoch": 1.738519212746017, + "grad_norm": 3.0139496326446533, + "learning_rate": 9.782685098406749e-06, + "loss": 0.3475, + "step": 3710 + }, + { + "epoch": 1.7432052483598874, + "grad_norm": 2.6688876152038574, + "learning_rate": 9.782099343955014e-06, + "loss": 0.413, + "step": 3720 + }, + { + "epoch": 1.7478912839737581, + "grad_norm": 3.372584581375122, + "learning_rate": 9.781513589503281e-06, + "loss": 0.3544, + "step": 3730 + }, + { + "epoch": 1.7525773195876289, + "grad_norm": 2.9981796741485596, + "learning_rate": 9.780927835051546e-06, + "loss": 0.3747, + "step": 3740 + }, + { + "epoch": 1.7572633552014996, + "grad_norm": 3.5128731727600098, + "learning_rate": 9.780342080599813e-06, + "loss": 0.4052, + "step": 3750 + }, + { + "epoch": 1.7572633552014996, + "eval_loss": 0.08239996433258057, + "eval_pearson_cosine": 0.7705098430219977, + "eval_pearson_dot": 0.5909106805114561, + "eval_pearson_euclidean": 0.7628273731383075, + "eval_pearson_manhattan": 0.7627845021563395, + "eval_runtime": 39.7472, + "eval_samples_per_second": 37.738, + "eval_spearman_cosine": 0.7816481068106005, + "eval_spearman_dot": 0.5989182063745997, + "eval_spearman_euclidean": 0.7771176907760753, + "eval_spearman_manhattan": 0.7770550572795577, + "eval_steps_per_second": 37.738, + "step": 3750 + }, + { + "epoch": 1.7619493908153703, + "grad_norm": 3.215078353881836, + "learning_rate": 9.77975632614808e-06, + "loss": 0.3424, + "step": 3760 + }, + { + "epoch": 1.7666354264292408, + "grad_norm": 3.310758113861084, + "learning_rate": 9.779170571696345e-06, + "loss": 0.3765, + "step": 3770 + }, + { + "epoch": 1.7713214620431117, + "grad_norm": 2.7431821823120117, + "learning_rate": 9.778584817244612e-06, + "loss": 0.3545, + "step": 3780 + }, + { + "epoch": 1.7760074976569822, + "grad_norm": 3.1174018383026123, + "learning_rate": 9.777999062792877e-06, + "loss": 0.3495, + "step": 3790 + }, + { + "epoch": 1.780693533270853, + "grad_norm": 2.911485195159912, + "learning_rate": 9.777413308341144e-06, + "loss": 0.3684, + "step": 3800 + }, + { + "epoch": 1.7853795688847236, + "grad_norm": 3.4018588066101074, + "learning_rate": 9.776827553889411e-06, + "loss": 0.3702, + "step": 3810 + }, + { + "epoch": 1.790065604498594, + "grad_norm": 2.8226757049560547, + "learning_rate": 9.776241799437677e-06, + "loss": 0.3506, + "step": 3820 + }, + { + "epoch": 1.794751640112465, + "grad_norm": 3.5894434452056885, + "learning_rate": 9.775656044985943e-06, + "loss": 0.3928, + "step": 3830 + }, + { + "epoch": 1.7994376757263355, + "grad_norm": 2.49806547164917, + "learning_rate": 9.775070290534209e-06, + "loss": 0.3162, + "step": 3840 + }, + { + "epoch": 1.8041237113402062, + "grad_norm": 3.0954976081848145, + "learning_rate": 9.774484536082474e-06, + "loss": 0.3657, + "step": 3850 + }, + { + "epoch": 1.808809746954077, + "grad_norm": 2.629826307296753, + "learning_rate": 9.77389878163074e-06, + "loss": 0.3017, + "step": 3860 + }, + { + "epoch": 1.8134957825679474, + "grad_norm": 3.1166768074035645, + "learning_rate": 9.773313027179008e-06, + "loss": 0.3778, + "step": 3870 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 3.0856707096099854, + "learning_rate": 9.772727272727273e-06, + "loss": 0.3949, + "step": 3880 + }, + { + "epoch": 1.8228678537956888, + "grad_norm": 2.7912850379943848, + "learning_rate": 9.77214151827554e-06, + "loss": 0.377, + "step": 3890 + }, + { + "epoch": 1.8275538894095595, + "grad_norm": 2.9028005599975586, + "learning_rate": 9.771555763823805e-06, + "loss": 0.361, + "step": 3900 + }, + { + "epoch": 1.8322399250234302, + "grad_norm": 3.034365177154541, + "learning_rate": 9.770970009372072e-06, + "loss": 0.3776, + "step": 3910 + }, + { + "epoch": 1.8369259606373007, + "grad_norm": 2.7740864753723145, + "learning_rate": 9.770384254920339e-06, + "loss": 0.3322, + "step": 3920 + }, + { + "epoch": 1.8416119962511717, + "grad_norm": 3.775836944580078, + "learning_rate": 9.769798500468604e-06, + "loss": 0.3668, + "step": 3930 + }, + { + "epoch": 1.8462980318650422, + "grad_norm": 3.1334052085876465, + "learning_rate": 9.769212746016871e-06, + "loss": 0.3633, + "step": 3940 + }, + { + "epoch": 1.8509840674789129, + "grad_norm": 3.311575174331665, + "learning_rate": 9.768626991565136e-06, + "loss": 0.3507, + "step": 3950 + }, + { + "epoch": 1.8556701030927836, + "grad_norm": 3.3283636569976807, + "learning_rate": 9.768041237113403e-06, + "loss": 0.3393, + "step": 3960 + }, + { + "epoch": 1.860356138706654, + "grad_norm": 2.8960700035095215, + "learning_rate": 9.767455482661668e-06, + "loss": 0.4164, + "step": 3970 + }, + { + "epoch": 1.865042174320525, + "grad_norm": 2.63865327835083, + "learning_rate": 9.766869728209935e-06, + "loss": 0.3561, + "step": 3980 + }, + { + "epoch": 1.8697282099343955, + "grad_norm": 2.750246286392212, + "learning_rate": 9.766283973758202e-06, + "loss": 0.3668, + "step": 3990 + }, + { + "epoch": 1.8744142455482662, + "grad_norm": 2.736009359359741, + "learning_rate": 9.765698219306467e-06, + "loss": 0.3723, + "step": 4000 + }, + { + "epoch": 1.8744142455482662, + "eval_loss": 0.0818546786904335, + "eval_pearson_cosine": 0.7719873880440176, + "eval_pearson_dot": 0.5710773246097212, + "eval_pearson_euclidean": 0.7519982934890663, + "eval_pearson_manhattan": 0.7515239555531821, + "eval_runtime": 39.1942, + "eval_samples_per_second": 38.271, + "eval_spearman_cosine": 0.7840046796615474, + "eval_spearman_dot": 0.5713176227902256, + "eval_spearman_euclidean": 0.7685352754174982, + "eval_spearman_manhattan": 0.76785681763944, + "eval_steps_per_second": 38.271, + "step": 4000 + }, + { + "epoch": 1.879100281162137, + "grad_norm": 3.1088342666625977, + "learning_rate": 9.765112464854733e-06, + "loss": 0.3162, + "step": 4010 + }, + { + "epoch": 1.8837863167760074, + "grad_norm": 2.2759673595428467, + "learning_rate": 9.764526710403e-06, + "loss": 0.3647, + "step": 4020 + }, + { + "epoch": 1.8884723523898783, + "grad_norm": 2.4756667613983154, + "learning_rate": 9.763940955951267e-06, + "loss": 0.3741, + "step": 4030 + }, + { + "epoch": 1.8931583880037488, + "grad_norm": 2.7309701442718506, + "learning_rate": 9.763355201499532e-06, + "loss": 0.3732, + "step": 4040 + }, + { + "epoch": 1.8978444236176195, + "grad_norm": 2.3570773601531982, + "learning_rate": 9.762769447047799e-06, + "loss": 0.3593, + "step": 4050 + }, + { + "epoch": 1.9025304592314902, + "grad_norm": 3.3677115440368652, + "learning_rate": 9.762183692596064e-06, + "loss": 0.3859, + "step": 4060 + }, + { + "epoch": 1.9072164948453607, + "grad_norm": 3.787653684616089, + "learning_rate": 9.76159793814433e-06, + "loss": 0.3775, + "step": 4070 + }, + { + "epoch": 1.9119025304592316, + "grad_norm": 2.686065196990967, + "learning_rate": 9.761012183692596e-06, + "loss": 0.3531, + "step": 4080 + }, + { + "epoch": 1.9165885660731021, + "grad_norm": 2.230189323425293, + "learning_rate": 9.760426429240863e-06, + "loss": 0.3291, + "step": 4090 + }, + { + "epoch": 1.9212746016869728, + "grad_norm": 2.815934896469116, + "learning_rate": 9.75984067478913e-06, + "loss": 0.3644, + "step": 4100 + }, + { + "epoch": 1.9259606373008435, + "grad_norm": 3.123044967651367, + "learning_rate": 9.759254920337395e-06, + "loss": 0.377, + "step": 4110 + }, + { + "epoch": 1.930646672914714, + "grad_norm": 2.2790846824645996, + "learning_rate": 9.758669165885662e-06, + "loss": 0.3623, + "step": 4120 + }, + { + "epoch": 1.935332708528585, + "grad_norm": 2.658747434616089, + "learning_rate": 9.758083411433927e-06, + "loss": 0.3289, + "step": 4130 + }, + { + "epoch": 1.9400187441424555, + "grad_norm": 2.783327102661133, + "learning_rate": 9.757497656982194e-06, + "loss": 0.3903, + "step": 4140 + }, + { + "epoch": 1.9447047797563262, + "grad_norm": 2.442927598953247, + "learning_rate": 9.756911902530461e-06, + "loss": 0.3539, + "step": 4150 + }, + { + "epoch": 1.9493908153701969, + "grad_norm": 3.6274266242980957, + "learning_rate": 9.756326148078726e-06, + "loss": 0.3407, + "step": 4160 + }, + { + "epoch": 1.9540768509840674, + "grad_norm": 3.194626808166504, + "learning_rate": 9.755740393626991e-06, + "loss": 0.3585, + "step": 4170 + }, + { + "epoch": 1.9587628865979383, + "grad_norm": 2.9573676586151123, + "learning_rate": 9.755154639175258e-06, + "loss": 0.343, + "step": 4180 + }, + { + "epoch": 1.9634489222118088, + "grad_norm": 2.608351707458496, + "learning_rate": 9.754568884723524e-06, + "loss": 0.3632, + "step": 4190 + }, + { + "epoch": 1.9681349578256795, + "grad_norm": 2.9371848106384277, + "learning_rate": 9.75398313027179e-06, + "loss": 0.3341, + "step": 4200 + }, + { + "epoch": 1.9728209934395502, + "grad_norm": 3.698261022567749, + "learning_rate": 9.753397375820057e-06, + "loss": 0.3903, + "step": 4210 + }, + { + "epoch": 1.9775070290534207, + "grad_norm": 2.6606619358062744, + "learning_rate": 9.752811621368323e-06, + "loss": 0.3365, + "step": 4220 + }, + { + "epoch": 1.9821930646672916, + "grad_norm": 3.170403480529785, + "learning_rate": 9.75222586691659e-06, + "loss": 0.3577, + "step": 4230 + }, + { + "epoch": 1.986879100281162, + "grad_norm": 2.7663040161132812, + "learning_rate": 9.751640112464855e-06, + "loss": 0.3687, + "step": 4240 + }, + { + "epoch": 1.9915651358950328, + "grad_norm": 2.217230796813965, + "learning_rate": 9.751054358013122e-06, + "loss": 0.3645, + "step": 4250 + }, + { + "epoch": 1.9915651358950328, + "eval_loss": 0.08020295202732086, + "eval_pearson_cosine": 0.7676372739184956, + "eval_pearson_dot": 0.5685492783866799, + "eval_pearson_euclidean": 0.7560020323373777, + "eval_pearson_manhattan": 0.7559508884315278, + "eval_runtime": 39.1696, + "eval_samples_per_second": 38.295, + "eval_spearman_cosine": 0.7804455454568248, + "eval_spearman_dot": 0.570131474662444, + "eval_spearman_euclidean": 0.770297760657173, + "eval_spearman_manhattan": 0.7703714920263548, + "eval_steps_per_second": 38.295, + "step": 4250 + }, + { + "epoch": 1.9962511715089035, + "grad_norm": 2.7864959239959717, + "learning_rate": 9.750468603561389e-06, + "loss": 0.3761, + "step": 4260 + }, + { + "epoch": 2.000937207122774, + "grad_norm": 2.399378776550293, + "learning_rate": 9.749882849109654e-06, + "loss": 0.3253, + "step": 4270 + }, + { + "epoch": 2.005623242736645, + "grad_norm": 3.282205820083618, + "learning_rate": 9.74929709465792e-06, + "loss": 0.2838, + "step": 4280 + }, + { + "epoch": 2.0103092783505154, + "grad_norm": 2.5611684322357178, + "learning_rate": 9.748711340206186e-06, + "loss": 0.2921, + "step": 4290 + }, + { + "epoch": 2.014995313964386, + "grad_norm": 2.7141940593719482, + "learning_rate": 9.748125585754453e-06, + "loss": 0.274, + "step": 4300 + }, + { + "epoch": 2.019681349578257, + "grad_norm": 2.566196918487549, + "learning_rate": 9.74753983130272e-06, + "loss": 0.2701, + "step": 4310 + }, + { + "epoch": 2.0243673851921273, + "grad_norm": 2.6438400745391846, + "learning_rate": 9.746954076850985e-06, + "loss": 0.2898, + "step": 4320 + }, + { + "epoch": 2.0290534208059983, + "grad_norm": 2.603959560394287, + "learning_rate": 9.74636832239925e-06, + "loss": 0.2588, + "step": 4330 + }, + { + "epoch": 2.0337394564198688, + "grad_norm": 3.4404914379119873, + "learning_rate": 9.745782567947517e-06, + "loss": 0.3006, + "step": 4340 + }, + { + "epoch": 2.0384254920337392, + "grad_norm": 2.573943853378296, + "learning_rate": 9.745196813495782e-06, + "loss": 0.2826, + "step": 4350 + }, + { + "epoch": 2.04311152764761, + "grad_norm": 2.327101945877075, + "learning_rate": 9.74461105904405e-06, + "loss": 0.2842, + "step": 4360 + }, + { + "epoch": 2.0477975632614807, + "grad_norm": 3.299482583999634, + "learning_rate": 9.744025304592316e-06, + "loss": 0.2649, + "step": 4370 + }, + { + "epoch": 2.0524835988753516, + "grad_norm": 2.5444982051849365, + "learning_rate": 9.743439550140581e-06, + "loss": 0.293, + "step": 4380 + }, + { + "epoch": 2.057169634489222, + "grad_norm": 2.7384984493255615, + "learning_rate": 9.742853795688848e-06, + "loss": 0.3027, + "step": 4390 + }, + { + "epoch": 2.0618556701030926, + "grad_norm": 2.323399782180786, + "learning_rate": 9.742268041237114e-06, + "loss": 0.3082, + "step": 4400 + }, + { + "epoch": 2.0665417057169635, + "grad_norm": 2.821531057357788, + "learning_rate": 9.74168228678538e-06, + "loss": 0.3044, + "step": 4410 + }, + { + "epoch": 2.071227741330834, + "grad_norm": 3.8503706455230713, + "learning_rate": 9.741096532333647e-06, + "loss": 0.2886, + "step": 4420 + }, + { + "epoch": 2.075913776944705, + "grad_norm": 3.2392382621765137, + "learning_rate": 9.740510777881913e-06, + "loss": 0.2589, + "step": 4430 + }, + { + "epoch": 2.0805998125585754, + "grad_norm": 2.823723316192627, + "learning_rate": 9.73992502343018e-06, + "loss": 0.2924, + "step": 4440 + }, + { + "epoch": 2.085285848172446, + "grad_norm": 2.604548931121826, + "learning_rate": 9.739339268978445e-06, + "loss": 0.3004, + "step": 4450 + }, + { + "epoch": 2.089971883786317, + "grad_norm": 2.7885937690734863, + "learning_rate": 9.738753514526712e-06, + "loss": 0.2967, + "step": 4460 + }, + { + "epoch": 2.0946579194001873, + "grad_norm": 2.909656524658203, + "learning_rate": 9.738167760074977e-06, + "loss": 0.2602, + "step": 4470 + }, + { + "epoch": 2.0993439550140582, + "grad_norm": 3.450695514678955, + "learning_rate": 9.737582005623244e-06, + "loss": 0.2841, + "step": 4480 + }, + { + "epoch": 2.1040299906279287, + "grad_norm": 2.1142079830169678, + "learning_rate": 9.736996251171509e-06, + "loss": 0.2492, + "step": 4490 + }, + { + "epoch": 2.108716026241799, + "grad_norm": 3.1369121074676514, + "learning_rate": 9.736410496719776e-06, + "loss": 0.3007, + "step": 4500 + }, + { + "epoch": 2.108716026241799, + "eval_loss": 0.06621846556663513, + "eval_pearson_cosine": 0.768161993291983, + "eval_pearson_dot": 0.5972572426991363, + "eval_pearson_euclidean": 0.7574105370032385, + "eval_pearson_manhattan": 0.7571659262987636, + "eval_runtime": 40.2217, + "eval_samples_per_second": 37.293, + "eval_spearman_cosine": 0.7799039169576075, + "eval_spearman_dot": 0.5981151115114701, + "eval_spearman_euclidean": 0.7720521764960889, + "eval_spearman_manhattan": 0.772146767310716, + "eval_steps_per_second": 37.293, + "step": 4500 + }, + { + "epoch": 2.11340206185567, + "grad_norm": 2.972123146057129, + "learning_rate": 9.735824742268041e-06, + "loss": 0.2515, + "step": 4510 + }, + { + "epoch": 2.1180880974695406, + "grad_norm": 2.9807615280151367, + "learning_rate": 9.735238987816308e-06, + "loss": 0.2773, + "step": 4520 + }, + { + "epoch": 2.1227741330834116, + "grad_norm": 2.88916015625, + "learning_rate": 9.734653233364575e-06, + "loss": 0.3021, + "step": 4530 + }, + { + "epoch": 2.127460168697282, + "grad_norm": 2.4502129554748535, + "learning_rate": 9.73406747891284e-06, + "loss": 0.2657, + "step": 4540 + }, + { + "epoch": 2.1321462043111525, + "grad_norm": 3.6442370414733887, + "learning_rate": 9.733481724461107e-06, + "loss": 0.2737, + "step": 4550 + }, + { + "epoch": 2.1368322399250235, + "grad_norm": 3.181819438934326, + "learning_rate": 9.732895970009372e-06, + "loss": 0.3209, + "step": 4560 + }, + { + "epoch": 2.141518275538894, + "grad_norm": 2.9747514724731445, + "learning_rate": 9.73231021555764e-06, + "loss": 0.2731, + "step": 4570 + }, + { + "epoch": 2.146204311152765, + "grad_norm": 3.7340550422668457, + "learning_rate": 9.731724461105905e-06, + "loss": 0.3319, + "step": 4580 + }, + { + "epoch": 2.1508903467666354, + "grad_norm": 2.3309686183929443, + "learning_rate": 9.731138706654171e-06, + "loss": 0.2854, + "step": 4590 + }, + { + "epoch": 2.155576382380506, + "grad_norm": 2.801131010055542, + "learning_rate": 9.730552952202438e-06, + "loss": 0.2759, + "step": 4600 + }, + { + "epoch": 2.160262417994377, + "grad_norm": 2.5506978034973145, + "learning_rate": 9.729967197750704e-06, + "loss": 0.2425, + "step": 4610 + }, + { + "epoch": 2.1649484536082473, + "grad_norm": 2.3702712059020996, + "learning_rate": 9.72938144329897e-06, + "loss": 0.2645, + "step": 4620 + }, + { + "epoch": 2.169634489222118, + "grad_norm": 2.560842752456665, + "learning_rate": 9.728795688847236e-06, + "loss": 0.28, + "step": 4630 + }, + { + "epoch": 2.1743205248359887, + "grad_norm": 2.8994758129119873, + "learning_rate": 9.728209934395501e-06, + "loss": 0.2764, + "step": 4640 + }, + { + "epoch": 2.179006560449859, + "grad_norm": 3.6292624473571777, + "learning_rate": 9.727624179943768e-06, + "loss": 0.2827, + "step": 4650 + }, + { + "epoch": 2.18369259606373, + "grad_norm": 2.619485378265381, + "learning_rate": 9.727038425492035e-06, + "loss": 0.2869, + "step": 4660 + }, + { + "epoch": 2.1883786316776006, + "grad_norm": 2.676844358444214, + "learning_rate": 9.7264526710403e-06, + "loss": 0.2862, + "step": 4670 + }, + { + "epoch": 2.1930646672914715, + "grad_norm": 2.7725651264190674, + "learning_rate": 9.725866916588567e-06, + "loss": 0.2965, + "step": 4680 + }, + { + "epoch": 2.197750702905342, + "grad_norm": 2.34023380279541, + "learning_rate": 9.725281162136832e-06, + "loss": 0.2483, + "step": 4690 + }, + { + "epoch": 2.2024367385192125, + "grad_norm": 2.652027130126953, + "learning_rate": 9.724695407685099e-06, + "loss": 0.2901, + "step": 4700 + }, + { + "epoch": 2.2071227741330834, + "grad_norm": 2.3790388107299805, + "learning_rate": 9.724109653233366e-06, + "loss": 0.2797, + "step": 4710 + }, + { + "epoch": 2.211808809746954, + "grad_norm": 2.6680283546447754, + "learning_rate": 9.723523898781631e-06, + "loss": 0.2446, + "step": 4720 + }, + { + "epoch": 2.216494845360825, + "grad_norm": 3.196193218231201, + "learning_rate": 9.722938144329898e-06, + "loss": 0.3073, + "step": 4730 + }, + { + "epoch": 2.2211808809746953, + "grad_norm": 2.9742684364318848, + "learning_rate": 9.722352389878163e-06, + "loss": 0.2641, + "step": 4740 + }, + { + "epoch": 2.2258669165885663, + "grad_norm": 1.8688490390777588, + "learning_rate": 9.72176663542643e-06, + "loss": 0.2397, + "step": 4750 + }, + { + "epoch": 2.2258669165885663, + "eval_loss": 0.06173858791589737, + "eval_pearson_cosine": 0.7692502208827889, + "eval_pearson_dot": 0.5854569446239672, + "eval_pearson_euclidean": 0.7502268637430376, + "eval_pearson_manhattan": 0.7501450224167883, + "eval_runtime": 40.4876, + "eval_samples_per_second": 37.048, + "eval_spearman_cosine": 0.7781711434828433, + "eval_spearman_dot": 0.5898324527411585, + "eval_spearman_euclidean": 0.7652357962017762, + "eval_spearman_manhattan": 0.765505305290314, + "eval_steps_per_second": 37.048, + "step": 4750 + }, + { + "epoch": 2.2305529522024368, + "grad_norm": 3.1260733604431152, + "learning_rate": 9.721180880974697e-06, + "loss": 0.2686, + "step": 4760 + }, + { + "epoch": 2.2352389878163073, + "grad_norm": 2.8164772987365723, + "learning_rate": 9.720595126522962e-06, + "loss": 0.2664, + "step": 4770 + }, + { + "epoch": 2.239925023430178, + "grad_norm": 2.9061129093170166, + "learning_rate": 9.72000937207123e-06, + "loss": 0.2894, + "step": 4780 + }, + { + "epoch": 2.2446110590440487, + "grad_norm": 2.427224636077881, + "learning_rate": 9.719423617619495e-06, + "loss": 0.2952, + "step": 4790 + }, + { + "epoch": 2.2492970946579196, + "grad_norm": 3.4472455978393555, + "learning_rate": 9.71883786316776e-06, + "loss": 0.3036, + "step": 4800 + }, + { + "epoch": 2.25398313027179, + "grad_norm": 3.032599925994873, + "learning_rate": 9.718252108716027e-06, + "loss": 0.2702, + "step": 4810 + }, + { + "epoch": 2.2586691658856606, + "grad_norm": 2.9868416786193848, + "learning_rate": 9.717666354264294e-06, + "loss": 0.2547, + "step": 4820 + }, + { + "epoch": 2.2633552014995315, + "grad_norm": 3.0330114364624023, + "learning_rate": 9.717080599812559e-06, + "loss": 0.2701, + "step": 4830 + }, + { + "epoch": 2.268041237113402, + "grad_norm": 3.1494383811950684, + "learning_rate": 9.716494845360826e-06, + "loss": 0.2474, + "step": 4840 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 2.392869472503662, + "learning_rate": 9.715909090909091e-06, + "loss": 0.2571, + "step": 4850 + }, + { + "epoch": 2.2774133083411434, + "grad_norm": 2.711582660675049, + "learning_rate": 9.715323336457358e-06, + "loss": 0.2778, + "step": 4860 + }, + { + "epoch": 2.282099343955014, + "grad_norm": 3.1134321689605713, + "learning_rate": 9.714737582005625e-06, + "loss": 0.2702, + "step": 4870 + }, + { + "epoch": 2.286785379568885, + "grad_norm": 3.022188901901245, + "learning_rate": 9.71415182755389e-06, + "loss": 0.248, + "step": 4880 + }, + { + "epoch": 2.2914714151827553, + "grad_norm": 2.618468999862671, + "learning_rate": 9.713566073102157e-06, + "loss": 0.2878, + "step": 4890 + }, + { + "epoch": 2.296157450796626, + "grad_norm": 2.559990882873535, + "learning_rate": 9.712980318650422e-06, + "loss": 0.259, + "step": 4900 + }, + { + "epoch": 2.3008434864104967, + "grad_norm": 3.009366750717163, + "learning_rate": 9.712394564198689e-06, + "loss": 0.2662, + "step": 4910 + }, + { + "epoch": 2.3055295220243672, + "grad_norm": 2.6885673999786377, + "learning_rate": 9.711808809746956e-06, + "loss": 0.2488, + "step": 4920 + }, + { + "epoch": 2.310215557638238, + "grad_norm": 2.8503615856170654, + "learning_rate": 9.711223055295221e-06, + "loss": 0.2508, + "step": 4930 + }, + { + "epoch": 2.3149015932521086, + "grad_norm": 2.469139575958252, + "learning_rate": 9.710637300843488e-06, + "loss": 0.3025, + "step": 4940 + }, + { + "epoch": 2.319587628865979, + "grad_norm": 3.6701242923736572, + "learning_rate": 9.710051546391753e-06, + "loss": 0.3029, + "step": 4950 + }, + { + "epoch": 2.32427366447985, + "grad_norm": 2.860112428665161, + "learning_rate": 9.709465791940019e-06, + "loss": 0.2924, + "step": 4960 + }, + { + "epoch": 2.3289597000937206, + "grad_norm": 1.8148601055145264, + "learning_rate": 9.708880037488286e-06, + "loss": 0.3011, + "step": 4970 + }, + { + "epoch": 2.3336457357075915, + "grad_norm": 3.073507070541382, + "learning_rate": 9.708294283036552e-06, + "loss": 0.2664, + "step": 4980 + }, + { + "epoch": 2.338331771321462, + "grad_norm": 3.759744882583618, + "learning_rate": 9.707708528584818e-06, + "loss": 0.2977, + "step": 4990 + }, + { + "epoch": 2.3430178069353325, + "grad_norm": 2.8136818408966064, + "learning_rate": 9.707122774133085e-06, + "loss": 0.28, + "step": 5000 + }, + { + "epoch": 2.3430178069353325, + "eval_loss": 0.0644521713256836, + "eval_pearson_cosine": 0.7653835203754937, + "eval_pearson_dot": 0.5924636721600756, + "eval_pearson_euclidean": 0.7568919402450831, + "eval_pearson_manhattan": 0.7567361379039284, + "eval_runtime": 40.0042, + "eval_samples_per_second": 37.496, + "eval_spearman_cosine": 0.7760073163206256, + "eval_spearman_dot": 0.5970251712536361, + "eval_spearman_euclidean": 0.7705144458647132, + "eval_spearman_manhattan": 0.7705196006726509, + "eval_steps_per_second": 37.496, + "step": 5000 + }, + { + "epoch": 2.3477038425492034, + "grad_norm": 2.8381240367889404, + "learning_rate": 9.70653701968135e-06, + "loss": 0.2584, + "step": 5010 + }, + { + "epoch": 2.352389878163074, + "grad_norm": 2.48789381980896, + "learning_rate": 9.705951265229617e-06, + "loss": 0.2518, + "step": 5020 + }, + { + "epoch": 2.357075913776945, + "grad_norm": 2.8016576766967773, + "learning_rate": 9.705365510777884e-06, + "loss": 0.2839, + "step": 5030 + }, + { + "epoch": 2.3617619493908153, + "grad_norm": 3.750737428665161, + "learning_rate": 9.704779756326149e-06, + "loss": 0.2757, + "step": 5040 + }, + { + "epoch": 2.3664479850046862, + "grad_norm": 3.028477668762207, + "learning_rate": 9.704194001874416e-06, + "loss": 0.2542, + "step": 5050 + }, + { + "epoch": 2.3711340206185567, + "grad_norm": 2.5787017345428467, + "learning_rate": 9.703608247422681e-06, + "loss": 0.2623, + "step": 5060 + }, + { + "epoch": 2.375820056232427, + "grad_norm": 3.54349422454834, + "learning_rate": 9.703022492970948e-06, + "loss": 0.2872, + "step": 5070 + }, + { + "epoch": 2.380506091846298, + "grad_norm": 3.924848794937134, + "learning_rate": 9.702436738519213e-06, + "loss": 0.2641, + "step": 5080 + }, + { + "epoch": 2.3851921274601686, + "grad_norm": 2.8141496181488037, + "learning_rate": 9.70185098406748e-06, + "loss": 0.2549, + "step": 5090 + }, + { + "epoch": 2.3898781630740396, + "grad_norm": 3.0236399173736572, + "learning_rate": 9.701265229615747e-06, + "loss": 0.2715, + "step": 5100 + }, + { + "epoch": 2.39456419868791, + "grad_norm": 2.611743688583374, + "learning_rate": 9.700679475164012e-06, + "loss": 0.2572, + "step": 5110 + }, + { + "epoch": 2.3992502343017805, + "grad_norm": 2.905355930328369, + "learning_rate": 9.700093720712277e-06, + "loss": 0.2772, + "step": 5120 + }, + { + "epoch": 2.4039362699156515, + "grad_norm": 1.6498035192489624, + "learning_rate": 9.699507966260544e-06, + "loss": 0.2776, + "step": 5130 + }, + { + "epoch": 2.408622305529522, + "grad_norm": 4.166587829589844, + "learning_rate": 9.69892221180881e-06, + "loss": 0.283, + "step": 5140 + }, + { + "epoch": 2.413308341143393, + "grad_norm": 2.960628032684326, + "learning_rate": 9.698336457357076e-06, + "loss": 0.2562, + "step": 5150 + }, + { + "epoch": 2.4179943767572634, + "grad_norm": 3.179250955581665, + "learning_rate": 9.697750702905343e-06, + "loss": 0.2654, + "step": 5160 + }, + { + "epoch": 2.422680412371134, + "grad_norm": 2.3080592155456543, + "learning_rate": 9.697164948453609e-06, + "loss": 0.2728, + "step": 5170 + }, + { + "epoch": 2.427366447985005, + "grad_norm": 2.4087467193603516, + "learning_rate": 9.696579194001876e-06, + "loss": 0.3017, + "step": 5180 + }, + { + "epoch": 2.4320524835988753, + "grad_norm": 2.0054640769958496, + "learning_rate": 9.69599343955014e-06, + "loss": 0.2592, + "step": 5190 + }, + { + "epoch": 2.436738519212746, + "grad_norm": 3.7607529163360596, + "learning_rate": 9.695407685098408e-06, + "loss": 0.2501, + "step": 5200 + }, + { + "epoch": 2.4414245548266167, + "grad_norm": 2.0960841178894043, + "learning_rate": 9.694821930646675e-06, + "loss": 0.2937, + "step": 5210 + }, + { + "epoch": 2.446110590440487, + "grad_norm": 2.9174606800079346, + "learning_rate": 9.69423617619494e-06, + "loss": 0.2509, + "step": 5220 + }, + { + "epoch": 2.450796626054358, + "grad_norm": 2.7972617149353027, + "learning_rate": 9.693650421743207e-06, + "loss": 0.2648, + "step": 5230 + }, + { + "epoch": 2.4554826616682286, + "grad_norm": 2.465677261352539, + "learning_rate": 9.693064667291472e-06, + "loss": 0.2888, + "step": 5240 + }, + { + "epoch": 2.4601686972820995, + "grad_norm": 2.6547045707702637, + "learning_rate": 9.692478912839737e-06, + "loss": 0.2631, + "step": 5250 + }, + { + "epoch": 2.4601686972820995, + "eval_loss": 0.06392496824264526, + "eval_pearson_cosine": 0.771214841842891, + "eval_pearson_dot": 0.5715473025039159, + "eval_pearson_euclidean": 0.75617638488007, + "eval_pearson_manhattan": 0.7560991988119383, + "eval_runtime": 40.3523, + "eval_samples_per_second": 37.173, + "eval_spearman_cosine": 0.7797840912495824, + "eval_spearman_dot": 0.5730858147900617, + "eval_spearman_euclidean": 0.7705235550970289, + "eval_spearman_manhattan": 0.7705309049511561, + "eval_steps_per_second": 37.173, + "step": 5250 + }, + { + "epoch": 2.46485473289597, + "grad_norm": 3.2558162212371826, + "learning_rate": 9.691893158388006e-06, + "loss": 0.2874, + "step": 5260 + }, + { + "epoch": 2.4695407685098405, + "grad_norm": 2.307089328765869, + "learning_rate": 9.691307403936271e-06, + "loss": 0.2681, + "step": 5270 + }, + { + "epoch": 2.4742268041237114, + "grad_norm": 3.001068115234375, + "learning_rate": 9.690721649484536e-06, + "loss": 0.2815, + "step": 5280 + }, + { + "epoch": 2.478912839737582, + "grad_norm": 2.5946176052093506, + "learning_rate": 9.690135895032803e-06, + "loss": 0.2639, + "step": 5290 + }, + { + "epoch": 2.483598875351453, + "grad_norm": 2.6081533432006836, + "learning_rate": 9.689550140581068e-06, + "loss": 0.2707, + "step": 5300 + }, + { + "epoch": 2.4882849109653233, + "grad_norm": 3.037405490875244, + "learning_rate": 9.688964386129335e-06, + "loss": 0.2844, + "step": 5310 + }, + { + "epoch": 2.492970946579194, + "grad_norm": 2.944249391555786, + "learning_rate": 9.688378631677602e-06, + "loss": 0.2728, + "step": 5320 + }, + { + "epoch": 2.4976569821930648, + "grad_norm": 2.7798855304718018, + "learning_rate": 9.687792877225867e-06, + "loss": 0.3013, + "step": 5330 + }, + { + "epoch": 2.5023430178069352, + "grad_norm": 2.005322217941284, + "learning_rate": 9.687207122774134e-06, + "loss": 0.2523, + "step": 5340 + }, + { + "epoch": 2.5070290534208057, + "grad_norm": 2.831803321838379, + "learning_rate": 9.6866213683224e-06, + "loss": 0.2542, + "step": 5350 + }, + { + "epoch": 2.5117150890346767, + "grad_norm": 3.1902036666870117, + "learning_rate": 9.686035613870666e-06, + "loss": 0.241, + "step": 5360 + }, + { + "epoch": 2.5164011246485476, + "grad_norm": 2.875300884246826, + "learning_rate": 9.685449859418933e-06, + "loss": 0.264, + "step": 5370 + }, + { + "epoch": 2.521087160262418, + "grad_norm": 3.253399133682251, + "learning_rate": 9.684864104967199e-06, + "loss": 0.3013, + "step": 5380 + }, + { + "epoch": 2.5257731958762886, + "grad_norm": 2.7400455474853516, + "learning_rate": 9.684278350515465e-06, + "loss": 0.3005, + "step": 5390 + }, + { + "epoch": 2.530459231490159, + "grad_norm": 2.604724884033203, + "learning_rate": 9.68369259606373e-06, + "loss": 0.2805, + "step": 5400 + }, + { + "epoch": 2.53514526710403, + "grad_norm": 2.5421054363250732, + "learning_rate": 9.683106841611996e-06, + "loss": 0.256, + "step": 5410 + }, + { + "epoch": 2.539831302717901, + "grad_norm": 3.7680563926696777, + "learning_rate": 9.682521087160265e-06, + "loss": 0.2685, + "step": 5420 + }, + { + "epoch": 2.5445173383317714, + "grad_norm": 3.140620708465576, + "learning_rate": 9.68193533270853e-06, + "loss": 0.288, + "step": 5430 + }, + { + "epoch": 2.549203373945642, + "grad_norm": 2.77986478805542, + "learning_rate": 9.681349578256795e-06, + "loss": 0.2827, + "step": 5440 + }, + { + "epoch": 2.5538894095595124, + "grad_norm": 3.1461918354034424, + "learning_rate": 9.680763823805062e-06, + "loss": 0.2907, + "step": 5450 + }, + { + "epoch": 2.5585754451733833, + "grad_norm": 2.403411626815796, + "learning_rate": 9.680178069353327e-06, + "loss": 0.2997, + "step": 5460 + }, + { + "epoch": 2.5632614807872542, + "grad_norm": 2.744910955429077, + "learning_rate": 9.679592314901594e-06, + "loss": 0.2615, + "step": 5470 + }, + { + "epoch": 2.5679475164011247, + "grad_norm": 3.292695999145508, + "learning_rate": 9.679006560449861e-06, + "loss": 0.2679, + "step": 5480 + }, + { + "epoch": 2.572633552014995, + "grad_norm": 2.8364174365997314, + "learning_rate": 9.678420805998126e-06, + "loss": 0.2763, + "step": 5490 + }, + { + "epoch": 2.5773195876288657, + "grad_norm": 2.456524610519409, + "learning_rate": 9.677835051546393e-06, + "loss": 0.2488, + "step": 5500 + }, + { + "epoch": 2.5773195876288657, + "eval_loss": 0.06359264999628067, + "eval_pearson_cosine": 0.7736336142524749, + "eval_pearson_dot": 0.5835126580557244, + "eval_pearson_euclidean": 0.7537697181391394, + "eval_pearson_manhattan": 0.7536913116270512, + "eval_runtime": 39.7446, + "eval_samples_per_second": 37.741, + "eval_spearman_cosine": 0.7838309998226812, + "eval_spearman_dot": 0.5860915978330865, + "eval_spearman_euclidean": 0.76851014194766, + "eval_spearman_manhattan": 0.7687268040235111, + "eval_steps_per_second": 37.741, + "step": 5500 + }, + { + "epoch": 2.5820056232427366, + "grad_norm": 2.6862993240356445, + "learning_rate": 9.677249297094658e-06, + "loss": 0.3049, + "step": 5510 + }, + { + "epoch": 2.5866916588566076, + "grad_norm": 2.146784782409668, + "learning_rate": 9.676663542642925e-06, + "loss": 0.2844, + "step": 5520 + }, + { + "epoch": 2.591377694470478, + "grad_norm": 2.9161078929901123, + "learning_rate": 9.676077788191192e-06, + "loss": 0.2505, + "step": 5530 + }, + { + "epoch": 2.5960637300843485, + "grad_norm": 2.771800994873047, + "learning_rate": 9.675492033739457e-06, + "loss": 0.2741, + "step": 5540 + }, + { + "epoch": 2.600749765698219, + "grad_norm": 2.6023566722869873, + "learning_rate": 9.674906279287724e-06, + "loss": 0.271, + "step": 5550 + }, + { + "epoch": 2.60543580131209, + "grad_norm": 3.007667064666748, + "learning_rate": 9.67432052483599e-06, + "loss": 0.2451, + "step": 5560 + }, + { + "epoch": 2.610121836925961, + "grad_norm": 3.014564275741577, + "learning_rate": 9.673734770384255e-06, + "loss": 0.3036, + "step": 5570 + }, + { + "epoch": 2.6148078725398314, + "grad_norm": 2.6932876110076904, + "learning_rate": 9.673149015932522e-06, + "loss": 0.2642, + "step": 5580 + }, + { + "epoch": 2.619493908153702, + "grad_norm": 3.396259307861328, + "learning_rate": 9.672563261480789e-06, + "loss": 0.26, + "step": 5590 + }, + { + "epoch": 2.624179943767573, + "grad_norm": 3.203933000564575, + "learning_rate": 9.671977507029054e-06, + "loss": 0.2629, + "step": 5600 + }, + { + "epoch": 2.6288659793814433, + "grad_norm": 3.077299118041992, + "learning_rate": 9.67139175257732e-06, + "loss": 0.2566, + "step": 5610 + }, + { + "epoch": 2.633552014995314, + "grad_norm": 2.445908308029175, + "learning_rate": 9.670805998125586e-06, + "loss": 0.2751, + "step": 5620 + }, + { + "epoch": 2.6382380506091847, + "grad_norm": 2.4680237770080566, + "learning_rate": 9.670220243673853e-06, + "loss": 0.2725, + "step": 5630 + }, + { + "epoch": 2.642924086223055, + "grad_norm": 3.320460081100464, + "learning_rate": 9.669634489222118e-06, + "loss": 0.3, + "step": 5640 + }, + { + "epoch": 2.647610121836926, + "grad_norm": 2.6988685131073, + "learning_rate": 9.669048734770385e-06, + "loss": 0.2754, + "step": 5650 + }, + { + "epoch": 2.6522961574507966, + "grad_norm": 3.1843039989471436, + "learning_rate": 9.668462980318652e-06, + "loss": 0.2711, + "step": 5660 + }, + { + "epoch": 2.6569821930646675, + "grad_norm": 2.5606629848480225, + "learning_rate": 9.667877225866917e-06, + "loss": 0.2772, + "step": 5670 + }, + { + "epoch": 2.661668228678538, + "grad_norm": 2.2161924839019775, + "learning_rate": 9.667291471415184e-06, + "loss": 0.2501, + "step": 5680 + }, + { + "epoch": 2.6663542642924085, + "grad_norm": 2.6044790744781494, + "learning_rate": 9.66670571696345e-06, + "loss": 0.2859, + "step": 5690 + }, + { + "epoch": 2.6710402999062794, + "grad_norm": 2.9149224758148193, + "learning_rate": 9.666119962511716e-06, + "loss": 0.2834, + "step": 5700 + }, + { + "epoch": 2.67572633552015, + "grad_norm": 3.0715954303741455, + "learning_rate": 9.665534208059983e-06, + "loss": 0.2802, + "step": 5710 + }, + { + "epoch": 2.680412371134021, + "grad_norm": 2.8845441341400146, + "learning_rate": 9.664948453608248e-06, + "loss": 0.2594, + "step": 5720 + }, + { + "epoch": 2.6850984067478914, + "grad_norm": 3.3252668380737305, + "learning_rate": 9.664362699156514e-06, + "loss": 0.2625, + "step": 5730 + }, + { + "epoch": 2.689784442361762, + "grad_norm": 2.8618457317352295, + "learning_rate": 9.66377694470478e-06, + "loss": 0.2769, + "step": 5740 + }, + { + "epoch": 2.6944704779756328, + "grad_norm": 3.196368455886841, + "learning_rate": 9.663191190253046e-06, + "loss": 0.2557, + "step": 5750 + }, + { + "epoch": 2.6944704779756328, + "eval_loss": 0.06136869639158249, + "eval_pearson_cosine": 0.7739160712003912, + "eval_pearson_dot": 0.6008471845975407, + "eval_pearson_euclidean": 0.7570902529490127, + "eval_pearson_manhattan": 0.7569554961897609, + "eval_runtime": 39.4871, + "eval_samples_per_second": 37.987, + "eval_spearman_cosine": 0.7830112643640406, + "eval_spearman_dot": 0.6040995035629967, + "eval_spearman_euclidean": 0.7716512847518021, + "eval_spearman_manhattan": 0.7716299413202475, + "eval_steps_per_second": 37.987, + "step": 5750 + }, + { + "epoch": 2.6991565135895033, + "grad_norm": 2.449201822280884, + "learning_rate": 9.662605435801313e-06, + "loss": 0.2807, + "step": 5760 + }, + { + "epoch": 2.703842549203374, + "grad_norm": 3.0767366886138916, + "learning_rate": 9.66201968134958e-06, + "loss": 0.2842, + "step": 5770 + }, + { + "epoch": 2.7085285848172447, + "grad_norm": 2.7501325607299805, + "learning_rate": 9.661433926897845e-06, + "loss": 0.281, + "step": 5780 + }, + { + "epoch": 2.713214620431115, + "grad_norm": 3.0382957458496094, + "learning_rate": 9.660848172446112e-06, + "loss": 0.2435, + "step": 5790 + }, + { + "epoch": 2.717900656044986, + "grad_norm": 3.02397084236145, + "learning_rate": 9.660262417994377e-06, + "loss": 0.2797, + "step": 5800 + }, + { + "epoch": 2.7225866916588566, + "grad_norm": 2.8019211292266846, + "learning_rate": 9.659676663542644e-06, + "loss": 0.2728, + "step": 5810 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 2.9764068126678467, + "learning_rate": 9.65909090909091e-06, + "loss": 0.2796, + "step": 5820 + }, + { + "epoch": 2.731958762886598, + "grad_norm": 3.254289150238037, + "learning_rate": 9.658505154639176e-06, + "loss": 0.2967, + "step": 5830 + }, + { + "epoch": 2.7366447985004685, + "grad_norm": 3.0329315662384033, + "learning_rate": 9.657919400187443e-06, + "loss": 0.2631, + "step": 5840 + }, + { + "epoch": 2.7413308341143394, + "grad_norm": 2.720784902572632, + "learning_rate": 9.657333645735708e-06, + "loss": 0.273, + "step": 5850 + }, + { + "epoch": 2.74601686972821, + "grad_norm": 2.6841161251068115, + "learning_rate": 9.656747891283973e-06, + "loss": 0.2729, + "step": 5860 + }, + { + "epoch": 2.750702905342081, + "grad_norm": 3.0265884399414062, + "learning_rate": 9.656162136832242e-06, + "loss": 0.2762, + "step": 5870 + }, + { + "epoch": 2.7553889409559513, + "grad_norm": 2.6341817378997803, + "learning_rate": 9.655576382380507e-06, + "loss": 0.2918, + "step": 5880 + }, + { + "epoch": 2.760074976569822, + "grad_norm": 3.1059679985046387, + "learning_rate": 9.654990627928772e-06, + "loss": 0.2448, + "step": 5890 + }, + { + "epoch": 2.7647610121836927, + "grad_norm": 3.105215311050415, + "learning_rate": 9.65440487347704e-06, + "loss": 0.3158, + "step": 5900 + }, + { + "epoch": 2.7694470477975632, + "grad_norm": 2.5665364265441895, + "learning_rate": 9.653819119025304e-06, + "loss": 0.2742, + "step": 5910 + }, + { + "epoch": 2.774133083411434, + "grad_norm": 3.4417858123779297, + "learning_rate": 9.653233364573571e-06, + "loss": 0.2666, + "step": 5920 + }, + { + "epoch": 2.7788191190253047, + "grad_norm": 3.3958632946014404, + "learning_rate": 9.652647610121838e-06, + "loss": 0.2861, + "step": 5930 + }, + { + "epoch": 2.783505154639175, + "grad_norm": 2.7881741523742676, + "learning_rate": 9.652061855670104e-06, + "loss": 0.2979, + "step": 5940 + }, + { + "epoch": 2.788191190253046, + "grad_norm": 3.3900156021118164, + "learning_rate": 9.65147610121837e-06, + "loss": 0.2919, + "step": 5950 + }, + { + "epoch": 2.7928772258669166, + "grad_norm": 2.6520965099334717, + "learning_rate": 9.650890346766636e-06, + "loss": 0.3384, + "step": 5960 + }, + { + "epoch": 2.7975632614807875, + "grad_norm": 2.6379964351654053, + "learning_rate": 9.650304592314903e-06, + "loss": 0.2553, + "step": 5970 + }, + { + "epoch": 2.802249297094658, + "grad_norm": 3.325380802154541, + "learning_rate": 9.64971883786317e-06, + "loss": 0.2599, + "step": 5980 + }, + { + "epoch": 2.8069353327085285, + "grad_norm": 2.9354546070098877, + "learning_rate": 9.649133083411435e-06, + "loss": 0.2981, + "step": 5990 + }, + { + "epoch": 2.8116213683223994, + "grad_norm": 3.915391445159912, + "learning_rate": 9.648547328959702e-06, + "loss": 0.2699, + "step": 6000 + }, + { + "epoch": 2.8116213683223994, + "eval_loss": 0.063628189265728, + "eval_pearson_cosine": 0.7721771737399621, + "eval_pearson_dot": 0.5844319866585721, + "eval_pearson_euclidean": 0.7572362286473506, + "eval_pearson_manhattan": 0.7569847240428516, + "eval_runtime": 39.6806, + "eval_samples_per_second": 37.802, + "eval_spearman_cosine": 0.7795493971324328, + "eval_spearman_dot": 0.586396480606851, + "eval_spearman_euclidean": 0.7700741181493094, + "eval_spearman_manhattan": 0.7699220379699384, + "eval_steps_per_second": 37.802, + "step": 6000 + }, + { + "epoch": 2.81630740393627, + "grad_norm": 2.758107900619507, + "learning_rate": 9.647961574507967e-06, + "loss": 0.277, + "step": 6010 + }, + { + "epoch": 2.820993439550141, + "grad_norm": 2.5489895343780518, + "learning_rate": 9.647375820056232e-06, + "loss": 0.2776, + "step": 6020 + }, + { + "epoch": 2.8256794751640113, + "grad_norm": 2.8726046085357666, + "learning_rate": 9.6467900656045e-06, + "loss": 0.2719, + "step": 6030 + }, + { + "epoch": 2.830365510777882, + "grad_norm": 2.856821298599243, + "learning_rate": 9.646204311152766e-06, + "loss": 0.2717, + "step": 6040 + }, + { + "epoch": 2.8350515463917527, + "grad_norm": 3.395746946334839, + "learning_rate": 9.645618556701031e-06, + "loss": 0.2657, + "step": 6050 + }, + { + "epoch": 2.839737582005623, + "grad_norm": 2.8443150520324707, + "learning_rate": 9.645032802249298e-06, + "loss": 0.2889, + "step": 6060 + }, + { + "epoch": 2.844423617619494, + "grad_norm": 2.680279016494751, + "learning_rate": 9.644447047797563e-06, + "loss": 0.3128, + "step": 6070 + }, + { + "epoch": 2.8491096532333646, + "grad_norm": 3.375882148742676, + "learning_rate": 9.64386129334583e-06, + "loss": 0.2784, + "step": 6080 + }, + { + "epoch": 2.853795688847235, + "grad_norm": 3.4931676387786865, + "learning_rate": 9.643275538894097e-06, + "loss": 0.2906, + "step": 6090 + }, + { + "epoch": 2.858481724461106, + "grad_norm": 3.0119099617004395, + "learning_rate": 9.642689784442362e-06, + "loss": 0.3064, + "step": 6100 + }, + { + "epoch": 2.8631677600749765, + "grad_norm": 2.9946563243865967, + "learning_rate": 9.64210402999063e-06, + "loss": 0.2708, + "step": 6110 + }, + { + "epoch": 2.8678537956888475, + "grad_norm": 2.687286853790283, + "learning_rate": 9.641518275538894e-06, + "loss": 0.2827, + "step": 6120 + }, + { + "epoch": 2.872539831302718, + "grad_norm": 2.7718350887298584, + "learning_rate": 9.640932521087161e-06, + "loss": 0.2815, + "step": 6130 + }, + { + "epoch": 2.8772258669165884, + "grad_norm": 2.4918980598449707, + "learning_rate": 9.640346766635427e-06, + "loss": 0.2383, + "step": 6140 + }, + { + "epoch": 2.8819119025304594, + "grad_norm": 3.1328234672546387, + "learning_rate": 9.639761012183694e-06, + "loss": 0.2917, + "step": 6150 + }, + { + "epoch": 2.88659793814433, + "grad_norm": 3.1907870769500732, + "learning_rate": 9.63917525773196e-06, + "loss": 0.2682, + "step": 6160 + }, + { + "epoch": 2.891283973758201, + "grad_norm": 2.851372241973877, + "learning_rate": 9.638589503280226e-06, + "loss": 0.2523, + "step": 6170 + }, + { + "epoch": 2.8959700093720713, + "grad_norm": 2.8064658641815186, + "learning_rate": 9.638003748828491e-06, + "loss": 0.276, + "step": 6180 + }, + { + "epoch": 2.9006560449859418, + "grad_norm": 3.0457820892333984, + "learning_rate": 9.637417994376758e-06, + "loss": 0.2713, + "step": 6190 + }, + { + "epoch": 2.9053420805998127, + "grad_norm": 2.8154349327087402, + "learning_rate": 9.636832239925025e-06, + "loss": 0.2757, + "step": 6200 + }, + { + "epoch": 2.910028116213683, + "grad_norm": 2.6084420680999756, + "learning_rate": 9.63624648547329e-06, + "loss": 0.2786, + "step": 6210 + }, + { + "epoch": 2.914714151827554, + "grad_norm": 3.3972620964050293, + "learning_rate": 9.635660731021557e-06, + "loss": 0.284, + "step": 6220 + }, + { + "epoch": 2.9194001874414246, + "grad_norm": 2.3089513778686523, + "learning_rate": 9.635074976569822e-06, + "loss": 0.2912, + "step": 6230 + }, + { + "epoch": 2.924086223055295, + "grad_norm": 2.8329966068267822, + "learning_rate": 9.634489222118089e-06, + "loss": 0.2872, + "step": 6240 + }, + { + "epoch": 2.928772258669166, + "grad_norm": 2.8177921772003174, + "learning_rate": 9.633903467666354e-06, + "loss": 0.2794, + "step": 6250 + }, + { + "epoch": 2.928772258669166, + "eval_loss": 0.06385794281959534, + "eval_pearson_cosine": 0.7704242421671665, + "eval_pearson_dot": 0.5817298961264266, + "eval_pearson_euclidean": 0.7580713117672779, + "eval_pearson_manhattan": 0.7582345366438972, + "eval_runtime": 39.8682, + "eval_samples_per_second": 37.624, + "eval_spearman_cosine": 0.7799516394114882, + "eval_spearman_dot": 0.5793232888676765, + "eval_spearman_euclidean": 0.7745810839892152, + "eval_spearman_manhattan": 0.7744580162860011, + "eval_steps_per_second": 37.624, + "step": 6250 + }, + { + "epoch": 2.9334582942830365, + "grad_norm": 3.0042576789855957, + "learning_rate": 9.633317713214621e-06, + "loss": 0.3072, + "step": 6260 + }, + { + "epoch": 2.9381443298969074, + "grad_norm": 2.3798370361328125, + "learning_rate": 9.632731958762888e-06, + "loss": 0.2472, + "step": 6270 + }, + { + "epoch": 2.942830365510778, + "grad_norm": 3.12076735496521, + "learning_rate": 9.632146204311153e-06, + "loss": 0.2649, + "step": 6280 + }, + { + "epoch": 2.9475164011246484, + "grad_norm": 2.4176595211029053, + "learning_rate": 9.63156044985942e-06, + "loss": 0.3011, + "step": 6290 + }, + { + "epoch": 2.9522024367385193, + "grad_norm": 4.061246871948242, + "learning_rate": 9.630974695407685e-06, + "loss": 0.2905, + "step": 6300 + }, + { + "epoch": 2.95688847235239, + "grad_norm": 3.4516801834106445, + "learning_rate": 9.630388940955952e-06, + "loss": 0.2757, + "step": 6310 + }, + { + "epoch": 2.9615745079662608, + "grad_norm": 2.8717644214630127, + "learning_rate": 9.62980318650422e-06, + "loss": 0.2566, + "step": 6320 + }, + { + "epoch": 2.9662605435801312, + "grad_norm": 2.948293685913086, + "learning_rate": 9.629217432052484e-06, + "loss": 0.2602, + "step": 6330 + }, + { + "epoch": 2.9709465791940017, + "grad_norm": 2.5591063499450684, + "learning_rate": 9.62863167760075e-06, + "loss": 0.2637, + "step": 6340 + }, + { + "epoch": 2.9756326148078727, + "grad_norm": 3.3883564472198486, + "learning_rate": 9.628045923149017e-06, + "loss": 0.3073, + "step": 6350 + }, + { + "epoch": 2.980318650421743, + "grad_norm": 2.641995668411255, + "learning_rate": 9.627460168697282e-06, + "loss": 0.2711, + "step": 6360 + }, + { + "epoch": 2.985004686035614, + "grad_norm": 2.4701485633850098, + "learning_rate": 9.626874414245549e-06, + "loss": 0.2458, + "step": 6370 + }, + { + "epoch": 2.9896907216494846, + "grad_norm": 3.103329658508301, + "learning_rate": 9.626288659793816e-06, + "loss": 0.2854, + "step": 6380 + }, + { + "epoch": 2.994376757263355, + "grad_norm": 2.9376931190490723, + "learning_rate": 9.625702905342081e-06, + "loss": 0.3098, + "step": 6390 + }, + { + "epoch": 2.999062792877226, + "grad_norm": 2.806793212890625, + "learning_rate": 9.625117150890348e-06, + "loss": 0.2574, + "step": 6400 + }, + { + "epoch": 3.0037488284910965, + "grad_norm": 2.9033491611480713, + "learning_rate": 9.624531396438613e-06, + "loss": 0.2101, + "step": 6410 + }, + { + "epoch": 3.0084348641049674, + "grad_norm": 2.3773772716522217, + "learning_rate": 9.62394564198688e-06, + "loss": 0.1983, + "step": 6420 + }, + { + "epoch": 3.013120899718838, + "grad_norm": 2.7549703121185303, + "learning_rate": 9.623359887535147e-06, + "loss": 0.1925, + "step": 6430 + }, + { + "epoch": 3.0178069353327084, + "grad_norm": 2.121821880340576, + "learning_rate": 9.622774133083412e-06, + "loss": 0.1893, + "step": 6440 + }, + { + "epoch": 3.0224929709465793, + "grad_norm": 1.9096688032150269, + "learning_rate": 9.622188378631679e-06, + "loss": 0.1832, + "step": 6450 + }, + { + "epoch": 3.02717900656045, + "grad_norm": 1.793131947517395, + "learning_rate": 9.621602624179944e-06, + "loss": 0.1903, + "step": 6460 + }, + { + "epoch": 3.0318650421743207, + "grad_norm": 2.6187257766723633, + "learning_rate": 9.62101686972821e-06, + "loss": 0.1846, + "step": 6470 + }, + { + "epoch": 3.036551077788191, + "grad_norm": 3.1435282230377197, + "learning_rate": 9.620431115276478e-06, + "loss": 0.2057, + "step": 6480 + }, + { + "epoch": 3.0412371134020617, + "grad_norm": 1.9570343494415283, + "learning_rate": 9.619845360824743e-06, + "loss": 0.2094, + "step": 6490 + }, + { + "epoch": 3.0459231490159326, + "grad_norm": 1.9423924684524536, + "learning_rate": 9.619259606373008e-06, + "loss": 0.1778, + "step": 6500 + }, + { + "epoch": 3.0459231490159326, + "eval_loss": 0.05262026935815811, + "eval_pearson_cosine": 0.7738324758503552, + "eval_pearson_dot": 0.6192771918549269, + "eval_pearson_euclidean": 0.7572697799605308, + "eval_pearson_manhattan": 0.7574249862527225, + "eval_runtime": 39.2621, + "eval_samples_per_second": 38.205, + "eval_spearman_cosine": 0.7811305074925675, + "eval_spearman_dot": 0.6255090522219068, + "eval_spearman_euclidean": 0.7739030959623808, + "eval_spearman_manhattan": 0.7738816457691842, + "eval_steps_per_second": 38.205, + "step": 6500 + }, + { + "epoch": 3.050609184629803, + "grad_norm": 2.544872760772705, + "learning_rate": 9.618673851921275e-06, + "loss": 0.2161, + "step": 6510 + }, + { + "epoch": 3.055295220243674, + "grad_norm": 3.1003711223602295, + "learning_rate": 9.61808809746954e-06, + "loss": 0.1964, + "step": 6520 + }, + { + "epoch": 3.0599812558575445, + "grad_norm": 2.2699685096740723, + "learning_rate": 9.617502343017808e-06, + "loss": 0.1992, + "step": 6530 + }, + { + "epoch": 3.064667291471415, + "grad_norm": 1.655086636543274, + "learning_rate": 9.616916588566074e-06, + "loss": 0.1834, + "step": 6540 + }, + { + "epoch": 3.069353327085286, + "grad_norm": 2.532160758972168, + "learning_rate": 9.61633083411434e-06, + "loss": 0.1978, + "step": 6550 + }, + { + "epoch": 3.0740393626991565, + "grad_norm": 3.1178247928619385, + "learning_rate": 9.615745079662607e-06, + "loss": 0.2249, + "step": 6560 + }, + { + "epoch": 3.0787253983130274, + "grad_norm": 3.530034303665161, + "learning_rate": 9.615159325210872e-06, + "loss": 0.214, + "step": 6570 + }, + { + "epoch": 3.083411433926898, + "grad_norm": 2.2964463233947754, + "learning_rate": 9.614573570759139e-06, + "loss": 0.1695, + "step": 6580 + }, + { + "epoch": 3.0880974695407684, + "grad_norm": 2.613321304321289, + "learning_rate": 9.613987816307406e-06, + "loss": 0.1888, + "step": 6590 + }, + { + "epoch": 3.0927835051546393, + "grad_norm": 3.2321712970733643, + "learning_rate": 9.613402061855671e-06, + "loss": 0.1721, + "step": 6600 + }, + { + "epoch": 3.0974695407685098, + "grad_norm": 2.779963731765747, + "learning_rate": 9.612816307403938e-06, + "loss": 0.2098, + "step": 6610 + }, + { + "epoch": 3.1021555763823807, + "grad_norm": 2.531707763671875, + "learning_rate": 9.612230552952203e-06, + "loss": 0.1906, + "step": 6620 + }, + { + "epoch": 3.106841611996251, + "grad_norm": 3.4784843921661377, + "learning_rate": 9.611644798500468e-06, + "loss": 0.1856, + "step": 6630 + }, + { + "epoch": 3.1115276476101217, + "grad_norm": 2.682420253753662, + "learning_rate": 9.611059044048735e-06, + "loss": 0.2047, + "step": 6640 + }, + { + "epoch": 3.1162136832239926, + "grad_norm": 2.4793782234191895, + "learning_rate": 9.610473289597002e-06, + "loss": 0.2062, + "step": 6650 + }, + { + "epoch": 3.120899718837863, + "grad_norm": 2.4194202423095703, + "learning_rate": 9.609887535145267e-06, + "loss": 0.184, + "step": 6660 + }, + { + "epoch": 3.125585754451734, + "grad_norm": 2.717632293701172, + "learning_rate": 9.609301780693534e-06, + "loss": 0.2148, + "step": 6670 + }, + { + "epoch": 3.1302717900656045, + "grad_norm": 2.401958703994751, + "learning_rate": 9.6087160262418e-06, + "loss": 0.1912, + "step": 6680 + }, + { + "epoch": 3.134957825679475, + "grad_norm": 2.585925340652466, + "learning_rate": 9.608130271790066e-06, + "loss": 0.1784, + "step": 6690 + }, + { + "epoch": 3.139643861293346, + "grad_norm": 3.0962131023406982, + "learning_rate": 9.607544517338333e-06, + "loss": 0.2255, + "step": 6700 + }, + { + "epoch": 3.1443298969072164, + "grad_norm": 2.8932714462280273, + "learning_rate": 9.606958762886598e-06, + "loss": 0.2022, + "step": 6710 + }, + { + "epoch": 3.1490159325210874, + "grad_norm": 3.122772693634033, + "learning_rate": 9.606373008434865e-06, + "loss": 0.224, + "step": 6720 + }, + { + "epoch": 3.153701968134958, + "grad_norm": 2.3057713508605957, + "learning_rate": 9.60578725398313e-06, + "loss": 0.1793, + "step": 6730 + }, + { + "epoch": 3.1583880037488283, + "grad_norm": 2.4614179134368896, + "learning_rate": 9.605201499531398e-06, + "loss": 0.2021, + "step": 6740 + }, + { + "epoch": 3.1630740393626993, + "grad_norm": 2.207731008529663, + "learning_rate": 9.604615745079663e-06, + "loss": 0.1791, + "step": 6750 + }, + { + "epoch": 3.1630740393626993, + "eval_loss": 0.051918212324380875, + "eval_pearson_cosine": 0.7727540079103647, + "eval_pearson_dot": 0.6115735313924304, + "eval_pearson_euclidean": 0.7538142936820122, + "eval_pearson_manhattan": 0.7540359717423257, + "eval_runtime": 40.2509, + "eval_samples_per_second": 37.266, + "eval_spearman_cosine": 0.7783352094358659, + "eval_spearman_dot": 0.6182056058746139, + "eval_spearman_euclidean": 0.7700146198863392, + "eval_spearman_manhattan": 0.7703753603565089, + "eval_steps_per_second": 37.266, + "step": 6750 + }, + { + "epoch": 3.1677600749765698, + "grad_norm": 3.1614291667938232, + "learning_rate": 9.60402999062793e-06, + "loss": 0.2023, + "step": 6760 + }, + { + "epoch": 3.1724461105904407, + "grad_norm": 2.1920371055603027, + "learning_rate": 9.603444236176197e-06, + "loss": 0.2102, + "step": 6770 + }, + { + "epoch": 3.177132146204311, + "grad_norm": 2.640230894088745, + "learning_rate": 9.602858481724462e-06, + "loss": 0.1945, + "step": 6780 + }, + { + "epoch": 3.1818181818181817, + "grad_norm": 2.749765396118164, + "learning_rate": 9.602272727272727e-06, + "loss": 0.1929, + "step": 6790 + }, + { + "epoch": 3.1865042174320526, + "grad_norm": 2.674724578857422, + "learning_rate": 9.601686972820994e-06, + "loss": 0.1615, + "step": 6800 + }, + { + "epoch": 3.191190253045923, + "grad_norm": 2.891061782836914, + "learning_rate": 9.601101218369261e-06, + "loss": 0.1957, + "step": 6810 + }, + { + "epoch": 3.195876288659794, + "grad_norm": 2.452010154724121, + "learning_rate": 9.600515463917526e-06, + "loss": 0.1728, + "step": 6820 + }, + { + "epoch": 3.2005623242736645, + "grad_norm": 2.2987475395202637, + "learning_rate": 9.599929709465793e-06, + "loss": 0.2001, + "step": 6830 + }, + { + "epoch": 3.205248359887535, + "grad_norm": 1.8266733884811401, + "learning_rate": 9.599343955014058e-06, + "loss": 0.178, + "step": 6840 + }, + { + "epoch": 3.209934395501406, + "grad_norm": 2.5674402713775635, + "learning_rate": 9.598758200562325e-06, + "loss": 0.1998, + "step": 6850 + }, + { + "epoch": 3.2146204311152764, + "grad_norm": 2.6671390533447266, + "learning_rate": 9.59817244611059e-06, + "loss": 0.1953, + "step": 6860 + }, + { + "epoch": 3.2193064667291473, + "grad_norm": 2.2907798290252686, + "learning_rate": 9.597586691658857e-06, + "loss": 0.2099, + "step": 6870 + }, + { + "epoch": 3.223992502343018, + "grad_norm": 3.427845001220703, + "learning_rate": 9.597000937207124e-06, + "loss": 0.2197, + "step": 6880 + }, + { + "epoch": 3.2286785379568883, + "grad_norm": 2.7697179317474365, + "learning_rate": 9.59641518275539e-06, + "loss": 0.2015, + "step": 6890 + }, + { + "epoch": 3.2333645735707592, + "grad_norm": 2.18623423576355, + "learning_rate": 9.595829428303656e-06, + "loss": 0.186, + "step": 6900 + }, + { + "epoch": 3.2380506091846297, + "grad_norm": 3.5410492420196533, + "learning_rate": 9.595243673851922e-06, + "loss": 0.1952, + "step": 6910 + }, + { + "epoch": 3.2427366447985007, + "grad_norm": 2.631042242050171, + "learning_rate": 9.594657919400188e-06, + "loss": 0.1987, + "step": 6920 + }, + { + "epoch": 3.247422680412371, + "grad_norm": 2.416839122772217, + "learning_rate": 9.594072164948455e-06, + "loss": 0.1882, + "step": 6930 + }, + { + "epoch": 3.2521087160262416, + "grad_norm": 1.9796010255813599, + "learning_rate": 9.59348641049672e-06, + "loss": 0.2133, + "step": 6940 + }, + { + "epoch": 3.2567947516401126, + "grad_norm": 2.4925854206085205, + "learning_rate": 9.592900656044986e-06, + "loss": 0.1915, + "step": 6950 + }, + { + "epoch": 3.261480787253983, + "grad_norm": 2.323317766189575, + "learning_rate": 9.592314901593253e-06, + "loss": 0.1951, + "step": 6960 + }, + { + "epoch": 3.266166822867854, + "grad_norm": 2.2892370223999023, + "learning_rate": 9.591729147141518e-06, + "loss": 0.1938, + "step": 6970 + }, + { + "epoch": 3.2708528584817245, + "grad_norm": 2.5155346393585205, + "learning_rate": 9.591143392689785e-06, + "loss": 0.2233, + "step": 6980 + }, + { + "epoch": 3.275538894095595, + "grad_norm": 2.344982624053955, + "learning_rate": 9.590557638238052e-06, + "loss": 0.2029, + "step": 6990 + }, + { + "epoch": 3.280224929709466, + "grad_norm": 2.588749408721924, + "learning_rate": 9.589971883786317e-06, + "loss": 0.201, + "step": 7000 + }, + { + "epoch": 3.280224929709466, + "eval_loss": 0.05114726722240448, + "eval_pearson_cosine": 0.7755109260376774, + "eval_pearson_dot": 0.6038894215473363, + "eval_pearson_euclidean": 0.7503042403542959, + "eval_pearson_manhattan": 0.7506062835400229, + "eval_runtime": 39.6185, + "eval_samples_per_second": 37.861, + "eval_spearman_cosine": 0.782480390320873, + "eval_spearman_dot": 0.6071370021112881, + "eval_spearman_euclidean": 0.7669617369314659, + "eval_spearman_manhattan": 0.767145329588772, + "eval_steps_per_second": 37.861, + "step": 7000 + } + ], + "logging_steps": 10, + "max_steps": 21340, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}