{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.8744142455482662, "eval_steps": 250, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004686035613870665, "grad_norm": 2.907787561416626, "learning_rate": 9.997071227741332e-06, "loss": 3.3815, "step": 10 }, { "epoch": 0.00937207122774133, "grad_norm": 2.2910118103027344, "learning_rate": 9.994142455482663e-06, "loss": 3.3605, "step": 20 }, { "epoch": 0.014058106841611996, "grad_norm": 2.791727066040039, "learning_rate": 9.991213683223994e-06, "loss": 3.3338, "step": 30 }, { "epoch": 0.01874414245548266, "grad_norm": 2.881253242492676, "learning_rate": 9.988284910965324e-06, "loss": 3.3047, "step": 40 }, { "epoch": 0.023430178069353328, "grad_norm": 3.5495920181274414, "learning_rate": 9.985356138706655e-06, "loss": 3.266, "step": 50 }, { "epoch": 0.028116213683223992, "grad_norm": 3.8195812702178955, "learning_rate": 9.982427366447985e-06, "loss": 3.2116, "step": 60 }, { "epoch": 0.03280224929709466, "grad_norm": 5.006792068481445, "learning_rate": 9.979498594189316e-06, "loss": 3.1271, "step": 70 }, { "epoch": 0.03748828491096532, "grad_norm": 5.206729412078857, "learning_rate": 9.976569821930647e-06, "loss": 3.0472, "step": 80 }, { "epoch": 0.04217432052483599, "grad_norm": 6.317724227905273, "learning_rate": 9.973641049671978e-06, "loss": 2.9458, "step": 90 }, { "epoch": 0.046860356138706656, "grad_norm": 7.30826997756958, "learning_rate": 9.97071227741331e-06, "loss": 2.9002, "step": 100 }, { "epoch": 0.05154639175257732, "grad_norm": 7.05161190032959, "learning_rate": 9.96778350515464e-06, "loss": 2.8379, "step": 110 }, { "epoch": 0.056232427366447985, "grad_norm": 12.389013290405273, "learning_rate": 9.964854732895972e-06, "loss": 2.7637, "step": 120 }, { "epoch": 0.06091846298031865, "grad_norm": 19.661762237548828, "learning_rate": 9.961925960637301e-06, "loss": 2.7413, "step": 130 }, { "epoch": 0.06560449859418932, "grad_norm": 7.9712018966674805, "learning_rate": 9.958997188378632e-06, "loss": 2.6953, "step": 140 }, { "epoch": 0.07029053420805999, "grad_norm": 44.79791259765625, "learning_rate": 9.956068416119962e-06, "loss": 2.6795, "step": 150 }, { "epoch": 0.07497656982193064, "grad_norm": 7.748485565185547, "learning_rate": 9.953139643861293e-06, "loss": 2.6179, "step": 160 }, { "epoch": 0.07966260543580131, "grad_norm": 7.135361194610596, "learning_rate": 9.950210871602624e-06, "loss": 2.5714, "step": 170 }, { "epoch": 0.08434864104967198, "grad_norm": 5.464244365692139, "learning_rate": 9.947282099343956e-06, "loss": 2.4817, "step": 180 }, { "epoch": 0.08903467666354264, "grad_norm": 10.304727554321289, "learning_rate": 9.944353327085287e-06, "loss": 2.3939, "step": 190 }, { "epoch": 0.09372071227741331, "grad_norm": 8.390380859375, "learning_rate": 9.941424554826618e-06, "loss": 2.3162, "step": 200 }, { "epoch": 0.09840674789128398, "grad_norm": 7.206277847290039, "learning_rate": 9.938495782567949e-06, "loss": 2.2413, "step": 210 }, { "epoch": 0.10309278350515463, "grad_norm": 10.72529411315918, "learning_rate": 9.935567010309279e-06, "loss": 2.1816, "step": 220 }, { "epoch": 0.1077788191190253, "grad_norm": 8.411327362060547, "learning_rate": 9.93263823805061e-06, "loss": 2.0204, "step": 230 }, { "epoch": 0.11246485473289597, "grad_norm": 9.118602752685547, "learning_rate": 9.929709465791941e-06, "loss": 1.9329, "step": 240 }, { "epoch": 0.11715089034676664, "grad_norm": 11.883502960205078, "learning_rate": 9.92678069353327e-06, "loss": 1.8041, "step": 250 }, { "epoch": 0.11715089034676664, "eval_loss": 0.20095524191856384, "eval_pearson_cosine": 0.5629603652959432, "eval_pearson_dot": 0.32442021258601983, "eval_pearson_euclidean": 0.5948642130310873, "eval_pearson_manhattan": 0.5931866084570743, "eval_runtime": 46.3498, "eval_samples_per_second": 32.363, "eval_spearman_cosine": 0.5645428688364399, "eval_spearman_dot": 0.3123519595505677, "eval_spearman_euclidean": 0.5966715855304487, "eval_spearman_manhattan": 0.5951499296436052, "eval_steps_per_second": 32.363, "step": 250 }, { "epoch": 0.1218369259606373, "grad_norm": 9.455839157104492, "learning_rate": 9.923851921274602e-06, "loss": 1.7175, "step": 260 }, { "epoch": 0.12652296157450796, "grad_norm": 9.907763481140137, "learning_rate": 9.920923149015933e-06, "loss": 1.5752, "step": 270 }, { "epoch": 0.13120899718837864, "grad_norm": 10.268372535705566, "learning_rate": 9.917994376757264e-06, "loss": 1.5905, "step": 280 }, { "epoch": 0.1358950328022493, "grad_norm": 12.264440536499023, "learning_rate": 9.915065604498595e-06, "loss": 1.4994, "step": 290 }, { "epoch": 0.14058106841611998, "grad_norm": 10.21927547454834, "learning_rate": 9.912136832239926e-06, "loss": 1.4741, "step": 300 }, { "epoch": 0.14526710402999063, "grad_norm": 12.204063415527344, "learning_rate": 9.909208059981256e-06, "loss": 1.3685, "step": 310 }, { "epoch": 0.14995313964386128, "grad_norm": 8.701486587524414, "learning_rate": 9.906279287722587e-06, "loss": 1.3407, "step": 320 }, { "epoch": 0.15463917525773196, "grad_norm": 11.478012084960938, "learning_rate": 9.903350515463918e-06, "loss": 1.3996, "step": 330 }, { "epoch": 0.15932521087160262, "grad_norm": 8.862137794494629, "learning_rate": 9.90042174320525e-06, "loss": 1.2921, "step": 340 }, { "epoch": 0.1640112464854733, "grad_norm": 8.181413650512695, "learning_rate": 9.897492970946579e-06, "loss": 1.2948, "step": 350 }, { "epoch": 0.16869728209934395, "grad_norm": 12.891910552978516, "learning_rate": 9.89456419868791e-06, "loss": 1.2444, "step": 360 }, { "epoch": 0.1733833177132146, "grad_norm": 9.783638000488281, "learning_rate": 9.891635426429241e-06, "loss": 1.1765, "step": 370 }, { "epoch": 0.1780693533270853, "grad_norm": 10.521812438964844, "learning_rate": 9.888706654170573e-06, "loss": 1.2163, "step": 380 }, { "epoch": 0.18275538894095594, "grad_norm": 9.507091522216797, "learning_rate": 9.885777881911904e-06, "loss": 1.1555, "step": 390 }, { "epoch": 0.18744142455482662, "grad_norm": 10.072102546691895, "learning_rate": 9.882849109653235e-06, "loss": 1.1631, "step": 400 }, { "epoch": 0.19212746016869728, "grad_norm": 12.557927131652832, "learning_rate": 9.879920337394564e-06, "loss": 1.1319, "step": 410 }, { "epoch": 0.19681349578256796, "grad_norm": 7.743768692016602, "learning_rate": 9.876991565135896e-06, "loss": 1.2022, "step": 420 }, { "epoch": 0.2014995313964386, "grad_norm": 9.258079528808594, "learning_rate": 9.874062792877227e-06, "loss": 1.1219, "step": 430 }, { "epoch": 0.20618556701030927, "grad_norm": 8.362629890441895, "learning_rate": 9.871134020618558e-06, "loss": 1.1138, "step": 440 }, { "epoch": 0.21087160262417995, "grad_norm": 8.71789264678955, "learning_rate": 9.868205248359888e-06, "loss": 1.0473, "step": 450 }, { "epoch": 0.2155576382380506, "grad_norm": 8.710640907287598, "learning_rate": 9.865276476101219e-06, "loss": 1.0933, "step": 460 }, { "epoch": 0.22024367385192128, "grad_norm": 7.57949686050415, "learning_rate": 9.86234770384255e-06, "loss": 1.0429, "step": 470 }, { "epoch": 0.22492970946579194, "grad_norm": 8.775091171264648, "learning_rate": 9.859418931583881e-06, "loss": 1.0406, "step": 480 }, { "epoch": 0.2296157450796626, "grad_norm": 9.942752838134766, "learning_rate": 9.856490159325212e-06, "loss": 1.0526, "step": 490 }, { "epoch": 0.23430178069353327, "grad_norm": 10.166437149047852, "learning_rate": 9.853561387066542e-06, "loss": 1.0265, "step": 500 }, { "epoch": 0.23430178069353327, "eval_loss": 0.09848710149526596, "eval_pearson_cosine": 0.7114527090607083, "eval_pearson_dot": 0.5814656567702485, "eval_pearson_euclidean": 0.7022168021213133, "eval_pearson_manhattan": 0.7010309676073874, "eval_runtime": 48.356, "eval_samples_per_second": 31.02, "eval_spearman_cosine": 0.7098203386273151, "eval_spearman_dot": 0.5861254786395066, "eval_spearman_euclidean": 0.7102590115372712, "eval_spearman_manhattan": 0.7094011853041999, "eval_steps_per_second": 31.02, "step": 500 }, { "epoch": 0.23898781630740393, "grad_norm": 6.910321235656738, "learning_rate": 9.850632614807873e-06, "loss": 1.0267, "step": 510 }, { "epoch": 0.2436738519212746, "grad_norm": 8.010503768920898, "learning_rate": 9.847703842549204e-06, "loss": 0.97, "step": 520 }, { "epoch": 0.24835988753514526, "grad_norm": 8.340336799621582, "learning_rate": 9.844775070290535e-06, "loss": 0.9773, "step": 530 }, { "epoch": 0.2530459231490159, "grad_norm": 6.75998592376709, "learning_rate": 9.841846298031867e-06, "loss": 0.9694, "step": 540 }, { "epoch": 0.25773195876288657, "grad_norm": 6.592973709106445, "learning_rate": 9.838917525773196e-06, "loss": 0.9101, "step": 550 }, { "epoch": 0.2624179943767573, "grad_norm": 8.13701343536377, "learning_rate": 9.835988753514527e-06, "loss": 0.9693, "step": 560 }, { "epoch": 0.26710402999062793, "grad_norm": 10.256951332092285, "learning_rate": 9.833059981255859e-06, "loss": 0.9405, "step": 570 }, { "epoch": 0.2717900656044986, "grad_norm": 9.521321296691895, "learning_rate": 9.83013120899719e-06, "loss": 0.8731, "step": 580 }, { "epoch": 0.27647610121836924, "grad_norm": 7.164852142333984, "learning_rate": 9.82720243673852e-06, "loss": 0.9387, "step": 590 }, { "epoch": 0.28116213683223995, "grad_norm": 8.326433181762695, "learning_rate": 9.82427366447985e-06, "loss": 0.8388, "step": 600 }, { "epoch": 0.2858481724461106, "grad_norm": 8.819974899291992, "learning_rate": 9.821344892221182e-06, "loss": 0.9034, "step": 610 }, { "epoch": 0.29053420805998126, "grad_norm": 6.0674052238464355, "learning_rate": 9.818416119962513e-06, "loss": 0.8225, "step": 620 }, { "epoch": 0.2952202436738519, "grad_norm": 7.898690223693848, "learning_rate": 9.815487347703844e-06, "loss": 0.8916, "step": 630 }, { "epoch": 0.29990627928772257, "grad_norm": 9.459305763244629, "learning_rate": 9.812558575445175e-06, "loss": 0.8771, "step": 640 }, { "epoch": 0.3045923149015933, "grad_norm": 7.231110095977783, "learning_rate": 9.809629803186505e-06, "loss": 0.8575, "step": 650 }, { "epoch": 0.30927835051546393, "grad_norm": 5.850890159606934, "learning_rate": 9.806701030927836e-06, "loss": 0.8294, "step": 660 }, { "epoch": 0.3139643861293346, "grad_norm": 12.532159805297852, "learning_rate": 9.803772258669167e-06, "loss": 0.8745, "step": 670 }, { "epoch": 0.31865042174320524, "grad_norm": 6.576635837554932, "learning_rate": 9.800843486410497e-06, "loss": 0.8167, "step": 680 }, { "epoch": 0.3233364573570759, "grad_norm": 7.243174076080322, "learning_rate": 9.797914714151828e-06, "loss": 0.8886, "step": 690 }, { "epoch": 0.3280224929709466, "grad_norm": 6.775111675262451, "learning_rate": 9.794985941893159e-06, "loss": 0.8205, "step": 700 }, { "epoch": 0.33270852858481725, "grad_norm": 7.494016647338867, "learning_rate": 9.79205716963449e-06, "loss": 0.7778, "step": 710 }, { "epoch": 0.3373945641986879, "grad_norm": 5.593213081359863, "learning_rate": 9.789128397375821e-06, "loss": 0.7875, "step": 720 }, { "epoch": 0.34208059981255856, "grad_norm": 7.325387001037598, "learning_rate": 9.786199625117153e-06, "loss": 0.7839, "step": 730 }, { "epoch": 0.3467666354264292, "grad_norm": 5.411241054534912, "learning_rate": 9.783270852858484e-06, "loss": 0.8363, "step": 740 }, { "epoch": 0.3514526710402999, "grad_norm": 5.667125225067139, "learning_rate": 9.780342080599813e-06, "loss": 0.7904, "step": 750 }, { "epoch": 0.3514526710402999, "eval_loss": 0.07609602808952332, "eval_pearson_cosine": 0.7390127527190131, "eval_pearson_dot": 0.6193519334256266, "eval_pearson_euclidean": 0.7286540107637123, "eval_pearson_manhattan": 0.7280163166143723, "eval_runtime": 48.6286, "eval_samples_per_second": 30.846, "eval_spearman_cosine": 0.7392385981828663, "eval_spearman_dot": 0.6275059521836013, "eval_spearman_euclidean": 0.7379755721813188, "eval_spearman_manhattan": 0.7372480627669395, "eval_steps_per_second": 30.846, "step": 750 }, { "epoch": 0.3561387066541706, "grad_norm": 5.931227207183838, "learning_rate": 9.777413308341144e-06, "loss": 0.7801, "step": 760 }, { "epoch": 0.36082474226804123, "grad_norm": 5.550874710083008, "learning_rate": 9.774484536082474e-06, "loss": 0.7466, "step": 770 }, { "epoch": 0.3655107778819119, "grad_norm": 5.67214298248291, "learning_rate": 9.771555763823805e-06, "loss": 0.7561, "step": 780 }, { "epoch": 0.3701968134957826, "grad_norm": 5.121714115142822, "learning_rate": 9.768626991565136e-06, "loss": 0.7395, "step": 790 }, { "epoch": 0.37488284910965325, "grad_norm": 4.957924842834473, "learning_rate": 9.765698219306467e-06, "loss": 0.7368, "step": 800 }, { "epoch": 0.3795688847235239, "grad_norm": 6.30219030380249, "learning_rate": 9.762769447047799e-06, "loss": 0.8091, "step": 810 }, { "epoch": 0.38425492033739456, "grad_norm": 6.518470287322998, "learning_rate": 9.75984067478913e-06, "loss": 0.7525, "step": 820 }, { "epoch": 0.3889409559512652, "grad_norm": 6.101437568664551, "learning_rate": 9.756911902530461e-06, "loss": 0.7263, "step": 830 }, { "epoch": 0.3936269915651359, "grad_norm": 5.428840160369873, "learning_rate": 9.75398313027179e-06, "loss": 0.7881, "step": 840 }, { "epoch": 0.3983130271790066, "grad_norm": 7.170475482940674, "learning_rate": 9.751054358013122e-06, "loss": 0.7218, "step": 850 }, { "epoch": 0.4029990627928772, "grad_norm": 6.153990745544434, "learning_rate": 9.748125585754453e-06, "loss": 0.748, "step": 860 }, { "epoch": 0.4076850984067479, "grad_norm": 5.364086151123047, "learning_rate": 9.745196813495782e-06, "loss": 0.786, "step": 870 }, { "epoch": 0.41237113402061853, "grad_norm": 5.541423797607422, "learning_rate": 9.742268041237114e-06, "loss": 0.7427, "step": 880 }, { "epoch": 0.41705716963448924, "grad_norm": 5.1667022705078125, "learning_rate": 9.739339268978445e-06, "loss": 0.6918, "step": 890 }, { "epoch": 0.4217432052483599, "grad_norm": 4.839612007141113, "learning_rate": 9.736410496719776e-06, "loss": 0.7056, "step": 900 }, { "epoch": 0.42642924086223055, "grad_norm": 4.407963275909424, "learning_rate": 9.733481724461107e-06, "loss": 0.6313, "step": 910 }, { "epoch": 0.4311152764761012, "grad_norm": 7.052595138549805, "learning_rate": 9.730552952202438e-06, "loss": 0.7489, "step": 920 }, { "epoch": 0.43580131208997186, "grad_norm": 5.71290397644043, "learning_rate": 9.727624179943768e-06, "loss": 0.6578, "step": 930 }, { "epoch": 0.44048734770384257, "grad_norm": 6.3575825691223145, "learning_rate": 9.724695407685099e-06, "loss": 0.6914, "step": 940 }, { "epoch": 0.4451733833177132, "grad_norm": 5.223476886749268, "learning_rate": 9.72176663542643e-06, "loss": 0.6494, "step": 950 }, { "epoch": 0.4498594189315839, "grad_norm": 6.220378398895264, "learning_rate": 9.71883786316776e-06, "loss": 0.6996, "step": 960 }, { "epoch": 0.45454545454545453, "grad_norm": 6.475409507751465, "learning_rate": 9.715909090909091e-06, "loss": 0.721, "step": 970 }, { "epoch": 0.4592314901593252, "grad_norm": 5.10095739364624, "learning_rate": 9.712980318650422e-06, "loss": 0.6734, "step": 980 }, { "epoch": 0.4639175257731959, "grad_norm": 7.8438801765441895, "learning_rate": 9.710051546391753e-06, "loss": 0.7409, "step": 990 }, { "epoch": 0.46860356138706655, "grad_norm": 5.446135997772217, "learning_rate": 9.707122774133085e-06, "loss": 0.6772, "step": 1000 }, { "epoch": 0.46860356138706655, "eval_loss": 0.06938865035772324, "eval_pearson_cosine": 0.7523242546763527, "eval_pearson_dot": 0.6339033623348058, "eval_pearson_euclidean": 0.7449881727323344, "eval_pearson_manhattan": 0.7443626147120028, "eval_runtime": 47.885, "eval_samples_per_second": 31.325, "eval_spearman_cosine": 0.7542578168613095, "eval_spearman_dot": 0.6408093688850417, "eval_spearman_euclidean": 0.7532432307302356, "eval_spearman_manhattan": 0.7526380381288565, "eval_steps_per_second": 31.325, "step": 1000 }, { "epoch": 0.4732895970009372, "grad_norm": 6.391997814178467, "learning_rate": 9.704194001874416e-06, "loss": 0.6965, "step": 1010 }, { "epoch": 0.47797563261480785, "grad_norm": 5.345996379852295, "learning_rate": 9.701265229615747e-06, "loss": 0.6447, "step": 1020 }, { "epoch": 0.48266166822867856, "grad_norm": 5.60822057723999, "learning_rate": 9.698336457357076e-06, "loss": 0.6854, "step": 1030 }, { "epoch": 0.4873477038425492, "grad_norm": 6.488014221191406, "learning_rate": 9.695407685098408e-06, "loss": 0.7089, "step": 1040 }, { "epoch": 0.49203373945641987, "grad_norm": 5.387355804443359, "learning_rate": 9.692478912839737e-06, "loss": 0.6949, "step": 1050 }, { "epoch": 0.4967197750702905, "grad_norm": 5.179281234741211, "learning_rate": 9.689550140581068e-06, "loss": 0.6571, "step": 1060 }, { "epoch": 0.5014058106841612, "grad_norm": 5.786458492279053, "learning_rate": 9.6866213683224e-06, "loss": 0.7154, "step": 1070 }, { "epoch": 0.5060918462980318, "grad_norm": 6.279985427856445, "learning_rate": 9.68369259606373e-06, "loss": 0.6757, "step": 1080 }, { "epoch": 0.5107778819119025, "grad_norm": 4.793182849884033, "learning_rate": 9.680763823805062e-06, "loss": 0.7136, "step": 1090 }, { "epoch": 0.5154639175257731, "grad_norm": 7.646529674530029, "learning_rate": 9.677835051546393e-06, "loss": 0.6396, "step": 1100 }, { "epoch": 0.5201499531396439, "grad_norm": 5.7034912109375, "learning_rate": 9.674906279287724e-06, "loss": 0.665, "step": 1110 }, { "epoch": 0.5248359887535146, "grad_norm": 6.54317045211792, "learning_rate": 9.671977507029054e-06, "loss": 0.6713, "step": 1120 }, { "epoch": 0.5295220243673852, "grad_norm": 5.6496806144714355, "learning_rate": 9.669048734770385e-06, "loss": 0.6876, "step": 1130 }, { "epoch": 0.5342080599812559, "grad_norm": 5.326486110687256, "learning_rate": 9.666119962511716e-06, "loss": 0.6951, "step": 1140 }, { "epoch": 0.5388940955951266, "grad_norm": 5.124545574188232, "learning_rate": 9.663191190253046e-06, "loss": 0.6388, "step": 1150 }, { "epoch": 0.5435801312089972, "grad_norm": 4.34152364730835, "learning_rate": 9.660262417994377e-06, "loss": 0.6322, "step": 1160 }, { "epoch": 0.5482661668228679, "grad_norm": 8.722075462341309, "learning_rate": 9.657333645735708e-06, "loss": 0.6776, "step": 1170 }, { "epoch": 0.5529522024367385, "grad_norm": 5.417623996734619, "learning_rate": 9.65440487347704e-06, "loss": 0.6492, "step": 1180 }, { "epoch": 0.5576382380506092, "grad_norm": 4.369041919708252, "learning_rate": 9.65147610121837e-06, "loss": 0.6039, "step": 1190 }, { "epoch": 0.5623242736644799, "grad_norm": 6.5720062255859375, "learning_rate": 9.648547328959702e-06, "loss": 0.6911, "step": 1200 }, { "epoch": 0.5670103092783505, "grad_norm": 7.112950325012207, "learning_rate": 9.645618556701031e-06, "loss": 0.6214, "step": 1210 }, { "epoch": 0.5716963448922212, "grad_norm": 5.643182277679443, "learning_rate": 9.642689784442362e-06, "loss": 0.6959, "step": 1220 }, { "epoch": 0.5763823805060918, "grad_norm": 5.078190803527832, "learning_rate": 9.639761012183694e-06, "loss": 0.6633, "step": 1230 }, { "epoch": 0.5810684161199625, "grad_norm": 5.247280120849609, "learning_rate": 9.636832239925025e-06, "loss": 0.6415, "step": 1240 }, { "epoch": 0.5857544517338332, "grad_norm": 5.110747814178467, "learning_rate": 9.633903467666354e-06, "loss": 0.6031, "step": 1250 }, { "epoch": 0.5857544517338332, "eval_loss": 0.06345358490943909, "eval_pearson_cosine": 0.7580338914962539, "eval_pearson_dot": 0.6394158052533783, "eval_pearson_euclidean": 0.7521759780114508, "eval_pearson_manhattan": 0.7513571158009427, "eval_runtime": 44.2242, "eval_samples_per_second": 33.918, "eval_spearman_cosine": 0.758882658229917, "eval_spearman_dot": 0.6455380162932587, "eval_spearman_euclidean": 0.7604619351541958, "eval_spearman_manhattan": 0.7599139087493931, "eval_steps_per_second": 33.918, "step": 1250 }, { "epoch": 0.5904404873477038, "grad_norm": 6.717201232910156, "learning_rate": 9.630974695407685e-06, "loss": 0.6553, "step": 1260 }, { "epoch": 0.5951265229615745, "grad_norm": 6.948915004730225, "learning_rate": 9.628045923149017e-06, "loss": 0.6528, "step": 1270 }, { "epoch": 0.5998125585754451, "grad_norm": 5.585124969482422, "learning_rate": 9.625117150890348e-06, "loss": 0.6125, "step": 1280 }, { "epoch": 0.6044985941893158, "grad_norm": 4.020166397094727, "learning_rate": 9.622188378631679e-06, "loss": 0.5857, "step": 1290 }, { "epoch": 0.6091846298031866, "grad_norm": 4.905421257019043, "learning_rate": 9.619259606373008e-06, "loss": 0.6128, "step": 1300 }, { "epoch": 0.6138706654170571, "grad_norm": 5.642446517944336, "learning_rate": 9.61633083411434e-06, "loss": 0.6177, "step": 1310 }, { "epoch": 0.6185567010309279, "grad_norm": 5.623671531677246, "learning_rate": 9.613402061855671e-06, "loss": 0.6076, "step": 1320 }, { "epoch": 0.6232427366447985, "grad_norm": 3.6249349117279053, "learning_rate": 9.610473289597002e-06, "loss": 0.5987, "step": 1330 }, { "epoch": 0.6279287722586692, "grad_norm": 4.7242608070373535, "learning_rate": 9.607544517338333e-06, "loss": 0.6082, "step": 1340 }, { "epoch": 0.6326148078725399, "grad_norm": 9.071741104125977, "learning_rate": 9.604615745079663e-06, "loss": 0.6369, "step": 1350 }, { "epoch": 0.6373008434864105, "grad_norm": 5.471718788146973, "learning_rate": 9.601686972820994e-06, "loss": 0.6235, "step": 1360 }, { "epoch": 0.6419868791002812, "grad_norm": 6.0755934715271, "learning_rate": 9.598758200562325e-06, "loss": 0.6197, "step": 1370 }, { "epoch": 0.6466729147141518, "grad_norm": 5.650800704956055, "learning_rate": 9.595829428303656e-06, "loss": 0.5947, "step": 1380 }, { "epoch": 0.6513589503280225, "grad_norm": 4.409568786621094, "learning_rate": 9.592900656044986e-06, "loss": 0.6632, "step": 1390 }, { "epoch": 0.6560449859418932, "grad_norm": 6.575608730316162, "learning_rate": 9.589971883786317e-06, "loss": 0.5655, "step": 1400 }, { "epoch": 0.6607310215557638, "grad_norm": 4.897518634796143, "learning_rate": 9.587043111527648e-06, "loss": 0.6064, "step": 1410 }, { "epoch": 0.6654170571696345, "grad_norm": 4.505845546722412, "learning_rate": 9.58411433926898e-06, "loss": 0.6217, "step": 1420 }, { "epoch": 0.6701030927835051, "grad_norm": 11.04179573059082, "learning_rate": 9.58118556701031e-06, "loss": 0.626, "step": 1430 }, { "epoch": 0.6747891283973758, "grad_norm": 7.031481742858887, "learning_rate": 9.578256794751642e-06, "loss": 0.6644, "step": 1440 }, { "epoch": 0.6794751640112465, "grad_norm": 5.177082061767578, "learning_rate": 9.575328022492971e-06, "loss": 0.5794, "step": 1450 }, { "epoch": 0.6841611996251171, "grad_norm": 5.830789566040039, "learning_rate": 9.572399250234303e-06, "loss": 0.5962, "step": 1460 }, { "epoch": 0.6888472352389878, "grad_norm": 5.322279453277588, "learning_rate": 9.569470477975634e-06, "loss": 0.5528, "step": 1470 }, { "epoch": 0.6935332708528584, "grad_norm": 5.191045761108398, "learning_rate": 9.566541705716965e-06, "loss": 0.602, "step": 1480 }, { "epoch": 0.6982193064667291, "grad_norm": 4.832320213317871, "learning_rate": 9.563612933458294e-06, "loss": 0.5732, "step": 1490 }, { "epoch": 0.7029053420805998, "grad_norm": 5.9457926750183105, "learning_rate": 9.560684161199626e-06, "loss": 0.6017, "step": 1500 }, { "epoch": 0.7029053420805998, "eval_loss": 0.059113115072250366, "eval_pearson_cosine": 0.7675747794888963, "eval_pearson_dot": 0.6475892776570333, "eval_pearson_euclidean": 0.7594640382486553, "eval_pearson_manhattan": 0.7585029707701096, "eval_runtime": 45.7613, "eval_samples_per_second": 32.779, "eval_spearman_cosine": 0.768339335776319, "eval_spearman_dot": 0.655445685087582, "eval_spearman_euclidean": 0.7680811238488432, "eval_spearman_manhattan": 0.7673055147561156, "eval_steps_per_second": 32.779, "step": 1500 }, { "epoch": 0.7075913776944704, "grad_norm": 4.822035789489746, "learning_rate": 9.557755388940957e-06, "loss": 0.5891, "step": 1510 }, { "epoch": 0.7122774133083412, "grad_norm": 7.0355753898620605, "learning_rate": 9.554826616682288e-06, "loss": 0.6019, "step": 1520 }, { "epoch": 0.7169634489222118, "grad_norm": 7.064100742340088, "learning_rate": 9.55189784442362e-06, "loss": 0.5656, "step": 1530 }, { "epoch": 0.7216494845360825, "grad_norm": 4.629329204559326, "learning_rate": 9.54896907216495e-06, "loss": 0.5839, "step": 1540 }, { "epoch": 0.7263355201499532, "grad_norm": 5.421347141265869, "learning_rate": 9.54604029990628e-06, "loss": 0.5684, "step": 1550 }, { "epoch": 0.7310215557638238, "grad_norm": 4.520521640777588, "learning_rate": 9.543111527647611e-06, "loss": 0.5979, "step": 1560 }, { "epoch": 0.7357075913776945, "grad_norm": 5.172377109527588, "learning_rate": 9.540182755388942e-06, "loss": 0.5678, "step": 1570 }, { "epoch": 0.7403936269915652, "grad_norm": 5.090722560882568, "learning_rate": 9.537253983130272e-06, "loss": 0.556, "step": 1580 }, { "epoch": 0.7450796626054358, "grad_norm": 4.6714887619018555, "learning_rate": 9.534325210871603e-06, "loss": 0.564, "step": 1590 }, { "epoch": 0.7497656982193065, "grad_norm": 4.211735248565674, "learning_rate": 9.531396438612934e-06, "loss": 0.617, "step": 1600 }, { "epoch": 0.7544517338331771, "grad_norm": 4.693171501159668, "learning_rate": 9.528467666354265e-06, "loss": 0.5657, "step": 1610 }, { "epoch": 0.7591377694470478, "grad_norm": 6.890966892242432, "learning_rate": 9.525538894095597e-06, "loss": 0.5838, "step": 1620 }, { "epoch": 0.7638238050609185, "grad_norm": 3.5127806663513184, "learning_rate": 9.522610121836928e-06, "loss": 0.5669, "step": 1630 }, { "epoch": 0.7685098406747891, "grad_norm": 4.389316082000732, "learning_rate": 9.519681349578259e-06, "loss": 0.5669, "step": 1640 }, { "epoch": 0.7731958762886598, "grad_norm": 4.59335470199585, "learning_rate": 9.516752577319588e-06, "loss": 0.604, "step": 1650 }, { "epoch": 0.7778819119025304, "grad_norm": 5.345147132873535, "learning_rate": 9.51382380506092e-06, "loss": 0.6132, "step": 1660 }, { "epoch": 0.7825679475164011, "grad_norm": 5.133398532867432, "learning_rate": 9.510895032802249e-06, "loss": 0.5539, "step": 1670 }, { "epoch": 0.7872539831302718, "grad_norm": 7.907310962677002, "learning_rate": 9.50796626054358e-06, "loss": 0.61, "step": 1680 }, { "epoch": 0.7919400187441424, "grad_norm": 4.504448890686035, "learning_rate": 9.505037488284911e-06, "loss": 0.5851, "step": 1690 }, { "epoch": 0.7966260543580131, "grad_norm": 4.3662028312683105, "learning_rate": 9.502108716026243e-06, "loss": 0.5915, "step": 1700 }, { "epoch": 0.8013120899718837, "grad_norm": 5.221836566925049, "learning_rate": 9.499179943767574e-06, "loss": 0.581, "step": 1710 }, { "epoch": 0.8059981255857545, "grad_norm": 6.357667446136475, "learning_rate": 9.496251171508905e-06, "loss": 0.5937, "step": 1720 }, { "epoch": 0.8106841611996252, "grad_norm": 6.262212753295898, "learning_rate": 9.493322399250236e-06, "loss": 0.606, "step": 1730 }, { "epoch": 0.8153701968134958, "grad_norm": 4.363849639892578, "learning_rate": 9.490393626991566e-06, "loss": 0.5524, "step": 1740 }, { "epoch": 0.8200562324273665, "grad_norm": 5.514476299285889, "learning_rate": 9.487464854732897e-06, "loss": 0.5611, "step": 1750 }, { "epoch": 0.8200562324273665, "eval_loss": 0.05879165977239609, "eval_pearson_cosine": 0.7714099892705395, "eval_pearson_dot": 0.6462212772089089, "eval_pearson_euclidean": 0.7641084348061273, "eval_pearson_manhattan": 0.7629885828620147, "eval_runtime": 43.6421, "eval_samples_per_second": 34.37, "eval_spearman_cosine": 0.7720168259371313, "eval_spearman_dot": 0.6536245076677092, "eval_spearman_euclidean": 0.7726348092699838, "eval_spearman_manhattan": 0.7716062900578692, "eval_steps_per_second": 34.37, "step": 1750 }, { "epoch": 0.8247422680412371, "grad_norm": 6.260695457458496, "learning_rate": 9.484536082474226e-06, "loss": 0.5566, "step": 1760 }, { "epoch": 0.8294283036551078, "grad_norm": 4.187561511993408, "learning_rate": 9.481607310215558e-06, "loss": 0.5077, "step": 1770 }, { "epoch": 0.8341143392689785, "grad_norm": 4.611522197723389, "learning_rate": 9.478678537956889e-06, "loss": 0.5449, "step": 1780 }, { "epoch": 0.8388003748828491, "grad_norm": 12.466484069824219, "learning_rate": 9.47574976569822e-06, "loss": 0.5744, "step": 1790 }, { "epoch": 0.8434864104967198, "grad_norm": 4.683777332305908, "learning_rate": 9.472820993439551e-06, "loss": 0.5102, "step": 1800 }, { "epoch": 0.8481724461105904, "grad_norm": 5.541889190673828, "learning_rate": 9.469892221180882e-06, "loss": 0.5589, "step": 1810 }, { "epoch": 0.8528584817244611, "grad_norm": 8.524742126464844, "learning_rate": 9.466963448922214e-06, "loss": 0.5872, "step": 1820 }, { "epoch": 0.8575445173383318, "grad_norm": 7.117620944976807, "learning_rate": 9.464034676663543e-06, "loss": 0.5484, "step": 1830 }, { "epoch": 0.8622305529522024, "grad_norm": 5.3457841873168945, "learning_rate": 9.461105904404874e-06, "loss": 0.5624, "step": 1840 }, { "epoch": 0.8669165885660731, "grad_norm": 4.375561714172363, "learning_rate": 9.458177132146204e-06, "loss": 0.525, "step": 1850 }, { "epoch": 0.8716026241799437, "grad_norm": 4.6026082038879395, "learning_rate": 9.455248359887535e-06, "loss": 0.5855, "step": 1860 }, { "epoch": 0.8762886597938144, "grad_norm": 5.399001121520996, "learning_rate": 9.452319587628866e-06, "loss": 0.5775, "step": 1870 }, { "epoch": 0.8809746954076851, "grad_norm": 3.9378573894500732, "learning_rate": 9.449390815370197e-06, "loss": 0.5068, "step": 1880 }, { "epoch": 0.8856607310215557, "grad_norm": 5.515146255493164, "learning_rate": 9.446462043111529e-06, "loss": 0.5718, "step": 1890 }, { "epoch": 0.8903467666354264, "grad_norm": 4.8671345710754395, "learning_rate": 9.44353327085286e-06, "loss": 0.5552, "step": 1900 }, { "epoch": 0.895032802249297, "grad_norm": 5.388006210327148, "learning_rate": 9.440604498594191e-06, "loss": 0.5854, "step": 1910 }, { "epoch": 0.8997188378631678, "grad_norm": 6.608395099639893, "learning_rate": 9.43767572633552e-06, "loss": 0.5459, "step": 1920 }, { "epoch": 0.9044048734770385, "grad_norm": 4.6435160636901855, "learning_rate": 9.434746954076852e-06, "loss": 0.529, "step": 1930 }, { "epoch": 0.9090909090909091, "grad_norm": 4.642300605773926, "learning_rate": 9.431818181818183e-06, "loss": 0.5255, "step": 1940 }, { "epoch": 0.9137769447047798, "grad_norm": 5.40919828414917, "learning_rate": 9.428889409559512e-06, "loss": 0.5605, "step": 1950 }, { "epoch": 0.9184629803186504, "grad_norm": 4.9874467849731445, "learning_rate": 9.425960637300844e-06, "loss": 0.5798, "step": 1960 }, { "epoch": 0.9231490159325211, "grad_norm": 4.9304094314575195, "learning_rate": 9.423031865042175e-06, "loss": 0.5576, "step": 1970 }, { "epoch": 0.9278350515463918, "grad_norm": 5.080467224121094, "learning_rate": 9.420103092783506e-06, "loss": 0.5221, "step": 1980 }, { "epoch": 0.9325210871602624, "grad_norm": 5.083141326904297, "learning_rate": 9.417174320524837e-06, "loss": 0.6041, "step": 1990 }, { "epoch": 0.9372071227741331, "grad_norm": 3.8194010257720947, "learning_rate": 9.414245548266168e-06, "loss": 0.5439, "step": 2000 }, { "epoch": 0.9372071227741331, "eval_loss": 0.058015577495098114, "eval_pearson_cosine": 0.7772706274362164, "eval_pearson_dot": 0.6518150260238968, "eval_pearson_euclidean": 0.7681856098914253, "eval_pearson_manhattan": 0.7668726914631314, "eval_runtime": 45.6952, "eval_samples_per_second": 32.826, "eval_spearman_cosine": 0.7781983730395821, "eval_spearman_dot": 0.6578238148510893, "eval_spearman_euclidean": 0.7779674226973379, "eval_spearman_manhattan": 0.7766391726420421, "eval_steps_per_second": 32.826, "step": 2000 }, { "epoch": 0.9418931583880038, "grad_norm": 5.383081912994385, "learning_rate": 9.411316776007498e-06, "loss": 0.5343, "step": 2010 }, { "epoch": 0.9465791940018744, "grad_norm": 5.533719539642334, "learning_rate": 9.408388003748829e-06, "loss": 0.5313, "step": 2020 }, { "epoch": 0.9512652296157451, "grad_norm": 4.267172336578369, "learning_rate": 9.40545923149016e-06, "loss": 0.5172, "step": 2030 }, { "epoch": 0.9559512652296157, "grad_norm": 4.8553009033203125, "learning_rate": 9.402530459231491e-06, "loss": 0.5104, "step": 2040 }, { "epoch": 0.9606373008434864, "grad_norm": 6.460834503173828, "learning_rate": 9.399601686972821e-06, "loss": 0.5225, "step": 2050 }, { "epoch": 0.9653233364573571, "grad_norm": 27.46290397644043, "learning_rate": 9.396672914714152e-06, "loss": 0.544, "step": 2060 }, { "epoch": 0.9700093720712277, "grad_norm": 4.89717435836792, "learning_rate": 9.393744142455483e-06, "loss": 0.5653, "step": 2070 }, { "epoch": 0.9746954076850984, "grad_norm": 4.803583145141602, "learning_rate": 9.390815370196814e-06, "loss": 0.5739, "step": 2080 }, { "epoch": 0.979381443298969, "grad_norm": 4.121029853820801, "learning_rate": 9.387886597938146e-06, "loss": 0.5192, "step": 2090 }, { "epoch": 0.9840674789128397, "grad_norm": 4.464984893798828, "learning_rate": 9.384957825679475e-06, "loss": 0.5393, "step": 2100 }, { "epoch": 0.9887535145267105, "grad_norm": 6.364498615264893, "learning_rate": 9.382029053420806e-06, "loss": 0.5764, "step": 2110 }, { "epoch": 0.993439550140581, "grad_norm": 3.743790864944458, "learning_rate": 9.379100281162138e-06, "loss": 0.5276, "step": 2120 }, { "epoch": 0.9981255857544518, "grad_norm": 4.737389087677002, "learning_rate": 9.376171508903469e-06, "loss": 0.5211, "step": 2130 }, { "epoch": 1.0028116213683225, "grad_norm": 3.622758626937866, "learning_rate": 9.3732427366448e-06, "loss": 0.5329, "step": 2140 }, { "epoch": 1.007497656982193, "grad_norm": 3.5359978675842285, "learning_rate": 9.37031396438613e-06, "loss": 0.4941, "step": 2150 }, { "epoch": 1.0121836925960637, "grad_norm": 4.669582843780518, "learning_rate": 9.36738519212746e-06, "loss": 0.4821, "step": 2160 }, { "epoch": 1.0168697282099344, "grad_norm": 3.767122507095337, "learning_rate": 9.364456419868792e-06, "loss": 0.4886, "step": 2170 }, { "epoch": 1.021555763823805, "grad_norm": 3.9681687355041504, "learning_rate": 9.361527647610123e-06, "loss": 0.493, "step": 2180 }, { "epoch": 1.0262417994376758, "grad_norm": 3.389897108078003, "learning_rate": 9.358598875351454e-06, "loss": 0.4688, "step": 2190 }, { "epoch": 1.0309278350515463, "grad_norm": 3.5152347087860107, "learning_rate": 9.355670103092784e-06, "loss": 0.4625, "step": 2200 }, { "epoch": 1.035613870665417, "grad_norm": 3.23901629447937, "learning_rate": 9.352741330834115e-06, "loss": 0.5143, "step": 2210 }, { "epoch": 1.0402999062792877, "grad_norm": 4.617633819580078, "learning_rate": 9.349812558575446e-06, "loss": 0.4732, "step": 2220 }, { "epoch": 1.0449859418931584, "grad_norm": 5.245469570159912, "learning_rate": 9.346883786316777e-06, "loss": 0.5213, "step": 2230 }, { "epoch": 1.0496719775070291, "grad_norm": 4.20419454574585, "learning_rate": 9.343955014058108e-06, "loss": 0.5042, "step": 2240 }, { "epoch": 1.0543580131208996, "grad_norm": 4.6322102546691895, "learning_rate": 9.341026241799438e-06, "loss": 0.4982, "step": 2250 }, { "epoch": 1.0543580131208996, "eval_loss": 0.05779802054166794, "eval_pearson_cosine": 0.7770314842083366, "eval_pearson_dot": 0.6498110843024136, "eval_pearson_euclidean": 0.7709013065859232, "eval_pearson_manhattan": 0.7695278239114174, "eval_runtime": 48.4856, "eval_samples_per_second": 30.937, "eval_spearman_cosine": 0.7783328375480574, "eval_spearman_dot": 0.6551905692522538, "eval_spearman_euclidean": 0.7802862933680744, "eval_spearman_manhattan": 0.7790525675974715, "eval_steps_per_second": 30.937, "step": 2250 }, { "epoch": 1.0590440487347703, "grad_norm": 4.474431991577148, "learning_rate": 9.33809746954077e-06, "loss": 0.5227, "step": 2260 }, { "epoch": 1.063730084348641, "grad_norm": 4.538947105407715, "learning_rate": 9.3351686972821e-06, "loss": 0.5158, "step": 2270 }, { "epoch": 1.0684161199625117, "grad_norm": 6.6143693923950195, "learning_rate": 9.332239925023432e-06, "loss": 0.461, "step": 2280 }, { "epoch": 1.0731021555763824, "grad_norm": 4.316189765930176, "learning_rate": 9.329311152764761e-06, "loss": 0.5079, "step": 2290 }, { "epoch": 1.077788191190253, "grad_norm": 4.054687976837158, "learning_rate": 9.326382380506092e-06, "loss": 0.5022, "step": 2300 }, { "epoch": 1.0824742268041236, "grad_norm": 4.232051849365234, "learning_rate": 9.323453608247423e-06, "loss": 0.5096, "step": 2310 }, { "epoch": 1.0871602624179943, "grad_norm": 3.7785236835479736, "learning_rate": 9.320524835988755e-06, "loss": 0.4614, "step": 2320 }, { "epoch": 1.091846298031865, "grad_norm": 4.865905284881592, "learning_rate": 9.317596063730086e-06, "loss": 0.5135, "step": 2330 }, { "epoch": 1.0965323336457358, "grad_norm": 4.681485176086426, "learning_rate": 9.314667291471417e-06, "loss": 0.5061, "step": 2340 }, { "epoch": 1.1012183692596063, "grad_norm": 4.256619453430176, "learning_rate": 9.311738519212747e-06, "loss": 0.4627, "step": 2350 }, { "epoch": 1.105904404873477, "grad_norm": 4.459606170654297, "learning_rate": 9.308809746954078e-06, "loss": 0.5171, "step": 2360 }, { "epoch": 1.1105904404873477, "grad_norm": 4.008665084838867, "learning_rate": 9.305880974695409e-06, "loss": 0.4422, "step": 2370 }, { "epoch": 1.1152764761012184, "grad_norm": 3.674177885055542, "learning_rate": 9.302952202436738e-06, "loss": 0.5233, "step": 2380 }, { "epoch": 1.119962511715089, "grad_norm": 4.463940620422363, "learning_rate": 9.30002343017807e-06, "loss": 0.4731, "step": 2390 }, { "epoch": 1.1246485473289598, "grad_norm": 3.9289097785949707, "learning_rate": 9.2970946579194e-06, "loss": 0.4869, "step": 2400 }, { "epoch": 1.1293345829428303, "grad_norm": 4.097565174102783, "learning_rate": 9.294165885660732e-06, "loss": 0.4594, "step": 2410 }, { "epoch": 1.134020618556701, "grad_norm": 4.55318546295166, "learning_rate": 9.291237113402063e-06, "loss": 0.494, "step": 2420 }, { "epoch": 1.1387066541705717, "grad_norm": 4.425617694854736, "learning_rate": 9.288308341143394e-06, "loss": 0.4829, "step": 2430 }, { "epoch": 1.1433926897844424, "grad_norm": 3.908015489578247, "learning_rate": 9.285379568884726e-06, "loss": 0.4793, "step": 2440 }, { "epoch": 1.148078725398313, "grad_norm": 3.7293996810913086, "learning_rate": 9.282450796626055e-06, "loss": 0.5399, "step": 2450 }, { "epoch": 1.1527647610121836, "grad_norm": 4.584681034088135, "learning_rate": 9.279522024367386e-06, "loss": 0.4479, "step": 2460 }, { "epoch": 1.1574507966260543, "grad_norm": 4.109914302825928, "learning_rate": 9.276593252108716e-06, "loss": 0.4599, "step": 2470 }, { "epoch": 1.162136832239925, "grad_norm": 4.446422100067139, "learning_rate": 9.273664479850047e-06, "loss": 0.4727, "step": 2480 }, { "epoch": 1.1668228678537957, "grad_norm": 5.975160598754883, "learning_rate": 9.270735707591378e-06, "loss": 0.4509, "step": 2490 }, { "epoch": 1.1715089034676662, "grad_norm": 4.379275321960449, "learning_rate": 9.26780693533271e-06, "loss": 0.4828, "step": 2500 }, { "epoch": 1.1715089034676662, "eval_loss": 0.05214480683207512, "eval_pearson_cosine": 0.7792755247272061, "eval_pearson_dot": 0.6569300577465214, "eval_pearson_euclidean": 0.7718322585231894, "eval_pearson_manhattan": 0.7703922250718165, "eval_runtime": 47.8089, "eval_samples_per_second": 31.375, "eval_spearman_cosine": 0.7799819701975583, "eval_spearman_dot": 0.662507389274304, "eval_spearman_euclidean": 0.7818437831063969, "eval_spearman_manhattan": 0.7805341558401507, "eval_steps_per_second": 31.375, "step": 2500 }, { "epoch": 1.176194939081537, "grad_norm": 3.5287399291992188, "learning_rate": 9.26487816307404e-06, "loss": 0.4591, "step": 2510 }, { "epoch": 1.1808809746954076, "grad_norm": 3.277655601501465, "learning_rate": 9.261949390815372e-06, "loss": 0.4479, "step": 2520 }, { "epoch": 1.1855670103092784, "grad_norm": 4.732039451599121, "learning_rate": 9.259020618556703e-06, "loss": 0.461, "step": 2530 }, { "epoch": 1.190253045923149, "grad_norm": 4.4760966300964355, "learning_rate": 9.256091846298032e-06, "loss": 0.4652, "step": 2540 }, { "epoch": 1.1949390815370198, "grad_norm": 7.485498428344727, "learning_rate": 9.253163074039364e-06, "loss": 0.4779, "step": 2550 }, { "epoch": 1.1996251171508903, "grad_norm": 3.9956140518188477, "learning_rate": 9.250234301780693e-06, "loss": 0.4567, "step": 2560 }, { "epoch": 1.204311152764761, "grad_norm": 3.547563314437866, "learning_rate": 9.247305529522024e-06, "loss": 0.4988, "step": 2570 }, { "epoch": 1.2089971883786317, "grad_norm": 5.354389667510986, "learning_rate": 9.244376757263355e-06, "loss": 0.464, "step": 2580 }, { "epoch": 1.2136832239925024, "grad_norm": 3.791760206222534, "learning_rate": 9.241447985004687e-06, "loss": 0.4441, "step": 2590 }, { "epoch": 1.218369259606373, "grad_norm": 4.77889347076416, "learning_rate": 9.238519212746018e-06, "loss": 0.4655, "step": 2600 }, { "epoch": 1.2230552952202436, "grad_norm": 5.804917335510254, "learning_rate": 9.235590440487349e-06, "loss": 0.4912, "step": 2610 }, { "epoch": 1.2277413308341143, "grad_norm": 3.841860771179199, "learning_rate": 9.23266166822868e-06, "loss": 0.472, "step": 2620 }, { "epoch": 1.232427366447985, "grad_norm": 4.4197540283203125, "learning_rate": 9.22973289597001e-06, "loss": 0.4821, "step": 2630 }, { "epoch": 1.2371134020618557, "grad_norm": 5.844490051269531, "learning_rate": 9.226804123711341e-06, "loss": 0.5655, "step": 2640 }, { "epoch": 1.2417994376757264, "grad_norm": 3.5442116260528564, "learning_rate": 9.223875351452672e-06, "loss": 0.4532, "step": 2650 }, { "epoch": 1.246485473289597, "grad_norm": 5.259571075439453, "learning_rate": 9.220946579194002e-06, "loss": 0.4856, "step": 2660 }, { "epoch": 1.2511715089034676, "grad_norm": 4.675846576690674, "learning_rate": 9.218017806935333e-06, "loss": 0.4576, "step": 2670 }, { "epoch": 1.2558575445173383, "grad_norm": 5.236482620239258, "learning_rate": 9.215089034676664e-06, "loss": 0.513, "step": 2680 }, { "epoch": 1.260543580131209, "grad_norm": 4.658278465270996, "learning_rate": 9.212160262417995e-06, "loss": 0.4734, "step": 2690 }, { "epoch": 1.2652296157450795, "grad_norm": 3.7085494995117188, "learning_rate": 9.209231490159326e-06, "loss": 0.5279, "step": 2700 }, { "epoch": 1.2699156513589505, "grad_norm": 3.4627673625946045, "learning_rate": 9.206302717900658e-06, "loss": 0.4773, "step": 2710 }, { "epoch": 1.274601686972821, "grad_norm": 4.618409633636475, "learning_rate": 9.203373945641987e-06, "loss": 0.4354, "step": 2720 }, { "epoch": 1.2792877225866917, "grad_norm": 3.1090590953826904, "learning_rate": 9.200445173383318e-06, "loss": 0.4409, "step": 2730 }, { "epoch": 1.2839737582005624, "grad_norm": 4.328725337982178, "learning_rate": 9.19751640112465e-06, "loss": 0.4799, "step": 2740 }, { "epoch": 1.2886597938144329, "grad_norm": 3.8362419605255127, "learning_rate": 9.194587628865979e-06, "loss": 0.5062, "step": 2750 }, { "epoch": 1.2886597938144329, "eval_loss": 0.05263364687561989, "eval_pearson_cosine": 0.7755555336434341, "eval_pearson_dot": 0.6502184577290961, "eval_pearson_euclidean": 0.7709853609297426, "eval_pearson_manhattan": 0.769572635033791, "eval_runtime": 44.8508, "eval_samples_per_second": 33.444, "eval_spearman_cosine": 0.7765036654281985, "eval_spearman_dot": 0.6558936409143281, "eval_spearman_euclidean": 0.7808945633743188, "eval_spearman_manhattan": 0.7795729380744477, "eval_steps_per_second": 33.444, "step": 2750 }, { "epoch": 1.2933458294283038, "grad_norm": 3.6972432136535645, "learning_rate": 9.19165885660731e-06, "loss": 0.488, "step": 2760 }, { "epoch": 1.2980318650421743, "grad_norm": 6.73103141784668, "learning_rate": 9.188730084348641e-06, "loss": 0.4553, "step": 2770 }, { "epoch": 1.302717900656045, "grad_norm": 4.371028423309326, "learning_rate": 9.185801312089973e-06, "loss": 0.4555, "step": 2780 }, { "epoch": 1.3074039362699157, "grad_norm": 3.4788401126861572, "learning_rate": 9.182872539831304e-06, "loss": 0.4561, "step": 2790 }, { "epoch": 1.3120899718837864, "grad_norm": 3.832277774810791, "learning_rate": 9.179943767572635e-06, "loss": 0.4838, "step": 2800 }, { "epoch": 1.316776007497657, "grad_norm": 3.5579423904418945, "learning_rate": 9.177014995313966e-06, "loss": 0.4404, "step": 2810 }, { "epoch": 1.3214620431115276, "grad_norm": 3.7768073081970215, "learning_rate": 9.174086223055296e-06, "loss": 0.4724, "step": 2820 }, { "epoch": 1.3261480787253983, "grad_norm": 3.957035779953003, "learning_rate": 9.171157450796627e-06, "loss": 0.471, "step": 2830 }, { "epoch": 1.330834114339269, "grad_norm": 3.6035895347595215, "learning_rate": 9.168228678537958e-06, "loss": 0.4645, "step": 2840 }, { "epoch": 1.3355201499531397, "grad_norm": 4.358327388763428, "learning_rate": 9.165299906279288e-06, "loss": 0.4301, "step": 2850 }, { "epoch": 1.3402061855670104, "grad_norm": 3.4666709899902344, "learning_rate": 9.162371134020619e-06, "loss": 0.4508, "step": 2860 }, { "epoch": 1.344892221180881, "grad_norm": 3.912290096282959, "learning_rate": 9.15944236176195e-06, "loss": 0.4379, "step": 2870 }, { "epoch": 1.3495782567947516, "grad_norm": 4.305796146392822, "learning_rate": 9.156513589503281e-06, "loss": 0.4194, "step": 2880 }, { "epoch": 1.3542642924086223, "grad_norm": 4.231681823730469, "learning_rate": 9.153584817244612e-06, "loss": 0.4017, "step": 2890 }, { "epoch": 1.358950328022493, "grad_norm": 4.43821382522583, "learning_rate": 9.150656044985944e-06, "loss": 0.4185, "step": 2900 }, { "epoch": 1.3636363636363638, "grad_norm": 4.922164440155029, "learning_rate": 9.147727272727273e-06, "loss": 0.5199, "step": 2910 }, { "epoch": 1.3683223992502342, "grad_norm": 4.577489852905273, "learning_rate": 9.144798500468604e-06, "loss": 0.4237, "step": 2920 }, { "epoch": 1.373008434864105, "grad_norm": 3.9537651538848877, "learning_rate": 9.141869728209935e-06, "loss": 0.4888, "step": 2930 }, { "epoch": 1.3776944704779757, "grad_norm": 4.165870189666748, "learning_rate": 9.138940955951267e-06, "loss": 0.4476, "step": 2940 }, { "epoch": 1.3823805060918464, "grad_norm": 4.492893218994141, "learning_rate": 9.136012183692596e-06, "loss": 0.5159, "step": 2950 }, { "epoch": 1.387066541705717, "grad_norm": 3.847490072250366, "learning_rate": 9.133083411433927e-06, "loss": 0.4497, "step": 2960 }, { "epoch": 1.3917525773195876, "grad_norm": 6.766137599945068, "learning_rate": 9.130154639175258e-06, "loss": 0.4379, "step": 2970 }, { "epoch": 1.3964386129334583, "grad_norm": 3.9198007583618164, "learning_rate": 9.12722586691659e-06, "loss": 0.4519, "step": 2980 }, { "epoch": 1.401124648547329, "grad_norm": 3.67480731010437, "learning_rate": 9.124297094657921e-06, "loss": 0.4108, "step": 2990 }, { "epoch": 1.4058106841611997, "grad_norm": 3.3013832569122314, "learning_rate": 9.12136832239925e-06, "loss": 0.433, "step": 3000 }, { "epoch": 1.4058106841611997, "eval_loss": 0.0497601218521595, "eval_pearson_cosine": 0.7834985989633054, "eval_pearson_dot": 0.6669524421664974, "eval_pearson_euclidean": 0.7743874834934843, "eval_pearson_manhattan": 0.7730376146204847, "eval_runtime": 47.8141, "eval_samples_per_second": 31.371, "eval_spearman_cosine": 0.7845889452017747, "eval_spearman_dot": 0.6729435548765089, "eval_spearman_euclidean": 0.784591658726837, "eval_spearman_manhattan": 0.7832975474858643, "eval_steps_per_second": 31.371, "step": 3000 }, { "epoch": 1.4104967197750704, "grad_norm": 4.2792487144470215, "learning_rate": 9.118439550140582e-06, "loss": 0.4878, "step": 3010 }, { "epoch": 1.415182755388941, "grad_norm": 3.8892383575439453, "learning_rate": 9.115510777881913e-06, "loss": 0.4676, "step": 3020 }, { "epoch": 1.4198687910028116, "grad_norm": 5.0008745193481445, "learning_rate": 9.112582005623244e-06, "loss": 0.4729, "step": 3030 }, { "epoch": 1.4245548266166823, "grad_norm": 5.607409477233887, "learning_rate": 9.109653233364575e-06, "loss": 0.4762, "step": 3040 }, { "epoch": 1.429240862230553, "grad_norm": 3.0340139865875244, "learning_rate": 9.106724461105905e-06, "loss": 0.4438, "step": 3050 }, { "epoch": 1.4339268978444237, "grad_norm": 4.310724258422852, "learning_rate": 9.103795688847236e-06, "loss": 0.4499, "step": 3060 }, { "epoch": 1.4386129334582942, "grad_norm": 4.481917381286621, "learning_rate": 9.100866916588567e-06, "loss": 0.4493, "step": 3070 }, { "epoch": 1.443298969072165, "grad_norm": 4.330621719360352, "learning_rate": 9.097938144329898e-06, "loss": 0.4505, "step": 3080 }, { "epoch": 1.4479850046860356, "grad_norm": 4.335081577301025, "learning_rate": 9.095009372071228e-06, "loss": 0.446, "step": 3090 }, { "epoch": 1.4526710402999063, "grad_norm": 3.0894672870635986, "learning_rate": 9.092080599812559e-06, "loss": 0.4404, "step": 3100 }, { "epoch": 1.457357075913777, "grad_norm": 4.6363983154296875, "learning_rate": 9.08915182755389e-06, "loss": 0.5358, "step": 3110 }, { "epoch": 1.4620431115276475, "grad_norm": 3.80387806892395, "learning_rate": 9.086223055295221e-06, "loss": 0.4374, "step": 3120 }, { "epoch": 1.4667291471415183, "grad_norm": 3.276442289352417, "learning_rate": 9.083294283036552e-06, "loss": 0.5013, "step": 3130 }, { "epoch": 1.471415182755389, "grad_norm": 3.843419075012207, "learning_rate": 9.080365510777884e-06, "loss": 0.4694, "step": 3140 }, { "epoch": 1.4761012183692597, "grad_norm": 4.7606730461120605, "learning_rate": 9.077436738519213e-06, "loss": 0.4215, "step": 3150 }, { "epoch": 1.4807872539831304, "grad_norm": 3.739225149154663, "learning_rate": 9.074507966260544e-06, "loss": 0.4756, "step": 3160 }, { "epoch": 1.4854732895970009, "grad_norm": 3.36938214302063, "learning_rate": 9.071579194001876e-06, "loss": 0.4243, "step": 3170 }, { "epoch": 1.4901593252108716, "grad_norm": 6.589993476867676, "learning_rate": 9.068650421743205e-06, "loss": 0.4698, "step": 3180 }, { "epoch": 1.4948453608247423, "grad_norm": 3.8416695594787598, "learning_rate": 9.065721649484536e-06, "loss": 0.4964, "step": 3190 }, { "epoch": 1.499531396438613, "grad_norm": 4.367741584777832, "learning_rate": 9.062792877225867e-06, "loss": 0.4417, "step": 3200 }, { "epoch": 1.5042174320524837, "grad_norm": 3.500617742538452, "learning_rate": 9.059864104967199e-06, "loss": 0.4522, "step": 3210 }, { "epoch": 1.5089034676663542, "grad_norm": 3.5349769592285156, "learning_rate": 9.05693533270853e-06, "loss": 0.4393, "step": 3220 }, { "epoch": 1.513589503280225, "grad_norm": 3.8469526767730713, "learning_rate": 9.054006560449861e-06, "loss": 0.4453, "step": 3230 }, { "epoch": 1.5182755388940956, "grad_norm": 3.209933280944824, "learning_rate": 9.051077788191192e-06, "loss": 0.4599, "step": 3240 }, { "epoch": 1.522961574507966, "grad_norm": 3.7976036071777344, "learning_rate": 9.048149015932522e-06, "loss": 0.4373, "step": 3250 }, { "epoch": 1.522961574507966, "eval_loss": 0.049798864871263504, "eval_pearson_cosine": 0.7866421286010308, "eval_pearson_dot": 0.6641640853451243, "eval_pearson_euclidean": 0.7777378719378305, "eval_pearson_manhattan": 0.7764827785285746, "eval_runtime": 43.7509, "eval_samples_per_second": 34.285, "eval_spearman_cosine": 0.7870351053050699, "eval_spearman_dot": 0.6708598238937284, "eval_spearman_euclidean": 0.7874683707378692, "eval_spearman_manhattan": 0.7865203522698128, "eval_steps_per_second": 34.285, "step": 3250 }, { "epoch": 1.527647610121837, "grad_norm": 4.851262092590332, "learning_rate": 9.045220243673853e-06, "loss": 0.491, "step": 3260 }, { "epoch": 1.5323336457357075, "grad_norm": 4.183891773223877, "learning_rate": 9.042291471415184e-06, "loss": 0.453, "step": 3270 }, { "epoch": 1.5370196813495782, "grad_norm": 4.280774116516113, "learning_rate": 9.039362699156514e-06, "loss": 0.4413, "step": 3280 }, { "epoch": 1.541705716963449, "grad_norm": 4.118307113647461, "learning_rate": 9.036433926897845e-06, "loss": 0.4661, "step": 3290 }, { "epoch": 1.5463917525773194, "grad_norm": 5.99712610244751, "learning_rate": 9.033505154639176e-06, "loss": 0.5205, "step": 3300 }, { "epoch": 1.5510777881911904, "grad_norm": 4.146691799163818, "learning_rate": 9.030576382380507e-06, "loss": 0.428, "step": 3310 }, { "epoch": 1.5557638238050608, "grad_norm": 3.899887800216675, "learning_rate": 9.027647610121838e-06, "loss": 0.4564, "step": 3320 }, { "epoch": 1.5604498594189316, "grad_norm": 3.9663302898406982, "learning_rate": 9.02471883786317e-06, "loss": 0.4539, "step": 3330 }, { "epoch": 1.5651358950328023, "grad_norm": 3.526458263397217, "learning_rate": 9.021790065604499e-06, "loss": 0.4844, "step": 3340 }, { "epoch": 1.569821930646673, "grad_norm": 4.192911624908447, "learning_rate": 9.01886129334583e-06, "loss": 0.4278, "step": 3350 }, { "epoch": 1.5745079662605437, "grad_norm": 4.185749530792236, "learning_rate": 9.015932521087161e-06, "loss": 0.4632, "step": 3360 }, { "epoch": 1.5791940018744142, "grad_norm": 3.411773204803467, "learning_rate": 9.013003748828491e-06, "loss": 0.436, "step": 3370 }, { "epoch": 1.5838800374882849, "grad_norm": 4.467881679534912, "learning_rate": 9.010074976569822e-06, "loss": 0.4133, "step": 3380 }, { "epoch": 1.5885660731021556, "grad_norm": 3.77736496925354, "learning_rate": 9.007146204311153e-06, "loss": 0.4452, "step": 3390 }, { "epoch": 1.5932521087160263, "grad_norm": 4.084095478057861, "learning_rate": 9.004217432052485e-06, "loss": 0.4605, "step": 3400 }, { "epoch": 1.597938144329897, "grad_norm": 3.3393008708953857, "learning_rate": 9.001288659793816e-06, "loss": 0.4157, "step": 3410 }, { "epoch": 1.6026241799437675, "grad_norm": 3.096881151199341, "learning_rate": 8.998359887535147e-06, "loss": 0.4478, "step": 3420 }, { "epoch": 1.6073102155576382, "grad_norm": 3.0557243824005127, "learning_rate": 8.995431115276478e-06, "loss": 0.4452, "step": 3430 }, { "epoch": 1.611996251171509, "grad_norm": 3.7997219562530518, "learning_rate": 8.992502343017808e-06, "loss": 0.4287, "step": 3440 }, { "epoch": 1.6166822867853796, "grad_norm": 3.6995465755462646, "learning_rate": 8.989573570759139e-06, "loss": 0.4423, "step": 3450 }, { "epoch": 1.6213683223992503, "grad_norm": 4.1384053230285645, "learning_rate": 8.986644798500468e-06, "loss": 0.4563, "step": 3460 }, { "epoch": 1.6260543580131208, "grad_norm": 4.637014865875244, "learning_rate": 8.9837160262418e-06, "loss": 0.4538, "step": 3470 }, { "epoch": 1.6307403936269915, "grad_norm": 4.30952262878418, "learning_rate": 8.98078725398313e-06, "loss": 0.3993, "step": 3480 }, { "epoch": 1.6354264292408622, "grad_norm": 4.746737003326416, "learning_rate": 8.977858481724462e-06, "loss": 0.4274, "step": 3490 }, { "epoch": 1.640112464854733, "grad_norm": 3.8592286109924316, "learning_rate": 8.974929709465793e-06, "loss": 0.4066, "step": 3500 }, { "epoch": 1.640112464854733, "eval_loss": 0.050406068563461304, "eval_pearson_cosine": 0.7840015528942317, "eval_pearson_dot": 0.659932129633507, "eval_pearson_euclidean": 0.7769297052026758, "eval_pearson_manhattan": 0.7754185185705609, "eval_runtime": 44.0859, "eval_samples_per_second": 34.024, "eval_spearman_cosine": 0.7845451302239834, "eval_spearman_dot": 0.6667296644451466, "eval_spearman_euclidean": 0.7868327314956118, "eval_spearman_manhattan": 0.7856021398727839, "eval_steps_per_second": 34.024, "step": 3500 }, { "epoch": 1.6447985004686037, "grad_norm": 5.983098030090332, "learning_rate": 8.972000937207124e-06, "loss": 0.4451, "step": 3510 }, { "epoch": 1.6494845360824741, "grad_norm": 4.052550315856934, "learning_rate": 8.969072164948455e-06, "loss": 0.4331, "step": 3520 }, { "epoch": 1.6541705716963448, "grad_norm": 3.7970380783081055, "learning_rate": 8.966143392689785e-06, "loss": 0.4427, "step": 3530 }, { "epoch": 1.6588566073102156, "grad_norm": 4.695807456970215, "learning_rate": 8.963214620431116e-06, "loss": 0.4522, "step": 3540 }, { "epoch": 1.6635426429240863, "grad_norm": 4.41202974319458, "learning_rate": 8.960285848172446e-06, "loss": 0.4275, "step": 3550 }, { "epoch": 1.668228678537957, "grad_norm": 5.364877223968506, "learning_rate": 8.957357075913777e-06, "loss": 0.4321, "step": 3560 }, { "epoch": 1.6729147141518275, "grad_norm": 3.801132917404175, "learning_rate": 8.954428303655108e-06, "loss": 0.4494, "step": 3570 }, { "epoch": 1.6776007497656982, "grad_norm": 4.197866439819336, "learning_rate": 8.95149953139644e-06, "loss": 0.4126, "step": 3580 }, { "epoch": 1.6822867853795689, "grad_norm": 5.34595251083374, "learning_rate": 8.94857075913777e-06, "loss": 0.4757, "step": 3590 }, { "epoch": 1.6869728209934396, "grad_norm": 4.772789478302002, "learning_rate": 8.945641986879102e-06, "loss": 0.4037, "step": 3600 }, { "epoch": 1.6916588566073103, "grad_norm": 4.81839656829834, "learning_rate": 8.942713214620433e-06, "loss": 0.4192, "step": 3610 }, { "epoch": 1.6963448922211808, "grad_norm": 3.470919132232666, "learning_rate": 8.939784442361762e-06, "loss": 0.4106, "step": 3620 }, { "epoch": 1.7010309278350515, "grad_norm": 3.2051522731781006, "learning_rate": 8.936855670103094e-06, "loss": 0.4162, "step": 3630 }, { "epoch": 1.7057169634489222, "grad_norm": 3.8122334480285645, "learning_rate": 8.933926897844423e-06, "loss": 0.4054, "step": 3640 }, { "epoch": 1.710402999062793, "grad_norm": 5.07956075668335, "learning_rate": 8.930998125585754e-06, "loss": 0.4164, "step": 3650 }, { "epoch": 1.7150890346766636, "grad_norm": 3.754542112350464, "learning_rate": 8.928069353327085e-06, "loss": 0.3703, "step": 3660 }, { "epoch": 1.7197750702905341, "grad_norm": 3.4620890617370605, "learning_rate": 8.925140581068417e-06, "loss": 0.4667, "step": 3670 }, { "epoch": 1.7244611059044048, "grad_norm": 4.179393768310547, "learning_rate": 8.922211808809748e-06, "loss": 0.4384, "step": 3680 }, { "epoch": 1.7291471415182755, "grad_norm": 3.0865719318389893, "learning_rate": 8.919283036551079e-06, "loss": 0.4248, "step": 3690 }, { "epoch": 1.7338331771321462, "grad_norm": 3.9282147884368896, "learning_rate": 8.91635426429241e-06, "loss": 0.4231, "step": 3700 }, { "epoch": 1.738519212746017, "grad_norm": 3.9746885299682617, "learning_rate": 8.91342549203374e-06, "loss": 0.4152, "step": 3710 }, { "epoch": 1.7432052483598874, "grad_norm": 3.8340625762939453, "learning_rate": 8.910496719775071e-06, "loss": 0.4458, "step": 3720 }, { "epoch": 1.7478912839737581, "grad_norm": 4.861859321594238, "learning_rate": 8.907567947516402e-06, "loss": 0.4274, "step": 3730 }, { "epoch": 1.7525773195876289, "grad_norm": 3.3457283973693848, "learning_rate": 8.904639175257732e-06, "loss": 0.4534, "step": 3740 }, { "epoch": 1.7572633552014996, "grad_norm": 4.057953834533691, "learning_rate": 8.901710402999063e-06, "loss": 0.484, "step": 3750 }, { "epoch": 1.7572633552014996, "eval_loss": 0.05240313336253166, "eval_pearson_cosine": 0.7879299521989642, "eval_pearson_dot": 0.6605985065084816, "eval_pearson_euclidean": 0.7797438530556207, "eval_pearson_manhattan": 0.778216782480726, "eval_runtime": 44.9916, "eval_samples_per_second": 33.34, "eval_spearman_cosine": 0.7888982276270184, "eval_spearman_dot": 0.6669965792210436, "eval_spearman_euclidean": 0.7899037728263932, "eval_spearman_manhattan": 0.7886320032383264, "eval_steps_per_second": 33.34, "step": 3750 }, { "epoch": 1.7619493908153703, "grad_norm": 3.281102418899536, "learning_rate": 8.898781630740394e-06, "loss": 0.4074, "step": 3760 }, { "epoch": 1.7666354264292408, "grad_norm": 4.710203170776367, "learning_rate": 8.895852858481725e-06, "loss": 0.4537, "step": 3770 }, { "epoch": 1.7713214620431117, "grad_norm": 4.636346817016602, "learning_rate": 8.892924086223056e-06, "loss": 0.4348, "step": 3780 }, { "epoch": 1.7760074976569822, "grad_norm": 4.518571376800537, "learning_rate": 8.889995313964388e-06, "loss": 0.4515, "step": 3790 }, { "epoch": 1.780693533270853, "grad_norm": 4.0576066970825195, "learning_rate": 8.887066541705717e-06, "loss": 0.4276, "step": 3800 }, { "epoch": 1.7853795688847236, "grad_norm": 5.657445430755615, "learning_rate": 8.884137769447048e-06, "loss": 0.4277, "step": 3810 }, { "epoch": 1.790065604498594, "grad_norm": 5.393405437469482, "learning_rate": 8.88120899718838e-06, "loss": 0.428, "step": 3820 }, { "epoch": 1.794751640112465, "grad_norm": 4.101112365722656, "learning_rate": 8.87828022492971e-06, "loss": 0.4489, "step": 3830 }, { "epoch": 1.7994376757263355, "grad_norm": 3.531888246536255, "learning_rate": 8.87535145267104e-06, "loss": 0.3673, "step": 3840 }, { "epoch": 1.8041237113402062, "grad_norm": 3.4490315914154053, "learning_rate": 8.872422680412371e-06, "loss": 0.4059, "step": 3850 }, { "epoch": 1.808809746954077, "grad_norm": 3.034252643585205, "learning_rate": 8.869493908153702e-06, "loss": 0.3832, "step": 3860 }, { "epoch": 1.8134957825679474, "grad_norm": 4.064283847808838, "learning_rate": 8.866565135895034e-06, "loss": 0.4704, "step": 3870 }, { "epoch": 1.8181818181818183, "grad_norm": 3.2689194679260254, "learning_rate": 8.863636363636365e-06, "loss": 0.4428, "step": 3880 }, { "epoch": 1.8228678537956888, "grad_norm": 3.173530101776123, "learning_rate": 8.860707591377694e-06, "loss": 0.4283, "step": 3890 }, { "epoch": 1.8275538894095595, "grad_norm": 3.638122081756592, "learning_rate": 8.857778819119026e-06, "loss": 0.4225, "step": 3900 }, { "epoch": 1.8322399250234302, "grad_norm": 3.636679172515869, "learning_rate": 8.854850046860357e-06, "loss": 0.4154, "step": 3910 }, { "epoch": 1.8369259606373007, "grad_norm": 3.810847520828247, "learning_rate": 8.851921274601688e-06, "loss": 0.3931, "step": 3920 }, { "epoch": 1.8416119962511717, "grad_norm": 3.7469394207000732, "learning_rate": 8.848992502343019e-06, "loss": 0.4472, "step": 3930 }, { "epoch": 1.8462980318650422, "grad_norm": 4.962492942810059, "learning_rate": 8.846063730084349e-06, "loss": 0.4324, "step": 3940 }, { "epoch": 1.8509840674789129, "grad_norm": 3.4641172885894775, "learning_rate": 8.84313495782568e-06, "loss": 0.4234, "step": 3950 }, { "epoch": 1.8556701030927836, "grad_norm": 3.8601555824279785, "learning_rate": 8.840206185567011e-06, "loss": 0.4045, "step": 3960 }, { "epoch": 1.860356138706654, "grad_norm": 6.290759086608887, "learning_rate": 8.837277413308342e-06, "loss": 0.4655, "step": 3970 }, { "epoch": 1.865042174320525, "grad_norm": 3.5882256031036377, "learning_rate": 8.834348641049673e-06, "loss": 0.4298, "step": 3980 }, { "epoch": 1.8697282099343955, "grad_norm": 3.133535623550415, "learning_rate": 8.831419868791003e-06, "loss": 0.4508, "step": 3990 }, { "epoch": 1.8744142455482662, "grad_norm": 3.220383644104004, "learning_rate": 8.828491096532334e-06, "loss": 0.4348, "step": 4000 }, { "epoch": 1.8744142455482662, "eval_loss": 0.04981923848390579, "eval_pearson_cosine": 0.790612878761543, "eval_pearson_dot": 0.6612786229229286, "eval_pearson_euclidean": 0.7799249806775554, "eval_pearson_manhattan": 0.7784476870813819, "eval_runtime": 45.9371, "eval_samples_per_second": 32.653, "eval_spearman_cosine": 0.7908100570922554, "eval_spearman_dot": 0.6689224987064551, "eval_spearman_euclidean": 0.7902520878335856, "eval_spearman_manhattan": 0.7892503488739743, "eval_steps_per_second": 32.653, "step": 4000 } ], "logging_steps": 10, "max_steps": 4268, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }