{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9995971532160601, "eval_steps": 500, "global_step": 1861, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005371290452531221, "grad_norm": 495.1745300292969, "learning_rate": 1.7857142857142858e-10, "loss": 40.2102, "step": 10 }, { "epoch": 0.010742580905062441, "grad_norm": 477.615966796875, "learning_rate": 3.5714285714285715e-10, "loss": 40.7707, "step": 20 }, { "epoch": 0.01611387135759366, "grad_norm": 492.8292236328125, "learning_rate": 5.357142857142857e-10, "loss": 40.2476, "step": 30 }, { "epoch": 0.021485161810124883, "grad_norm": 529.812255859375, "learning_rate": 7.142857142857143e-10, "loss": 40.2061, "step": 40 }, { "epoch": 0.026856452262656105, "grad_norm": 534.6681518554688, "learning_rate": 8.92857142857143e-10, "loss": 40.3632, "step": 50 }, { "epoch": 0.03222774271518732, "grad_norm": 493.5475769042969, "learning_rate": 9.999878827638992e-10, "loss": 40.3255, "step": 60 }, { "epoch": 0.037599033167718544, "grad_norm": 514.5845336914062, "learning_rate": 9.998515706025587e-10, "loss": 40.1727, "step": 70 }, { "epoch": 0.042970323620249766, "grad_norm": 520.8786010742188, "learning_rate": 9.99563841164693e-10, "loss": 40.3397, "step": 80 }, { "epoch": 0.04834161407278099, "grad_norm": 506.258056640625, "learning_rate": 9.991247816105924e-10, "loss": 40.5221, "step": 90 }, { "epoch": 0.05371290452531221, "grad_norm": 518.2510375976562, "learning_rate": 9.985345249421433e-10, "loss": 40.3247, "step": 100 }, { "epoch": 0.059084194977843424, "grad_norm": 509.5160827636719, "learning_rate": 9.977932499625396e-10, "loss": 40.4914, "step": 110 }, { "epoch": 0.06445548543037465, "grad_norm": 521.5568237304688, "learning_rate": 9.969011812221178e-10, "loss": 40.4141, "step": 120 }, { "epoch": 0.06982677588290587, "grad_norm": 429.8058776855469, "learning_rate": 9.958585889503364e-10, "loss": 40.7874, "step": 130 }, { "epoch": 0.07519806633543709, "grad_norm": 523.8095092773438, "learning_rate": 9.946657889739163e-10, "loss": 40.9356, "step": 140 }, { "epoch": 0.08056935678796831, "grad_norm": 492.9051818847656, "learning_rate": 9.933231426211678e-10, "loss": 40.3382, "step": 150 }, { "epoch": 0.08594064724049953, "grad_norm": 455.4582824707031, "learning_rate": 9.918310566125387e-10, "loss": 39.8185, "step": 160 }, { "epoch": 0.09131193769303075, "grad_norm": 479.072265625, "learning_rate": 9.901899829374048e-10, "loss": 39.3643, "step": 170 }, { "epoch": 0.09668322814556198, "grad_norm": 472.71112060546875, "learning_rate": 9.884004187171542e-10, "loss": 39.7452, "step": 180 }, { "epoch": 0.1020545185980932, "grad_norm": 508.54547119140625, "learning_rate": 9.864629060545955e-10, "loss": 41.1781, "step": 190 }, { "epoch": 0.10742580905062442, "grad_norm": 468.7608642578125, "learning_rate": 9.843780318697425e-10, "loss": 40.0115, "step": 200 }, { "epoch": 0.11279709950315564, "grad_norm": 512.1522216796875, "learning_rate": 9.821464277220207e-10, "loss": 40.1177, "step": 210 }, { "epoch": 0.11816838995568685, "grad_norm": 483.4913635253906, "learning_rate": 9.79768769618954e-10, "loss": 40.2036, "step": 220 }, { "epoch": 0.12353968040821807, "grad_norm": 509.5931091308594, "learning_rate": 9.77245777811384e-10, "loss": 39.6307, "step": 230 }, { "epoch": 0.1289109708607493, "grad_norm": 483.1646728515625, "learning_rate": 9.745782165752891e-10, "loss": 40.7153, "step": 240 }, { "epoch": 0.13428226131328053, "grad_norm": 436.9029235839844, "learning_rate": 9.717668939802664e-10, "loss": 40.1682, "step": 250 }, { "epoch": 0.13965355176581173, "grad_norm": 467.0509338378906, "learning_rate": 9.68812661644748e-10, "loss": 40.397, "step": 260 }, { "epoch": 0.14502484221834297, "grad_norm": 496.4578857421875, "learning_rate": 9.657164144780247e-10, "loss": 40.5665, "step": 270 }, { "epoch": 0.15039613267087418, "grad_norm": 542.7703857421875, "learning_rate": 9.624790904091554e-10, "loss": 40.8245, "step": 280 }, { "epoch": 0.15576742312340539, "grad_norm": 467.4183349609375, "learning_rate": 9.59101670102847e-10, "loss": 40.5495, "step": 290 }, { "epoch": 0.16113871357593662, "grad_norm": 514.7463989257812, "learning_rate": 9.555851766623854e-10, "loss": 40.3572, "step": 300 }, { "epoch": 0.16651000402846783, "grad_norm": 479.075439453125, "learning_rate": 9.519306753197134e-10, "loss": 40.9737, "step": 310 }, { "epoch": 0.17188129448099906, "grad_norm": 480.1302185058594, "learning_rate": 9.481392731127458e-10, "loss": 40.1158, "step": 320 }, { "epoch": 0.17725258493353027, "grad_norm": 459.5805969238281, "learning_rate": 9.442121185500201e-10, "loss": 39.8788, "step": 330 }, { "epoch": 0.1826238753860615, "grad_norm": 460.0196533203125, "learning_rate": 9.401504012627857e-10, "loss": 39.5867, "step": 340 }, { "epoch": 0.18799516583859271, "grad_norm": 477.8091735839844, "learning_rate": 9.35955351644635e-10, "loss": 41.1957, "step": 350 }, { "epoch": 0.19336645629112395, "grad_norm": 475.1239929199219, "learning_rate": 9.31628240478787e-10, "loss": 40.1875, "step": 360 }, { "epoch": 0.19873774674365516, "grad_norm": 522.9048461914062, "learning_rate": 9.27170378553137e-10, "loss": 40.7773, "step": 370 }, { "epoch": 0.2041090371961864, "grad_norm": 489.0563049316406, "learning_rate": 9.225831162631853e-10, "loss": 40.9744, "step": 380 }, { "epoch": 0.2094803276487176, "grad_norm": 426.8688659667969, "learning_rate": 9.178678432029706e-10, "loss": 39.4605, "step": 390 }, { "epoch": 0.21485161810124884, "grad_norm": 479.16900634765625, "learning_rate": 9.130259877441272e-10, "loss": 39.4938, "step": 400 }, { "epoch": 0.22022290855378004, "grad_norm": 523.247802734375, "learning_rate": 9.080590166031966e-10, "loss": 40.7306, "step": 410 }, { "epoch": 0.22559419900631128, "grad_norm": 505.7490234375, "learning_rate": 9.02968434397323e-10, "loss": 40.1471, "step": 420 }, { "epoch": 0.2309654894588425, "grad_norm": 474.9671630859375, "learning_rate": 8.977557831884684e-10, "loss": 41.0007, "step": 430 }, { "epoch": 0.2363367799113737, "grad_norm": 490.84332275390625, "learning_rate": 8.924226420162834e-10, "loss": 39.3389, "step": 440 }, { "epoch": 0.24170807036390493, "grad_norm": 542.4966430664062, "learning_rate": 8.869706264197784e-10, "loss": 40.3484, "step": 450 }, { "epoch": 0.24707936081643614, "grad_norm": 480.568603515625, "learning_rate": 8.814013879479366e-10, "loss": 40.1192, "step": 460 }, { "epoch": 0.2524506512689674, "grad_norm": 496.9430236816406, "learning_rate": 8.757166136594194e-10, "loss": 40.247, "step": 470 }, { "epoch": 0.2578219417214986, "grad_norm": 519.3447265625, "learning_rate": 8.699180256115157e-10, "loss": 39.9668, "step": 480 }, { "epoch": 0.2631932321740298, "grad_norm": 462.6995849609375, "learning_rate": 8.640073803384881e-10, "loss": 40.2213, "step": 490 }, { "epoch": 0.26856452262656105, "grad_norm": 498.30548095703125, "learning_rate": 8.579864683194752e-10, "loss": 39.849, "step": 500 }, { "epoch": 0.27393581307909226, "grad_norm": 491.5065002441406, "learning_rate": 8.518571134361129e-10, "loss": 39.8567, "step": 510 }, { "epoch": 0.27930710353162347, "grad_norm": 401.1820068359375, "learning_rate": 8.456211724200347e-10, "loss": 40.0964, "step": 520 }, { "epoch": 0.2846783939841547, "grad_norm": 472.61627197265625, "learning_rate": 8.392805342904231e-10, "loss": 39.9992, "step": 530 }, { "epoch": 0.29004968443668594, "grad_norm": 504.82861328125, "learning_rate": 8.328371197817788e-10, "loss": 40.4024, "step": 540 }, { "epoch": 0.29542097488921715, "grad_norm": 460.82733154296875, "learning_rate": 8.262928807620843e-10, "loss": 41.2341, "step": 550 }, { "epoch": 0.30079226534174835, "grad_norm": 515.23583984375, "learning_rate": 8.196497996415337e-10, "loss": 40.4191, "step": 560 }, { "epoch": 0.30616355579427956, "grad_norm": 534.2943725585938, "learning_rate": 8.129098887720137e-10, "loss": 40.0219, "step": 570 }, { "epoch": 0.31153484624681077, "grad_norm": 506.2889099121094, "learning_rate": 8.060751898375115e-10, "loss": 40.2062, "step": 580 }, { "epoch": 0.31690613669934203, "grad_norm": 451.0182800292969, "learning_rate": 7.991477732356403e-10, "loss": 40.1886, "step": 590 }, { "epoch": 0.32227742715187324, "grad_norm": 497.3751525878906, "learning_rate": 7.921297374504637e-10, "loss": 40.7882, "step": 600 }, { "epoch": 0.32764871760440445, "grad_norm": 548.7998657226562, "learning_rate": 7.850232084168145e-10, "loss": 40.9427, "step": 610 }, { "epoch": 0.33302000805693566, "grad_norm": 486.71063232421875, "learning_rate": 7.778303388762966e-10, "loss": 39.4863, "step": 620 }, { "epoch": 0.3383912985094669, "grad_norm": 448.2780456542969, "learning_rate": 7.705533077251672e-10, "loss": 39.9087, "step": 630 }, { "epoch": 0.34376258896199813, "grad_norm": 526.2222900390625, "learning_rate": 7.63194319354295e-10, "loss": 39.7048, "step": 640 }, { "epoch": 0.34913387941452934, "grad_norm": 492.9909973144531, "learning_rate": 7.557556029813974e-10, "loss": 39.5465, "step": 650 }, { "epoch": 0.35450516986706054, "grad_norm": 483.2941589355469, "learning_rate": 7.482394119757546e-10, "loss": 40.6158, "step": 660 }, { "epoch": 0.3598764603195918, "grad_norm": 475.9729309082031, "learning_rate": 7.406480231756098e-10, "loss": 39.8862, "step": 670 }, { "epoch": 0.365247750772123, "grad_norm": 477.7049255371094, "learning_rate": 7.329837361984598e-10, "loss": 40.462, "step": 680 }, { "epoch": 0.3706190412246542, "grad_norm": 448.4286804199219, "learning_rate": 7.252488727444418e-10, "loss": 40.037, "step": 690 }, { "epoch": 0.37599033167718543, "grad_norm": 480.7619934082031, "learning_rate": 7.174457758930374e-10, "loss": 41.1926, "step": 700 }, { "epoch": 0.38136162212971664, "grad_norm": 452.7475280761719, "learning_rate": 7.095768093932932e-10, "loss": 39.8431, "step": 710 }, { "epoch": 0.3867329125822479, "grad_norm": 419.9246826171875, "learning_rate": 7.016443569477854e-10, "loss": 39.7369, "step": 720 }, { "epoch": 0.3921042030347791, "grad_norm": 516.64306640625, "learning_rate": 6.936508214905369e-10, "loss": 39.727, "step": 730 }, { "epoch": 0.3974754934873103, "grad_norm": 532.8106079101562, "learning_rate": 6.855986244591104e-10, "loss": 39.0725, "step": 740 }, { "epoch": 0.4028467839398415, "grad_norm": 510.8319396972656, "learning_rate": 6.774902050610951e-10, "loss": 40.6862, "step": 750 }, { "epoch": 0.4082180743923728, "grad_norm": 525.424560546875, "learning_rate": 6.693280195352114e-10, "loss": 40.5439, "step": 760 }, { "epoch": 0.413589364844904, "grad_norm": 476.6329040527344, "learning_rate": 6.61114540407256e-10, "loss": 40.4504, "step": 770 }, { "epoch": 0.4189606552974352, "grad_norm": 495.2875061035156, "learning_rate": 6.528522557411133e-10, "loss": 40.9673, "step": 780 }, { "epoch": 0.4243319457499664, "grad_norm": 468.4483642578125, "learning_rate": 6.445436683850597e-10, "loss": 40.2403, "step": 790 }, { "epoch": 0.4297032362024977, "grad_norm": 516.677490234375, "learning_rate": 6.361912952135903e-10, "loss": 40.4345, "step": 800 }, { "epoch": 0.4350745266550289, "grad_norm": 509.36138916015625, "learning_rate": 6.277976663649947e-10, "loss": 39.9229, "step": 810 }, { "epoch": 0.4404458171075601, "grad_norm": 502.6529541015625, "learning_rate": 6.193653244749179e-10, "loss": 40.928, "step": 820 }, { "epoch": 0.4458171075600913, "grad_norm": 477.757568359375, "learning_rate": 6.108968239061324e-10, "loss": 40.2371, "step": 830 }, { "epoch": 0.45118839801262256, "grad_norm": 483.0531005859375, "learning_rate": 6.023947299747592e-10, "loss": 40.7409, "step": 840 }, { "epoch": 0.45655968846515377, "grad_norm": 495.1935119628906, "learning_rate": 5.93861618173172e-10, "loss": 40.123, "step": 850 }, { "epoch": 0.461930978917685, "grad_norm": 420.5578918457031, "learning_rate": 5.853000733898161e-10, "loss": 39.6038, "step": 860 }, { "epoch": 0.4673022693702162, "grad_norm": 512.252197265625, "learning_rate": 5.767126891261828e-10, "loss": 40.0436, "step": 870 }, { "epoch": 0.4726735598227474, "grad_norm": 499.7673645019531, "learning_rate": 5.681020667111754e-10, "loss": 39.6081, "step": 880 }, { "epoch": 0.47804485027527865, "grad_norm": 454.9427185058594, "learning_rate": 5.594708145131012e-10, "loss": 39.5993, "step": 890 }, { "epoch": 0.48341614072780986, "grad_norm": 437.3612060546875, "learning_rate": 5.508215471495337e-10, "loss": 39.8825, "step": 900 }, { "epoch": 0.48878743118034107, "grad_norm": 520.9217529296875, "learning_rate": 5.421568846952822e-10, "loss": 41.5034, "step": 910 }, { "epoch": 0.4941587216328723, "grad_norm": 488.6954650878906, "learning_rate": 5.334794518887044e-10, "loss": 39.5379, "step": 920 }, { "epoch": 0.49953001208540354, "grad_norm": 519.3870849609375, "learning_rate": 5.247918773366112e-10, "loss": 39.54, "step": 930 }, { "epoch": 0.5049013025379347, "grad_norm": 497.21746826171875, "learning_rate": 5.160967927179963e-10, "loss": 40.0503, "step": 940 }, { "epoch": 0.510272592990466, "grad_norm": 476.4524841308594, "learning_rate": 5.073968319868361e-10, "loss": 39.7168, "step": 950 }, { "epoch": 0.5156438834429972, "grad_norm": 484.02777099609375, "learning_rate": 4.986946305742012e-10, "loss": 39.6419, "step": 960 }, { "epoch": 0.5210151738955284, "grad_norm": 451.2009582519531, "learning_rate": 4.899928245899194e-10, "loss": 40.7173, "step": 970 }, { "epoch": 0.5263864643480596, "grad_norm": 499.43408203125, "learning_rate": 4.812940500240333e-10, "loss": 40.2658, "step": 980 }, { "epoch": 0.5317577548005908, "grad_norm": 449.65496826171875, "learning_rate": 4.72600941948295e-10, "loss": 40.1894, "step": 990 }, { "epoch": 0.5371290452531221, "grad_norm": 492.1304016113281, "learning_rate": 4.6391613371793786e-10, "loss": 40.3246, "step": 1000 }, { "epoch": 0.5425003357056533, "grad_norm": 484.3329772949219, "learning_rate": 4.5524225617396904e-10, "loss": 40.9067, "step": 1010 }, { "epoch": 0.5478716261581845, "grad_norm": 447.1062927246094, "learning_rate": 4.4658193684622293e-10, "loss": 40.208, "step": 1020 }, { "epoch": 0.5532429166107157, "grad_norm": 466.41522216796875, "learning_rate": 4.3793779915741885e-10, "loss": 40.008, "step": 1030 }, { "epoch": 0.5586142070632469, "grad_norm": 472.39013671875, "learning_rate": 4.293124616284608e-10, "loss": 40.3332, "step": 1040 }, { "epoch": 0.5639854975157782, "grad_norm": 463.3416748046875, "learning_rate": 4.2070853708522495e-10, "loss": 40.8243, "step": 1050 }, { "epoch": 0.5693567879683094, "grad_norm": 516.2078247070312, "learning_rate": 4.1212863186706943e-10, "loss": 40.5519, "step": 1060 }, { "epoch": 0.5747280784208406, "grad_norm": 487.50628662109375, "learning_rate": 4.035753450373111e-10, "loss": 40.4969, "step": 1070 }, { "epoch": 0.5800993688733719, "grad_norm": 520.0319213867188, "learning_rate": 3.950512675959052e-10, "loss": 39.9747, "step": 1080 }, { "epoch": 0.585470659325903, "grad_norm": 543.7863159179688, "learning_rate": 3.865589816945685e-10, "loss": 40.0276, "step": 1090 }, { "epoch": 0.5908419497784343, "grad_norm": 486.9960021972656, "learning_rate": 3.7810105985458137e-10, "loss": 40.1272, "step": 1100 }, { "epoch": 0.5962132402309654, "grad_norm": 502.4769287109375, "learning_rate": 3.6968006418751e-10, "loss": 40.3276, "step": 1110 }, { "epoch": 0.6015845306834967, "grad_norm": 472.21533203125, "learning_rate": 3.6129854561907786e-10, "loss": 40.4212, "step": 1120 }, { "epoch": 0.606955821136028, "grad_norm": 434.5205078125, "learning_rate": 3.5295904311642897e-10, "loss": 39.5327, "step": 1130 }, { "epoch": 0.6123271115885591, "grad_norm": 511.1942138671875, "learning_rate": 3.446640829190133e-10, "loss": 40.5099, "step": 1140 }, { "epoch": 0.6176984020410904, "grad_norm": 479.92901611328125, "learning_rate": 3.3641617777332523e-10, "loss": 39.1485, "step": 1150 }, { "epoch": 0.6230696924936215, "grad_norm": 512.5575561523438, "learning_rate": 3.2821782617173294e-10, "loss": 41.33, "step": 1160 }, { "epoch": 0.6284409829461528, "grad_norm": 517.29833984375, "learning_rate": 3.2007151159562237e-10, "loss": 39.8799, "step": 1170 }, { "epoch": 0.6338122733986841, "grad_norm": 452.1294860839844, "learning_rate": 3.119797017630914e-10, "loss": 40.0134, "step": 1180 }, { "epoch": 0.6391835638512152, "grad_norm": 499.8146057128906, "learning_rate": 3.0394484788141616e-10, "loss": 40.4734, "step": 1190 }, { "epoch": 0.6445548543037465, "grad_norm": 551.718994140625, "learning_rate": 2.9596938390452166e-10, "loss": 39.968, "step": 1200 }, { "epoch": 0.6499261447562777, "grad_norm": 476.2742614746094, "learning_rate": 2.880557257956763e-10, "loss": 40.1534, "step": 1210 }, { "epoch": 0.6552974352088089, "grad_norm": 493.28167724609375, "learning_rate": 2.8020627079563876e-10, "loss": 39.8795, "step": 1220 }, { "epoch": 0.6606687256613402, "grad_norm": 462.6866149902344, "learning_rate": 2.7242339669647403e-10, "loss": 40.3111, "step": 1230 }, { "epoch": 0.6660400161138713, "grad_norm": 509.9576416015625, "learning_rate": 2.647094611212626e-10, "loss": 39.3712, "step": 1240 }, { "epoch": 0.6714113065664026, "grad_norm": 483.03619384765625, "learning_rate": 2.570668008099183e-10, "loss": 39.4756, "step": 1250 }, { "epoch": 0.6767825970189338, "grad_norm": 505.42071533203125, "learning_rate": 2.494977309113331e-10, "loss": 40.5326, "step": 1260 }, { "epoch": 0.682153887471465, "grad_norm": 461.374755859375, "learning_rate": 2.42004544282061e-10, "loss": 39.9911, "step": 1270 }, { "epoch": 0.6875251779239963, "grad_norm": 432.3858947753906, "learning_rate": 2.3458951079175717e-10, "loss": 40.3153, "step": 1280 }, { "epoch": 0.6928964683765274, "grad_norm": 515.9682006835938, "learning_rate": 2.2725487663557688e-10, "loss": 40.6573, "step": 1290 }, { "epoch": 0.6982677588290587, "grad_norm": 476.286865234375, "learning_rate": 2.2000286365374955e-10, "loss": 39.9867, "step": 1300 }, { "epoch": 0.7036390492815899, "grad_norm": 472.92083740234375, "learning_rate": 2.1283566865852822e-10, "loss": 40.5379, "step": 1310 }, { "epoch": 0.7090103397341211, "grad_norm": 552.19287109375, "learning_rate": 2.0575546276872166e-10, "loss": 41.3682, "step": 1320 }, { "epoch": 0.7143816301866523, "grad_norm": 462.6091003417969, "learning_rate": 1.9876439075200893e-10, "loss": 41.0671, "step": 1330 }, { "epoch": 0.7197529206391836, "grad_norm": 522.0980224609375, "learning_rate": 1.9186457037523765e-10, "loss": 40.3256, "step": 1340 }, { "epoch": 0.7251242110917148, "grad_norm": 495.66510009765625, "learning_rate": 1.8505809176289958e-10, "loss": 40.3366, "step": 1350 }, { "epoch": 0.730495501544246, "grad_norm": 536.3059692382812, "learning_rate": 1.7834701676398057e-10, "loss": 40.3298, "step": 1360 }, { "epoch": 0.7358667919967772, "grad_norm": 527.6504516601562, "learning_rate": 1.7173337832737773e-10, "loss": 39.7742, "step": 1370 }, { "epoch": 0.7412380824493084, "grad_norm": 508.9981689453125, "learning_rate": 1.6521917988606762e-10, "loss": 40.0357, "step": 1380 }, { "epoch": 0.7466093729018397, "grad_norm": 546.9842529296875, "learning_rate": 1.588063947502181e-10, "loss": 39.8671, "step": 1390 }, { "epoch": 0.7519806633543709, "grad_norm": 514.7166748046875, "learning_rate": 1.524969655094242e-10, "loss": 40.3517, "step": 1400 }, { "epoch": 0.7573519538069021, "grad_norm": 458.2544250488281, "learning_rate": 1.4629280344425106e-10, "loss": 39.5965, "step": 1410 }, { "epoch": 0.7627232442594333, "grad_norm": 509.53546142578125, "learning_rate": 1.401957879472583e-10, "loss": 40.175, "step": 1420 }, { "epoch": 0.7680945347119645, "grad_norm": 522.7682495117188, "learning_rate": 1.3420776595368834e-10, "loss": 39.9108, "step": 1430 }, { "epoch": 0.7734658251644958, "grad_norm": 528.9177856445312, "learning_rate": 1.283305513819827e-10, "loss": 39.4946, "step": 1440 }, { "epoch": 0.778837115617027, "grad_norm": 460.80340576171875, "learning_rate": 1.225659245843026e-10, "loss": 39.8654, "step": 1450 }, { "epoch": 0.7842084060695582, "grad_norm": 555.3992309570312, "learning_rate": 1.169156318072163e-10, "loss": 41.5166, "step": 1460 }, { "epoch": 0.7895796965220895, "grad_norm": 549.1808471679688, "learning_rate": 1.1138138466271913e-10, "loss": 39.4821, "step": 1470 }, { "epoch": 0.7949509869746206, "grad_norm": 529.6665649414062, "learning_rate": 1.0596485960974251e-10, "loss": 40.0072, "step": 1480 }, { "epoch": 0.8003222774271519, "grad_norm": 526.4564819335938, "learning_rate": 1.0066769744631571e-10, "loss": 39.7705, "step": 1490 }, { "epoch": 0.805693567879683, "grad_norm": 483.0169372558594, "learning_rate": 9.549150281252633e-11, "loss": 39.6957, "step": 1500 }, { "epoch": 0.8110648583322143, "grad_norm": 508.51190185546875, "learning_rate": 9.043784370443615e-11, "loss": 39.9077, "step": 1510 }, { "epoch": 0.8164361487847456, "grad_norm": 519.5391845703125, "learning_rate": 8.550825099909671e-11, "loss": 38.918, "step": 1520 }, { "epoch": 0.8218074392372767, "grad_norm": 528.9270629882812, "learning_rate": 8.070421799080951e-11, "loss": 40.3429, "step": 1530 }, { "epoch": 0.827178729689808, "grad_norm": 435.5723876953125, "learning_rate": 7.602719993876945e-11, "loss": 39.9604, "step": 1540 }, { "epoch": 0.8325500201423391, "grad_norm": 500.90625, "learning_rate": 7.147861362623287e-11, "loss": 40.3838, "step": 1550 }, { "epoch": 0.8379213105948704, "grad_norm": 495.9076232910156, "learning_rate": 6.705983693133794e-11, "loss": 40.1013, "step": 1560 }, { "epoch": 0.8432926010474017, "grad_norm": 527.7721557617188, "learning_rate": 6.277220840971198e-11, "loss": 40.5773, "step": 1570 }, { "epoch": 0.8486638914999328, "grad_norm": 552.565185546875, "learning_rate": 5.861702688899046e-11, "loss": 39.9742, "step": 1580 }, { "epoch": 0.8540351819524641, "grad_norm": 478.8362731933594, "learning_rate": 5.459555107537001e-11, "loss": 40.7994, "step": 1590 }, { "epoch": 0.8594064724049953, "grad_norm": 476.84759521484375, "learning_rate": 5.0708999172315696e-11, "loss": 40.5355, "step": 1600 }, { "epoch": 0.8647777628575265, "grad_norm": 463.87127685546875, "learning_rate": 4.695854851153714e-11, "loss": 40.8749, "step": 1610 }, { "epoch": 0.8701490533100578, "grad_norm": 455.86065673828125, "learning_rate": 4.334533519634643e-11, "loss": 40.2378, "step": 1620 }, { "epoch": 0.8755203437625889, "grad_norm": 479.13995361328125, "learning_rate": 3.9870453757503865e-11, "loss": 40.0686, "step": 1630 }, { "epoch": 0.8808916342151202, "grad_norm": 534.9734497070312, "learning_rate": 3.653495682165842e-11, "loss": 40.7489, "step": 1640 }, { "epoch": 0.8862629246676514, "grad_norm": 463.552490234375, "learning_rate": 3.333985479248103e-11, "loss": 40.446, "step": 1650 }, { "epoch": 0.8916342151201826, "grad_norm": 531.924072265625, "learning_rate": 3.0286115544588767e-11, "loss": 39.3065, "step": 1660 }, { "epoch": 0.8970055055727139, "grad_norm": 483.08502197265625, "learning_rate": 2.737466413035178e-11, "loss": 40.1395, "step": 1670 }, { "epoch": 0.9023767960252451, "grad_norm": 525.9647827148438, "learning_rate": 2.460638249967251e-11, "loss": 40.0024, "step": 1680 }, { "epoch": 0.9077480864777763, "grad_norm": 535.5175170898438, "learning_rate": 2.198210923282118e-11, "loss": 39.3654, "step": 1690 }, { "epoch": 0.9131193769303075, "grad_norm": 443.3262634277344, "learning_rate": 1.9502639286409496e-11, "loss": 40.2637, "step": 1700 }, { "epoch": 0.9184906673828387, "grad_norm": 461.6935729980469, "learning_rate": 1.7168723752578776e-11, "loss": 40.2201, "step": 1710 }, { "epoch": 0.92386195783537, "grad_norm": 450.4540100097656, "learning_rate": 1.498106963147583e-11, "loss": 40.5813, "step": 1720 }, { "epoch": 0.9292332482879012, "grad_norm": 414.77166748046875, "learning_rate": 1.294033961708513e-11, "loss": 39.9295, "step": 1730 }, { "epoch": 0.9346045387404324, "grad_norm": 539.1185302734375, "learning_rate": 1.1047151896482754e-11, "loss": 41.3669, "step": 1740 }, { "epoch": 0.9399758291929636, "grad_norm": 455.1410217285156, "learning_rate": 9.302079962572375e-12, "loss": 41.2396, "step": 1750 }, { "epoch": 0.9453471196454948, "grad_norm": 484.1607971191406, "learning_rate": 7.705652440360033e-12, "loss": 39.3638, "step": 1760 }, { "epoch": 0.950718410098026, "grad_norm": 456.8821105957031, "learning_rate": 6.258352926821032e-12, "loss": 39.8145, "step": 1770 }, { "epoch": 0.9560897005505573, "grad_norm": 504.9481506347656, "learning_rate": 4.960619844406156e-12, "loss": 40.1504, "step": 1780 }, { "epoch": 0.9614609910030885, "grad_norm": 500.57025146484375, "learning_rate": 3.812846308233031e-12, "loss": 40.1062, "step": 1790 }, { "epoch": 0.9668322814556197, "grad_norm": 494.5524597167969, "learning_rate": 2.8153800070020444e-12, "loss": 39.8419, "step": 1800 }, { "epoch": 0.972203571908151, "grad_norm": 500.6388244628906, "learning_rate": 1.9685230976726477e-12, "loss": 40.3947, "step": 1810 }, { "epoch": 0.9775748623606821, "grad_norm": 467.13702392578125, "learning_rate": 1.2725321139326896e-12, "loss": 40.5521, "step": 1820 }, { "epoch": 0.9829461528132134, "grad_norm": 460.44854736328125, "learning_rate": 7.276178884882412e-13, "loss": 40.5297, "step": 1830 }, { "epoch": 0.9883174432657446, "grad_norm": 521.1849365234375, "learning_rate": 3.3394548919707394e-13, "loss": 40.9712, "step": 1840 }, { "epoch": 0.9936887337182758, "grad_norm": 527.3604125976562, "learning_rate": 9.163416906554645e-14, "loss": 40.1704, "step": 1850 }, { "epoch": 0.9990600241708071, "grad_norm": 505.638427734375, "learning_rate": 7.573301240570985e-16, "loss": 41.4237, "step": 1860 }, { "epoch": 0.9995971532160601, "step": 1861, "total_flos": 0.0, "train_loss": 40.22760858182174, "train_runtime": 17617.7876, "train_samples_per_second": 3.381, "train_steps_per_second": 0.106 } ], "logging_steps": 10, "max_steps": 1861, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }