{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9995971532160601,
  "eval_steps": 500,
  "global_step": 1861,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005371290452531221,
      "grad_norm": 495.1745300292969,
      "learning_rate": 1.7857142857142858e-10,
      "loss": 40.2102,
      "step": 10
    },
    {
      "epoch": 0.010742580905062441,
      "grad_norm": 477.615966796875,
      "learning_rate": 3.5714285714285715e-10,
      "loss": 40.7707,
      "step": 20
    },
    {
      "epoch": 0.01611387135759366,
      "grad_norm": 492.8292236328125,
      "learning_rate": 5.357142857142857e-10,
      "loss": 40.2476,
      "step": 30
    },
    {
      "epoch": 0.021485161810124883,
      "grad_norm": 529.812255859375,
      "learning_rate": 7.142857142857143e-10,
      "loss": 40.2061,
      "step": 40
    },
    {
      "epoch": 0.026856452262656105,
      "grad_norm": 534.6681518554688,
      "learning_rate": 8.92857142857143e-10,
      "loss": 40.3632,
      "step": 50
    },
    {
      "epoch": 0.03222774271518732,
      "grad_norm": 493.5475769042969,
      "learning_rate": 9.999878827638992e-10,
      "loss": 40.3255,
      "step": 60
    },
    {
      "epoch": 0.037599033167718544,
      "grad_norm": 514.5845336914062,
      "learning_rate": 9.998515706025587e-10,
      "loss": 40.1727,
      "step": 70
    },
    {
      "epoch": 0.042970323620249766,
      "grad_norm": 520.8786010742188,
      "learning_rate": 9.99563841164693e-10,
      "loss": 40.3397,
      "step": 80
    },
    {
      "epoch": 0.04834161407278099,
      "grad_norm": 506.258056640625,
      "learning_rate": 9.991247816105924e-10,
      "loss": 40.5221,
      "step": 90
    },
    {
      "epoch": 0.05371290452531221,
      "grad_norm": 518.2510375976562,
      "learning_rate": 9.985345249421433e-10,
      "loss": 40.3247,
      "step": 100
    },
    {
      "epoch": 0.059084194977843424,
      "grad_norm": 509.5160827636719,
      "learning_rate": 9.977932499625396e-10,
      "loss": 40.4914,
      "step": 110
    },
    {
      "epoch": 0.06445548543037465,
      "grad_norm": 521.5568237304688,
      "learning_rate": 9.969011812221178e-10,
      "loss": 40.4141,
      "step": 120
    },
    {
      "epoch": 0.06982677588290587,
      "grad_norm": 429.8058776855469,
      "learning_rate": 9.958585889503364e-10,
      "loss": 40.7874,
      "step": 130
    },
    {
      "epoch": 0.07519806633543709,
      "grad_norm": 523.8095092773438,
      "learning_rate": 9.946657889739163e-10,
      "loss": 40.9356,
      "step": 140
    },
    {
      "epoch": 0.08056935678796831,
      "grad_norm": 492.9051818847656,
      "learning_rate": 9.933231426211678e-10,
      "loss": 40.3382,
      "step": 150
    },
    {
      "epoch": 0.08594064724049953,
      "grad_norm": 455.4582824707031,
      "learning_rate": 9.918310566125387e-10,
      "loss": 39.8185,
      "step": 160
    },
    {
      "epoch": 0.09131193769303075,
      "grad_norm": 479.072265625,
      "learning_rate": 9.901899829374048e-10,
      "loss": 39.3643,
      "step": 170
    },
    {
      "epoch": 0.09668322814556198,
      "grad_norm": 472.71112060546875,
      "learning_rate": 9.884004187171542e-10,
      "loss": 39.7452,
      "step": 180
    },
    {
      "epoch": 0.1020545185980932,
      "grad_norm": 508.54547119140625,
      "learning_rate": 9.864629060545955e-10,
      "loss": 41.1781,
      "step": 190
    },
    {
      "epoch": 0.10742580905062442,
      "grad_norm": 468.7608642578125,
      "learning_rate": 9.843780318697425e-10,
      "loss": 40.0115,
      "step": 200
    },
    {
      "epoch": 0.11279709950315564,
      "grad_norm": 512.1522216796875,
      "learning_rate": 9.821464277220207e-10,
      "loss": 40.1177,
      "step": 210
    },
    {
      "epoch": 0.11816838995568685,
      "grad_norm": 483.4913635253906,
      "learning_rate": 9.79768769618954e-10,
      "loss": 40.2036,
      "step": 220
    },
    {
      "epoch": 0.12353968040821807,
      "grad_norm": 509.5931091308594,
      "learning_rate": 9.77245777811384e-10,
      "loss": 39.6307,
      "step": 230
    },
    {
      "epoch": 0.1289109708607493,
      "grad_norm": 483.1646728515625,
      "learning_rate": 9.745782165752891e-10,
      "loss": 40.7153,
      "step": 240
    },
    {
      "epoch": 0.13428226131328053,
      "grad_norm": 436.9029235839844,
      "learning_rate": 9.717668939802664e-10,
      "loss": 40.1682,
      "step": 250
    },
    {
      "epoch": 0.13965355176581173,
      "grad_norm": 467.0509338378906,
      "learning_rate": 9.68812661644748e-10,
      "loss": 40.397,
      "step": 260
    },
    {
      "epoch": 0.14502484221834297,
      "grad_norm": 496.4578857421875,
      "learning_rate": 9.657164144780247e-10,
      "loss": 40.5665,
      "step": 270
    },
    {
      "epoch": 0.15039613267087418,
      "grad_norm": 542.7703857421875,
      "learning_rate": 9.624790904091554e-10,
      "loss": 40.8245,
      "step": 280
    },
    {
      "epoch": 0.15576742312340539,
      "grad_norm": 467.4183349609375,
      "learning_rate": 9.59101670102847e-10,
      "loss": 40.5495,
      "step": 290
    },
    {
      "epoch": 0.16113871357593662,
      "grad_norm": 514.7463989257812,
      "learning_rate": 9.555851766623854e-10,
      "loss": 40.3572,
      "step": 300
    },
    {
      "epoch": 0.16651000402846783,
      "grad_norm": 479.075439453125,
      "learning_rate": 9.519306753197134e-10,
      "loss": 40.9737,
      "step": 310
    },
    {
      "epoch": 0.17188129448099906,
      "grad_norm": 480.1302185058594,
      "learning_rate": 9.481392731127458e-10,
      "loss": 40.1158,
      "step": 320
    },
    {
      "epoch": 0.17725258493353027,
      "grad_norm": 459.5805969238281,
      "learning_rate": 9.442121185500201e-10,
      "loss": 39.8788,
      "step": 330
    },
    {
      "epoch": 0.1826238753860615,
      "grad_norm": 460.0196533203125,
      "learning_rate": 9.401504012627857e-10,
      "loss": 39.5867,
      "step": 340
    },
    {
      "epoch": 0.18799516583859271,
      "grad_norm": 477.8091735839844,
      "learning_rate": 9.35955351644635e-10,
      "loss": 41.1957,
      "step": 350
    },
    {
      "epoch": 0.19336645629112395,
      "grad_norm": 475.1239929199219,
      "learning_rate": 9.31628240478787e-10,
      "loss": 40.1875,
      "step": 360
    },
    {
      "epoch": 0.19873774674365516,
      "grad_norm": 522.9048461914062,
      "learning_rate": 9.27170378553137e-10,
      "loss": 40.7773,
      "step": 370
    },
    {
      "epoch": 0.2041090371961864,
      "grad_norm": 489.0563049316406,
      "learning_rate": 9.225831162631853e-10,
      "loss": 40.9744,
      "step": 380
    },
    {
      "epoch": 0.2094803276487176,
      "grad_norm": 426.8688659667969,
      "learning_rate": 9.178678432029706e-10,
      "loss": 39.4605,
      "step": 390
    },
    {
      "epoch": 0.21485161810124884,
      "grad_norm": 479.16900634765625,
      "learning_rate": 9.130259877441272e-10,
      "loss": 39.4938,
      "step": 400
    },
    {
      "epoch": 0.22022290855378004,
      "grad_norm": 523.247802734375,
      "learning_rate": 9.080590166031966e-10,
      "loss": 40.7306,
      "step": 410
    },
    {
      "epoch": 0.22559419900631128,
      "grad_norm": 505.7490234375,
      "learning_rate": 9.02968434397323e-10,
      "loss": 40.1471,
      "step": 420
    },
    {
      "epoch": 0.2309654894588425,
      "grad_norm": 474.9671630859375,
      "learning_rate": 8.977557831884684e-10,
      "loss": 41.0007,
      "step": 430
    },
    {
      "epoch": 0.2363367799113737,
      "grad_norm": 490.84332275390625,
      "learning_rate": 8.924226420162834e-10,
      "loss": 39.3389,
      "step": 440
    },
    {
      "epoch": 0.24170807036390493,
      "grad_norm": 542.4966430664062,
      "learning_rate": 8.869706264197784e-10,
      "loss": 40.3484,
      "step": 450
    },
    {
      "epoch": 0.24707936081643614,
      "grad_norm": 480.568603515625,
      "learning_rate": 8.814013879479366e-10,
      "loss": 40.1192,
      "step": 460
    },
    {
      "epoch": 0.2524506512689674,
      "grad_norm": 496.9430236816406,
      "learning_rate": 8.757166136594194e-10,
      "loss": 40.247,
      "step": 470
    },
    {
      "epoch": 0.2578219417214986,
      "grad_norm": 519.3447265625,
      "learning_rate": 8.699180256115157e-10,
      "loss": 39.9668,
      "step": 480
    },
    {
      "epoch": 0.2631932321740298,
      "grad_norm": 462.6995849609375,
      "learning_rate": 8.640073803384881e-10,
      "loss": 40.2213,
      "step": 490
    },
    {
      "epoch": 0.26856452262656105,
      "grad_norm": 498.30548095703125,
      "learning_rate": 8.579864683194752e-10,
      "loss": 39.849,
      "step": 500
    },
    {
      "epoch": 0.27393581307909226,
      "grad_norm": 491.5065002441406,
      "learning_rate": 8.518571134361129e-10,
      "loss": 39.8567,
      "step": 510
    },
    {
      "epoch": 0.27930710353162347,
      "grad_norm": 401.1820068359375,
      "learning_rate": 8.456211724200347e-10,
      "loss": 40.0964,
      "step": 520
    },
    {
      "epoch": 0.2846783939841547,
      "grad_norm": 472.61627197265625,
      "learning_rate": 8.392805342904231e-10,
      "loss": 39.9992,
      "step": 530
    },
    {
      "epoch": 0.29004968443668594,
      "grad_norm": 504.82861328125,
      "learning_rate": 8.328371197817788e-10,
      "loss": 40.4024,
      "step": 540
    },
    {
      "epoch": 0.29542097488921715,
      "grad_norm": 460.82733154296875,
      "learning_rate": 8.262928807620843e-10,
      "loss": 41.2341,
      "step": 550
    },
    {
      "epoch": 0.30079226534174835,
      "grad_norm": 515.23583984375,
      "learning_rate": 8.196497996415337e-10,
      "loss": 40.4191,
      "step": 560
    },
    {
      "epoch": 0.30616355579427956,
      "grad_norm": 534.2943725585938,
      "learning_rate": 8.129098887720137e-10,
      "loss": 40.0219,
      "step": 570
    },
    {
      "epoch": 0.31153484624681077,
      "grad_norm": 506.2889099121094,
      "learning_rate": 8.060751898375115e-10,
      "loss": 40.2062,
      "step": 580
    },
    {
      "epoch": 0.31690613669934203,
      "grad_norm": 451.0182800292969,
      "learning_rate": 7.991477732356403e-10,
      "loss": 40.1886,
      "step": 590
    },
    {
      "epoch": 0.32227742715187324,
      "grad_norm": 497.3751525878906,
      "learning_rate": 7.921297374504637e-10,
      "loss": 40.7882,
      "step": 600
    },
    {
      "epoch": 0.32764871760440445,
      "grad_norm": 548.7998657226562,
      "learning_rate": 7.850232084168145e-10,
      "loss": 40.9427,
      "step": 610
    },
    {
      "epoch": 0.33302000805693566,
      "grad_norm": 486.71063232421875,
      "learning_rate": 7.778303388762966e-10,
      "loss": 39.4863,
      "step": 620
    },
    {
      "epoch": 0.3383912985094669,
      "grad_norm": 448.2780456542969,
      "learning_rate": 7.705533077251672e-10,
      "loss": 39.9087,
      "step": 630
    },
    {
      "epoch": 0.34376258896199813,
      "grad_norm": 526.2222900390625,
      "learning_rate": 7.63194319354295e-10,
      "loss": 39.7048,
      "step": 640
    },
    {
      "epoch": 0.34913387941452934,
      "grad_norm": 492.9909973144531,
      "learning_rate": 7.557556029813974e-10,
      "loss": 39.5465,
      "step": 650
    },
    {
      "epoch": 0.35450516986706054,
      "grad_norm": 483.2941589355469,
      "learning_rate": 7.482394119757546e-10,
      "loss": 40.6158,
      "step": 660
    },
    {
      "epoch": 0.3598764603195918,
      "grad_norm": 475.9729309082031,
      "learning_rate": 7.406480231756098e-10,
      "loss": 39.8862,
      "step": 670
    },
    {
      "epoch": 0.365247750772123,
      "grad_norm": 477.7049255371094,
      "learning_rate": 7.329837361984598e-10,
      "loss": 40.462,
      "step": 680
    },
    {
      "epoch": 0.3706190412246542,
      "grad_norm": 448.4286804199219,
      "learning_rate": 7.252488727444418e-10,
      "loss": 40.037,
      "step": 690
    },
    {
      "epoch": 0.37599033167718543,
      "grad_norm": 480.7619934082031,
      "learning_rate": 7.174457758930374e-10,
      "loss": 41.1926,
      "step": 700
    },
    {
      "epoch": 0.38136162212971664,
      "grad_norm": 452.7475280761719,
      "learning_rate": 7.095768093932932e-10,
      "loss": 39.8431,
      "step": 710
    },
    {
      "epoch": 0.3867329125822479,
      "grad_norm": 419.9246826171875,
      "learning_rate": 7.016443569477854e-10,
      "loss": 39.7369,
      "step": 720
    },
    {
      "epoch": 0.3921042030347791,
      "grad_norm": 516.64306640625,
      "learning_rate": 6.936508214905369e-10,
      "loss": 39.727,
      "step": 730
    },
    {
      "epoch": 0.3974754934873103,
      "grad_norm": 532.8106079101562,
      "learning_rate": 6.855986244591104e-10,
      "loss": 39.0725,
      "step": 740
    },
    {
      "epoch": 0.4028467839398415,
      "grad_norm": 510.8319396972656,
      "learning_rate": 6.774902050610951e-10,
      "loss": 40.6862,
      "step": 750
    },
    {
      "epoch": 0.4082180743923728,
      "grad_norm": 525.424560546875,
      "learning_rate": 6.693280195352114e-10,
      "loss": 40.5439,
      "step": 760
    },
    {
      "epoch": 0.413589364844904,
      "grad_norm": 476.6329040527344,
      "learning_rate": 6.61114540407256e-10,
      "loss": 40.4504,
      "step": 770
    },
    {
      "epoch": 0.4189606552974352,
      "grad_norm": 495.2875061035156,
      "learning_rate": 6.528522557411133e-10,
      "loss": 40.9673,
      "step": 780
    },
    {
      "epoch": 0.4243319457499664,
      "grad_norm": 468.4483642578125,
      "learning_rate": 6.445436683850597e-10,
      "loss": 40.2403,
      "step": 790
    },
    {
      "epoch": 0.4297032362024977,
      "grad_norm": 516.677490234375,
      "learning_rate": 6.361912952135903e-10,
      "loss": 40.4345,
      "step": 800
    },
    {
      "epoch": 0.4350745266550289,
      "grad_norm": 509.36138916015625,
      "learning_rate": 6.277976663649947e-10,
      "loss": 39.9229,
      "step": 810
    },
    {
      "epoch": 0.4404458171075601,
      "grad_norm": 502.6529541015625,
      "learning_rate": 6.193653244749179e-10,
      "loss": 40.928,
      "step": 820
    },
    {
      "epoch": 0.4458171075600913,
      "grad_norm": 477.757568359375,
      "learning_rate": 6.108968239061324e-10,
      "loss": 40.2371,
      "step": 830
    },
    {
      "epoch": 0.45118839801262256,
      "grad_norm": 483.0531005859375,
      "learning_rate": 6.023947299747592e-10,
      "loss": 40.7409,
      "step": 840
    },
    {
      "epoch": 0.45655968846515377,
      "grad_norm": 495.1935119628906,
      "learning_rate": 5.93861618173172e-10,
      "loss": 40.123,
      "step": 850
    },
    {
      "epoch": 0.461930978917685,
      "grad_norm": 420.5578918457031,
      "learning_rate": 5.853000733898161e-10,
      "loss": 39.6038,
      "step": 860
    },
    {
      "epoch": 0.4673022693702162,
      "grad_norm": 512.252197265625,
      "learning_rate": 5.767126891261828e-10,
      "loss": 40.0436,
      "step": 870
    },
    {
      "epoch": 0.4726735598227474,
      "grad_norm": 499.7673645019531,
      "learning_rate": 5.681020667111754e-10,
      "loss": 39.6081,
      "step": 880
    },
    {
      "epoch": 0.47804485027527865,
      "grad_norm": 454.9427185058594,
      "learning_rate": 5.594708145131012e-10,
      "loss": 39.5993,
      "step": 890
    },
    {
      "epoch": 0.48341614072780986,
      "grad_norm": 437.3612060546875,
      "learning_rate": 5.508215471495337e-10,
      "loss": 39.8825,
      "step": 900
    },
    {
      "epoch": 0.48878743118034107,
      "grad_norm": 520.9217529296875,
      "learning_rate": 5.421568846952822e-10,
      "loss": 41.5034,
      "step": 910
    },
    {
      "epoch": 0.4941587216328723,
      "grad_norm": 488.6954650878906,
      "learning_rate": 5.334794518887044e-10,
      "loss": 39.5379,
      "step": 920
    },
    {
      "epoch": 0.49953001208540354,
      "grad_norm": 519.3870849609375,
      "learning_rate": 5.247918773366112e-10,
      "loss": 39.54,
      "step": 930
    },
    {
      "epoch": 0.5049013025379347,
      "grad_norm": 497.21746826171875,
      "learning_rate": 5.160967927179963e-10,
      "loss": 40.0503,
      "step": 940
    },
    {
      "epoch": 0.510272592990466,
      "grad_norm": 476.4524841308594,
      "learning_rate": 5.073968319868361e-10,
      "loss": 39.7168,
      "step": 950
    },
    {
      "epoch": 0.5156438834429972,
      "grad_norm": 484.02777099609375,
      "learning_rate": 4.986946305742012e-10,
      "loss": 39.6419,
      "step": 960
    },
    {
      "epoch": 0.5210151738955284,
      "grad_norm": 451.2009582519531,
      "learning_rate": 4.899928245899194e-10,
      "loss": 40.7173,
      "step": 970
    },
    {
      "epoch": 0.5263864643480596,
      "grad_norm": 499.43408203125,
      "learning_rate": 4.812940500240333e-10,
      "loss": 40.2658,
      "step": 980
    },
    {
      "epoch": 0.5317577548005908,
      "grad_norm": 449.65496826171875,
      "learning_rate": 4.72600941948295e-10,
      "loss": 40.1894,
      "step": 990
    },
    {
      "epoch": 0.5371290452531221,
      "grad_norm": 492.1304016113281,
      "learning_rate": 4.6391613371793786e-10,
      "loss": 40.3246,
      "step": 1000
    },
    {
      "epoch": 0.5425003357056533,
      "grad_norm": 484.3329772949219,
      "learning_rate": 4.5524225617396904e-10,
      "loss": 40.9067,
      "step": 1010
    },
    {
      "epoch": 0.5478716261581845,
      "grad_norm": 447.1062927246094,
      "learning_rate": 4.4658193684622293e-10,
      "loss": 40.208,
      "step": 1020
    },
    {
      "epoch": 0.5532429166107157,
      "grad_norm": 466.41522216796875,
      "learning_rate": 4.3793779915741885e-10,
      "loss": 40.008,
      "step": 1030
    },
    {
      "epoch": 0.5586142070632469,
      "grad_norm": 472.39013671875,
      "learning_rate": 4.293124616284608e-10,
      "loss": 40.3332,
      "step": 1040
    },
    {
      "epoch": 0.5639854975157782,
      "grad_norm": 463.3416748046875,
      "learning_rate": 4.2070853708522495e-10,
      "loss": 40.8243,
      "step": 1050
    },
    {
      "epoch": 0.5693567879683094,
      "grad_norm": 516.2078247070312,
      "learning_rate": 4.1212863186706943e-10,
      "loss": 40.5519,
      "step": 1060
    },
    {
      "epoch": 0.5747280784208406,
      "grad_norm": 487.50628662109375,
      "learning_rate": 4.035753450373111e-10,
      "loss": 40.4969,
      "step": 1070
    },
    {
      "epoch": 0.5800993688733719,
      "grad_norm": 520.0319213867188,
      "learning_rate": 3.950512675959052e-10,
      "loss": 39.9747,
      "step": 1080
    },
    {
      "epoch": 0.585470659325903,
      "grad_norm": 543.7863159179688,
      "learning_rate": 3.865589816945685e-10,
      "loss": 40.0276,
      "step": 1090
    },
    {
      "epoch": 0.5908419497784343,
      "grad_norm": 486.9960021972656,
      "learning_rate": 3.7810105985458137e-10,
      "loss": 40.1272,
      "step": 1100
    },
    {
      "epoch": 0.5962132402309654,
      "grad_norm": 502.4769287109375,
      "learning_rate": 3.6968006418751e-10,
      "loss": 40.3276,
      "step": 1110
    },
    {
      "epoch": 0.6015845306834967,
      "grad_norm": 472.21533203125,
      "learning_rate": 3.6129854561907786e-10,
      "loss": 40.4212,
      "step": 1120
    },
    {
      "epoch": 0.606955821136028,
      "grad_norm": 434.5205078125,
      "learning_rate": 3.5295904311642897e-10,
      "loss": 39.5327,
      "step": 1130
    },
    {
      "epoch": 0.6123271115885591,
      "grad_norm": 511.1942138671875,
      "learning_rate": 3.446640829190133e-10,
      "loss": 40.5099,
      "step": 1140
    },
    {
      "epoch": 0.6176984020410904,
      "grad_norm": 479.92901611328125,
      "learning_rate": 3.3641617777332523e-10,
      "loss": 39.1485,
      "step": 1150
    },
    {
      "epoch": 0.6230696924936215,
      "grad_norm": 512.5575561523438,
      "learning_rate": 3.2821782617173294e-10,
      "loss": 41.33,
      "step": 1160
    },
    {
      "epoch": 0.6284409829461528,
      "grad_norm": 517.29833984375,
      "learning_rate": 3.2007151159562237e-10,
      "loss": 39.8799,
      "step": 1170
    },
    {
      "epoch": 0.6338122733986841,
      "grad_norm": 452.1294860839844,
      "learning_rate": 3.119797017630914e-10,
      "loss": 40.0134,
      "step": 1180
    },
    {
      "epoch": 0.6391835638512152,
      "grad_norm": 499.8146057128906,
      "learning_rate": 3.0394484788141616e-10,
      "loss": 40.4734,
      "step": 1190
    },
    {
      "epoch": 0.6445548543037465,
      "grad_norm": 551.718994140625,
      "learning_rate": 2.9596938390452166e-10,
      "loss": 39.968,
      "step": 1200
    },
    {
      "epoch": 0.6499261447562777,
      "grad_norm": 476.2742614746094,
      "learning_rate": 2.880557257956763e-10,
      "loss": 40.1534,
      "step": 1210
    },
    {
      "epoch": 0.6552974352088089,
      "grad_norm": 493.28167724609375,
      "learning_rate": 2.8020627079563876e-10,
      "loss": 39.8795,
      "step": 1220
    },
    {
      "epoch": 0.6606687256613402,
      "grad_norm": 462.6866149902344,
      "learning_rate": 2.7242339669647403e-10,
      "loss": 40.3111,
      "step": 1230
    },
    {
      "epoch": 0.6660400161138713,
      "grad_norm": 509.9576416015625,
      "learning_rate": 2.647094611212626e-10,
      "loss": 39.3712,
      "step": 1240
    },
    {
      "epoch": 0.6714113065664026,
      "grad_norm": 483.03619384765625,
      "learning_rate": 2.570668008099183e-10,
      "loss": 39.4756,
      "step": 1250
    },
    {
      "epoch": 0.6767825970189338,
      "grad_norm": 505.42071533203125,
      "learning_rate": 2.494977309113331e-10,
      "loss": 40.5326,
      "step": 1260
    },
    {
      "epoch": 0.682153887471465,
      "grad_norm": 461.374755859375,
      "learning_rate": 2.42004544282061e-10,
      "loss": 39.9911,
      "step": 1270
    },
    {
      "epoch": 0.6875251779239963,
      "grad_norm": 432.3858947753906,
      "learning_rate": 2.3458951079175717e-10,
      "loss": 40.3153,
      "step": 1280
    },
    {
      "epoch": 0.6928964683765274,
      "grad_norm": 515.9682006835938,
      "learning_rate": 2.2725487663557688e-10,
      "loss": 40.6573,
      "step": 1290
    },
    {
      "epoch": 0.6982677588290587,
      "grad_norm": 476.286865234375,
      "learning_rate": 2.2000286365374955e-10,
      "loss": 39.9867,
      "step": 1300
    },
    {
      "epoch": 0.7036390492815899,
      "grad_norm": 472.92083740234375,
      "learning_rate": 2.1283566865852822e-10,
      "loss": 40.5379,
      "step": 1310
    },
    {
      "epoch": 0.7090103397341211,
      "grad_norm": 552.19287109375,
      "learning_rate": 2.0575546276872166e-10,
      "loss": 41.3682,
      "step": 1320
    },
    {
      "epoch": 0.7143816301866523,
      "grad_norm": 462.6091003417969,
      "learning_rate": 1.9876439075200893e-10,
      "loss": 41.0671,
      "step": 1330
    },
    {
      "epoch": 0.7197529206391836,
      "grad_norm": 522.0980224609375,
      "learning_rate": 1.9186457037523765e-10,
      "loss": 40.3256,
      "step": 1340
    },
    {
      "epoch": 0.7251242110917148,
      "grad_norm": 495.66510009765625,
      "learning_rate": 1.8505809176289958e-10,
      "loss": 40.3366,
      "step": 1350
    },
    {
      "epoch": 0.730495501544246,
      "grad_norm": 536.3059692382812,
      "learning_rate": 1.7834701676398057e-10,
      "loss": 40.3298,
      "step": 1360
    },
    {
      "epoch": 0.7358667919967772,
      "grad_norm": 527.6504516601562,
      "learning_rate": 1.7173337832737773e-10,
      "loss": 39.7742,
      "step": 1370
    },
    {
      "epoch": 0.7412380824493084,
      "grad_norm": 508.9981689453125,
      "learning_rate": 1.6521917988606762e-10,
      "loss": 40.0357,
      "step": 1380
    },
    {
      "epoch": 0.7466093729018397,
      "grad_norm": 546.9842529296875,
      "learning_rate": 1.588063947502181e-10,
      "loss": 39.8671,
      "step": 1390
    },
    {
      "epoch": 0.7519806633543709,
      "grad_norm": 514.7166748046875,
      "learning_rate": 1.524969655094242e-10,
      "loss": 40.3517,
      "step": 1400
    },
    {
      "epoch": 0.7573519538069021,
      "grad_norm": 458.2544250488281,
      "learning_rate": 1.4629280344425106e-10,
      "loss": 39.5965,
      "step": 1410
    },
    {
      "epoch": 0.7627232442594333,
      "grad_norm": 509.53546142578125,
      "learning_rate": 1.401957879472583e-10,
      "loss": 40.175,
      "step": 1420
    },
    {
      "epoch": 0.7680945347119645,
      "grad_norm": 522.7682495117188,
      "learning_rate": 1.3420776595368834e-10,
      "loss": 39.9108,
      "step": 1430
    },
    {
      "epoch": 0.7734658251644958,
      "grad_norm": 528.9177856445312,
      "learning_rate": 1.283305513819827e-10,
      "loss": 39.4946,
      "step": 1440
    },
    {
      "epoch": 0.778837115617027,
      "grad_norm": 460.80340576171875,
      "learning_rate": 1.225659245843026e-10,
      "loss": 39.8654,
      "step": 1450
    },
    {
      "epoch": 0.7842084060695582,
      "grad_norm": 555.3992309570312,
      "learning_rate": 1.169156318072163e-10,
      "loss": 41.5166,
      "step": 1460
    },
    {
      "epoch": 0.7895796965220895,
      "grad_norm": 549.1808471679688,
      "learning_rate": 1.1138138466271913e-10,
      "loss": 39.4821,
      "step": 1470
    },
    {
      "epoch": 0.7949509869746206,
      "grad_norm": 529.6665649414062,
      "learning_rate": 1.0596485960974251e-10,
      "loss": 40.0072,
      "step": 1480
    },
    {
      "epoch": 0.8003222774271519,
      "grad_norm": 526.4564819335938,
      "learning_rate": 1.0066769744631571e-10,
      "loss": 39.7705,
      "step": 1490
    },
    {
      "epoch": 0.805693567879683,
      "grad_norm": 483.0169372558594,
      "learning_rate": 9.549150281252633e-11,
      "loss": 39.6957,
      "step": 1500
    },
    {
      "epoch": 0.8110648583322143,
      "grad_norm": 508.51190185546875,
      "learning_rate": 9.043784370443615e-11,
      "loss": 39.9077,
      "step": 1510
    },
    {
      "epoch": 0.8164361487847456,
      "grad_norm": 519.5391845703125,
      "learning_rate": 8.550825099909671e-11,
      "loss": 38.918,
      "step": 1520
    },
    {
      "epoch": 0.8218074392372767,
      "grad_norm": 528.9270629882812,
      "learning_rate": 8.070421799080951e-11,
      "loss": 40.3429,
      "step": 1530
    },
    {
      "epoch": 0.827178729689808,
      "grad_norm": 435.5723876953125,
      "learning_rate": 7.602719993876945e-11,
      "loss": 39.9604,
      "step": 1540
    },
    {
      "epoch": 0.8325500201423391,
      "grad_norm": 500.90625,
      "learning_rate": 7.147861362623287e-11,
      "loss": 40.3838,
      "step": 1550
    },
    {
      "epoch": 0.8379213105948704,
      "grad_norm": 495.9076232910156,
      "learning_rate": 6.705983693133794e-11,
      "loss": 40.1013,
      "step": 1560
    },
    {
      "epoch": 0.8432926010474017,
      "grad_norm": 527.7721557617188,
      "learning_rate": 6.277220840971198e-11,
      "loss": 40.5773,
      "step": 1570
    },
    {
      "epoch": 0.8486638914999328,
      "grad_norm": 552.565185546875,
      "learning_rate": 5.861702688899046e-11,
      "loss": 39.9742,
      "step": 1580
    },
    {
      "epoch": 0.8540351819524641,
      "grad_norm": 478.8362731933594,
      "learning_rate": 5.459555107537001e-11,
      "loss": 40.7994,
      "step": 1590
    },
    {
      "epoch": 0.8594064724049953,
      "grad_norm": 476.84759521484375,
      "learning_rate": 5.0708999172315696e-11,
      "loss": 40.5355,
      "step": 1600
    },
    {
      "epoch": 0.8647777628575265,
      "grad_norm": 463.87127685546875,
      "learning_rate": 4.695854851153714e-11,
      "loss": 40.8749,
      "step": 1610
    },
    {
      "epoch": 0.8701490533100578,
      "grad_norm": 455.86065673828125,
      "learning_rate": 4.334533519634643e-11,
      "loss": 40.2378,
      "step": 1620
    },
    {
      "epoch": 0.8755203437625889,
      "grad_norm": 479.13995361328125,
      "learning_rate": 3.9870453757503865e-11,
      "loss": 40.0686,
      "step": 1630
    },
    {
      "epoch": 0.8808916342151202,
      "grad_norm": 534.9734497070312,
      "learning_rate": 3.653495682165842e-11,
      "loss": 40.7489,
      "step": 1640
    },
    {
      "epoch": 0.8862629246676514,
      "grad_norm": 463.552490234375,
      "learning_rate": 3.333985479248103e-11,
      "loss": 40.446,
      "step": 1650
    },
    {
      "epoch": 0.8916342151201826,
      "grad_norm": 531.924072265625,
      "learning_rate": 3.0286115544588767e-11,
      "loss": 39.3065,
      "step": 1660
    },
    {
      "epoch": 0.8970055055727139,
      "grad_norm": 483.08502197265625,
      "learning_rate": 2.737466413035178e-11,
      "loss": 40.1395,
      "step": 1670
    },
    {
      "epoch": 0.9023767960252451,
      "grad_norm": 525.9647827148438,
      "learning_rate": 2.460638249967251e-11,
      "loss": 40.0024,
      "step": 1680
    },
    {
      "epoch": 0.9077480864777763,
      "grad_norm": 535.5175170898438,
      "learning_rate": 2.198210923282118e-11,
      "loss": 39.3654,
      "step": 1690
    },
    {
      "epoch": 0.9131193769303075,
      "grad_norm": 443.3262634277344,
      "learning_rate": 1.9502639286409496e-11,
      "loss": 40.2637,
      "step": 1700
    },
    {
      "epoch": 0.9184906673828387,
      "grad_norm": 461.6935729980469,
      "learning_rate": 1.7168723752578776e-11,
      "loss": 40.2201,
      "step": 1710
    },
    {
      "epoch": 0.92386195783537,
      "grad_norm": 450.4540100097656,
      "learning_rate": 1.498106963147583e-11,
      "loss": 40.5813,
      "step": 1720
    },
    {
      "epoch": 0.9292332482879012,
      "grad_norm": 414.77166748046875,
      "learning_rate": 1.294033961708513e-11,
      "loss": 39.9295,
      "step": 1730
    },
    {
      "epoch": 0.9346045387404324,
      "grad_norm": 539.1185302734375,
      "learning_rate": 1.1047151896482754e-11,
      "loss": 41.3669,
      "step": 1740
    },
    {
      "epoch": 0.9399758291929636,
      "grad_norm": 455.1410217285156,
      "learning_rate": 9.302079962572375e-12,
      "loss": 41.2396,
      "step": 1750
    },
    {
      "epoch": 0.9453471196454948,
      "grad_norm": 484.1607971191406,
      "learning_rate": 7.705652440360033e-12,
      "loss": 39.3638,
      "step": 1760
    },
    {
      "epoch": 0.950718410098026,
      "grad_norm": 456.8821105957031,
      "learning_rate": 6.258352926821032e-12,
      "loss": 39.8145,
      "step": 1770
    },
    {
      "epoch": 0.9560897005505573,
      "grad_norm": 504.9481506347656,
      "learning_rate": 4.960619844406156e-12,
      "loss": 40.1504,
      "step": 1780
    },
    {
      "epoch": 0.9614609910030885,
      "grad_norm": 500.57025146484375,
      "learning_rate": 3.812846308233031e-12,
      "loss": 40.1062,
      "step": 1790
    },
    {
      "epoch": 0.9668322814556197,
      "grad_norm": 494.5524597167969,
      "learning_rate": 2.8153800070020444e-12,
      "loss": 39.8419,
      "step": 1800
    },
    {
      "epoch": 0.972203571908151,
      "grad_norm": 500.6388244628906,
      "learning_rate": 1.9685230976726477e-12,
      "loss": 40.3947,
      "step": 1810
    },
    {
      "epoch": 0.9775748623606821,
      "grad_norm": 467.13702392578125,
      "learning_rate": 1.2725321139326896e-12,
      "loss": 40.5521,
      "step": 1820
    },
    {
      "epoch": 0.9829461528132134,
      "grad_norm": 460.44854736328125,
      "learning_rate": 7.276178884882412e-13,
      "loss": 40.5297,
      "step": 1830
    },
    {
      "epoch": 0.9883174432657446,
      "grad_norm": 521.1849365234375,
      "learning_rate": 3.3394548919707394e-13,
      "loss": 40.9712,
      "step": 1840
    },
    {
      "epoch": 0.9936887337182758,
      "grad_norm": 527.3604125976562,
      "learning_rate": 9.163416906554645e-14,
      "loss": 40.1704,
      "step": 1850
    },
    {
      "epoch": 0.9990600241708071,
      "grad_norm": 505.638427734375,
      "learning_rate": 7.573301240570985e-16,
      "loss": 41.4237,
      "step": 1860
    },
    {
      "epoch": 0.9995971532160601,
      "step": 1861,
      "total_flos": 0.0,
      "train_loss": 40.22760858182174,
      "train_runtime": 17617.7876,
      "train_samples_per_second": 3.381,
      "train_steps_per_second": 0.106
    }
  ],
  "logging_steps": 10,
  "max_steps": 1861,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}