{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9995971532160601,
"eval_steps": 500,
"global_step": 1861,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005371290452531221,
"grad_norm": 495.1745300292969,
"learning_rate": 1.7857142857142858e-10,
"loss": 40.2102,
"step": 10
},
{
"epoch": 0.010742580905062441,
"grad_norm": 477.615966796875,
"learning_rate": 3.5714285714285715e-10,
"loss": 40.7707,
"step": 20
},
{
"epoch": 0.01611387135759366,
"grad_norm": 492.8292236328125,
"learning_rate": 5.357142857142857e-10,
"loss": 40.2476,
"step": 30
},
{
"epoch": 0.021485161810124883,
"grad_norm": 529.812255859375,
"learning_rate": 7.142857142857143e-10,
"loss": 40.2061,
"step": 40
},
{
"epoch": 0.026856452262656105,
"grad_norm": 534.6681518554688,
"learning_rate": 8.92857142857143e-10,
"loss": 40.3632,
"step": 50
},
{
"epoch": 0.03222774271518732,
"grad_norm": 493.5475769042969,
"learning_rate": 9.999878827638992e-10,
"loss": 40.3255,
"step": 60
},
{
"epoch": 0.037599033167718544,
"grad_norm": 514.5845336914062,
"learning_rate": 9.998515706025587e-10,
"loss": 40.1727,
"step": 70
},
{
"epoch": 0.042970323620249766,
"grad_norm": 520.8786010742188,
"learning_rate": 9.99563841164693e-10,
"loss": 40.3397,
"step": 80
},
{
"epoch": 0.04834161407278099,
"grad_norm": 506.258056640625,
"learning_rate": 9.991247816105924e-10,
"loss": 40.5221,
"step": 90
},
{
"epoch": 0.05371290452531221,
"grad_norm": 518.2510375976562,
"learning_rate": 9.985345249421433e-10,
"loss": 40.3247,
"step": 100
},
{
"epoch": 0.059084194977843424,
"grad_norm": 509.5160827636719,
"learning_rate": 9.977932499625396e-10,
"loss": 40.4914,
"step": 110
},
{
"epoch": 0.06445548543037465,
"grad_norm": 521.5568237304688,
"learning_rate": 9.969011812221178e-10,
"loss": 40.4141,
"step": 120
},
{
"epoch": 0.06982677588290587,
"grad_norm": 429.8058776855469,
"learning_rate": 9.958585889503364e-10,
"loss": 40.7874,
"step": 130
},
{
"epoch": 0.07519806633543709,
"grad_norm": 523.8095092773438,
"learning_rate": 9.946657889739163e-10,
"loss": 40.9356,
"step": 140
},
{
"epoch": 0.08056935678796831,
"grad_norm": 492.9051818847656,
"learning_rate": 9.933231426211678e-10,
"loss": 40.3382,
"step": 150
},
{
"epoch": 0.08594064724049953,
"grad_norm": 455.4582824707031,
"learning_rate": 9.918310566125387e-10,
"loss": 39.8185,
"step": 160
},
{
"epoch": 0.09131193769303075,
"grad_norm": 479.072265625,
"learning_rate": 9.901899829374048e-10,
"loss": 39.3643,
"step": 170
},
{
"epoch": 0.09668322814556198,
"grad_norm": 472.71112060546875,
"learning_rate": 9.884004187171542e-10,
"loss": 39.7452,
"step": 180
},
{
"epoch": 0.1020545185980932,
"grad_norm": 508.54547119140625,
"learning_rate": 9.864629060545955e-10,
"loss": 41.1781,
"step": 190
},
{
"epoch": 0.10742580905062442,
"grad_norm": 468.7608642578125,
"learning_rate": 9.843780318697425e-10,
"loss": 40.0115,
"step": 200
},
{
"epoch": 0.11279709950315564,
"grad_norm": 512.1522216796875,
"learning_rate": 9.821464277220207e-10,
"loss": 40.1177,
"step": 210
},
{
"epoch": 0.11816838995568685,
"grad_norm": 483.4913635253906,
"learning_rate": 9.79768769618954e-10,
"loss": 40.2036,
"step": 220
},
{
"epoch": 0.12353968040821807,
"grad_norm": 509.5931091308594,
"learning_rate": 9.77245777811384e-10,
"loss": 39.6307,
"step": 230
},
{
"epoch": 0.1289109708607493,
"grad_norm": 483.1646728515625,
"learning_rate": 9.745782165752891e-10,
"loss": 40.7153,
"step": 240
},
{
"epoch": 0.13428226131328053,
"grad_norm": 436.9029235839844,
"learning_rate": 9.717668939802664e-10,
"loss": 40.1682,
"step": 250
},
{
"epoch": 0.13965355176581173,
"grad_norm": 467.0509338378906,
"learning_rate": 9.68812661644748e-10,
"loss": 40.397,
"step": 260
},
{
"epoch": 0.14502484221834297,
"grad_norm": 496.4578857421875,
"learning_rate": 9.657164144780247e-10,
"loss": 40.5665,
"step": 270
},
{
"epoch": 0.15039613267087418,
"grad_norm": 542.7703857421875,
"learning_rate": 9.624790904091554e-10,
"loss": 40.8245,
"step": 280
},
{
"epoch": 0.15576742312340539,
"grad_norm": 467.4183349609375,
"learning_rate": 9.59101670102847e-10,
"loss": 40.5495,
"step": 290
},
{
"epoch": 0.16113871357593662,
"grad_norm": 514.7463989257812,
"learning_rate": 9.555851766623854e-10,
"loss": 40.3572,
"step": 300
},
{
"epoch": 0.16651000402846783,
"grad_norm": 479.075439453125,
"learning_rate": 9.519306753197134e-10,
"loss": 40.9737,
"step": 310
},
{
"epoch": 0.17188129448099906,
"grad_norm": 480.1302185058594,
"learning_rate": 9.481392731127458e-10,
"loss": 40.1158,
"step": 320
},
{
"epoch": 0.17725258493353027,
"grad_norm": 459.5805969238281,
"learning_rate": 9.442121185500201e-10,
"loss": 39.8788,
"step": 330
},
{
"epoch": 0.1826238753860615,
"grad_norm": 460.0196533203125,
"learning_rate": 9.401504012627857e-10,
"loss": 39.5867,
"step": 340
},
{
"epoch": 0.18799516583859271,
"grad_norm": 477.8091735839844,
"learning_rate": 9.35955351644635e-10,
"loss": 41.1957,
"step": 350
},
{
"epoch": 0.19336645629112395,
"grad_norm": 475.1239929199219,
"learning_rate": 9.31628240478787e-10,
"loss": 40.1875,
"step": 360
},
{
"epoch": 0.19873774674365516,
"grad_norm": 522.9048461914062,
"learning_rate": 9.27170378553137e-10,
"loss": 40.7773,
"step": 370
},
{
"epoch": 0.2041090371961864,
"grad_norm": 489.0563049316406,
"learning_rate": 9.225831162631853e-10,
"loss": 40.9744,
"step": 380
},
{
"epoch": 0.2094803276487176,
"grad_norm": 426.8688659667969,
"learning_rate": 9.178678432029706e-10,
"loss": 39.4605,
"step": 390
},
{
"epoch": 0.21485161810124884,
"grad_norm": 479.16900634765625,
"learning_rate": 9.130259877441272e-10,
"loss": 39.4938,
"step": 400
},
{
"epoch": 0.22022290855378004,
"grad_norm": 523.247802734375,
"learning_rate": 9.080590166031966e-10,
"loss": 40.7306,
"step": 410
},
{
"epoch": 0.22559419900631128,
"grad_norm": 505.7490234375,
"learning_rate": 9.02968434397323e-10,
"loss": 40.1471,
"step": 420
},
{
"epoch": 0.2309654894588425,
"grad_norm": 474.9671630859375,
"learning_rate": 8.977557831884684e-10,
"loss": 41.0007,
"step": 430
},
{
"epoch": 0.2363367799113737,
"grad_norm": 490.84332275390625,
"learning_rate": 8.924226420162834e-10,
"loss": 39.3389,
"step": 440
},
{
"epoch": 0.24170807036390493,
"grad_norm": 542.4966430664062,
"learning_rate": 8.869706264197784e-10,
"loss": 40.3484,
"step": 450
},
{
"epoch": 0.24707936081643614,
"grad_norm": 480.568603515625,
"learning_rate": 8.814013879479366e-10,
"loss": 40.1192,
"step": 460
},
{
"epoch": 0.2524506512689674,
"grad_norm": 496.9430236816406,
"learning_rate": 8.757166136594194e-10,
"loss": 40.247,
"step": 470
},
{
"epoch": 0.2578219417214986,
"grad_norm": 519.3447265625,
"learning_rate": 8.699180256115157e-10,
"loss": 39.9668,
"step": 480
},
{
"epoch": 0.2631932321740298,
"grad_norm": 462.6995849609375,
"learning_rate": 8.640073803384881e-10,
"loss": 40.2213,
"step": 490
},
{
"epoch": 0.26856452262656105,
"grad_norm": 498.30548095703125,
"learning_rate": 8.579864683194752e-10,
"loss": 39.849,
"step": 500
},
{
"epoch": 0.27393581307909226,
"grad_norm": 491.5065002441406,
"learning_rate": 8.518571134361129e-10,
"loss": 39.8567,
"step": 510
},
{
"epoch": 0.27930710353162347,
"grad_norm": 401.1820068359375,
"learning_rate": 8.456211724200347e-10,
"loss": 40.0964,
"step": 520
},
{
"epoch": 0.2846783939841547,
"grad_norm": 472.61627197265625,
"learning_rate": 8.392805342904231e-10,
"loss": 39.9992,
"step": 530
},
{
"epoch": 0.29004968443668594,
"grad_norm": 504.82861328125,
"learning_rate": 8.328371197817788e-10,
"loss": 40.4024,
"step": 540
},
{
"epoch": 0.29542097488921715,
"grad_norm": 460.82733154296875,
"learning_rate": 8.262928807620843e-10,
"loss": 41.2341,
"step": 550
},
{
"epoch": 0.30079226534174835,
"grad_norm": 515.23583984375,
"learning_rate": 8.196497996415337e-10,
"loss": 40.4191,
"step": 560
},
{
"epoch": 0.30616355579427956,
"grad_norm": 534.2943725585938,
"learning_rate": 8.129098887720137e-10,
"loss": 40.0219,
"step": 570
},
{
"epoch": 0.31153484624681077,
"grad_norm": 506.2889099121094,
"learning_rate": 8.060751898375115e-10,
"loss": 40.2062,
"step": 580
},
{
"epoch": 0.31690613669934203,
"grad_norm": 451.0182800292969,
"learning_rate": 7.991477732356403e-10,
"loss": 40.1886,
"step": 590
},
{
"epoch": 0.32227742715187324,
"grad_norm": 497.3751525878906,
"learning_rate": 7.921297374504637e-10,
"loss": 40.7882,
"step": 600
},
{
"epoch": 0.32764871760440445,
"grad_norm": 548.7998657226562,
"learning_rate": 7.850232084168145e-10,
"loss": 40.9427,
"step": 610
},
{
"epoch": 0.33302000805693566,
"grad_norm": 486.71063232421875,
"learning_rate": 7.778303388762966e-10,
"loss": 39.4863,
"step": 620
},
{
"epoch": 0.3383912985094669,
"grad_norm": 448.2780456542969,
"learning_rate": 7.705533077251672e-10,
"loss": 39.9087,
"step": 630
},
{
"epoch": 0.34376258896199813,
"grad_norm": 526.2222900390625,
"learning_rate": 7.63194319354295e-10,
"loss": 39.7048,
"step": 640
},
{
"epoch": 0.34913387941452934,
"grad_norm": 492.9909973144531,
"learning_rate": 7.557556029813974e-10,
"loss": 39.5465,
"step": 650
},
{
"epoch": 0.35450516986706054,
"grad_norm": 483.2941589355469,
"learning_rate": 7.482394119757546e-10,
"loss": 40.6158,
"step": 660
},
{
"epoch": 0.3598764603195918,
"grad_norm": 475.9729309082031,
"learning_rate": 7.406480231756098e-10,
"loss": 39.8862,
"step": 670
},
{
"epoch": 0.365247750772123,
"grad_norm": 477.7049255371094,
"learning_rate": 7.329837361984598e-10,
"loss": 40.462,
"step": 680
},
{
"epoch": 0.3706190412246542,
"grad_norm": 448.4286804199219,
"learning_rate": 7.252488727444418e-10,
"loss": 40.037,
"step": 690
},
{
"epoch": 0.37599033167718543,
"grad_norm": 480.7619934082031,
"learning_rate": 7.174457758930374e-10,
"loss": 41.1926,
"step": 700
},
{
"epoch": 0.38136162212971664,
"grad_norm": 452.7475280761719,
"learning_rate": 7.095768093932932e-10,
"loss": 39.8431,
"step": 710
},
{
"epoch": 0.3867329125822479,
"grad_norm": 419.9246826171875,
"learning_rate": 7.016443569477854e-10,
"loss": 39.7369,
"step": 720
},
{
"epoch": 0.3921042030347791,
"grad_norm": 516.64306640625,
"learning_rate": 6.936508214905369e-10,
"loss": 39.727,
"step": 730
},
{
"epoch": 0.3974754934873103,
"grad_norm": 532.8106079101562,
"learning_rate": 6.855986244591104e-10,
"loss": 39.0725,
"step": 740
},
{
"epoch": 0.4028467839398415,
"grad_norm": 510.8319396972656,
"learning_rate": 6.774902050610951e-10,
"loss": 40.6862,
"step": 750
},
{
"epoch": 0.4082180743923728,
"grad_norm": 525.424560546875,
"learning_rate": 6.693280195352114e-10,
"loss": 40.5439,
"step": 760
},
{
"epoch": 0.413589364844904,
"grad_norm": 476.6329040527344,
"learning_rate": 6.61114540407256e-10,
"loss": 40.4504,
"step": 770
},
{
"epoch": 0.4189606552974352,
"grad_norm": 495.2875061035156,
"learning_rate": 6.528522557411133e-10,
"loss": 40.9673,
"step": 780
},
{
"epoch": 0.4243319457499664,
"grad_norm": 468.4483642578125,
"learning_rate": 6.445436683850597e-10,
"loss": 40.2403,
"step": 790
},
{
"epoch": 0.4297032362024977,
"grad_norm": 516.677490234375,
"learning_rate": 6.361912952135903e-10,
"loss": 40.4345,
"step": 800
},
{
"epoch": 0.4350745266550289,
"grad_norm": 509.36138916015625,
"learning_rate": 6.277976663649947e-10,
"loss": 39.9229,
"step": 810
},
{
"epoch": 0.4404458171075601,
"grad_norm": 502.6529541015625,
"learning_rate": 6.193653244749179e-10,
"loss": 40.928,
"step": 820
},
{
"epoch": 0.4458171075600913,
"grad_norm": 477.757568359375,
"learning_rate": 6.108968239061324e-10,
"loss": 40.2371,
"step": 830
},
{
"epoch": 0.45118839801262256,
"grad_norm": 483.0531005859375,
"learning_rate": 6.023947299747592e-10,
"loss": 40.7409,
"step": 840
},
{
"epoch": 0.45655968846515377,
"grad_norm": 495.1935119628906,
"learning_rate": 5.93861618173172e-10,
"loss": 40.123,
"step": 850
},
{
"epoch": 0.461930978917685,
"grad_norm": 420.5578918457031,
"learning_rate": 5.853000733898161e-10,
"loss": 39.6038,
"step": 860
},
{
"epoch": 0.4673022693702162,
"grad_norm": 512.252197265625,
"learning_rate": 5.767126891261828e-10,
"loss": 40.0436,
"step": 870
},
{
"epoch": 0.4726735598227474,
"grad_norm": 499.7673645019531,
"learning_rate": 5.681020667111754e-10,
"loss": 39.6081,
"step": 880
},
{
"epoch": 0.47804485027527865,
"grad_norm": 454.9427185058594,
"learning_rate": 5.594708145131012e-10,
"loss": 39.5993,
"step": 890
},
{
"epoch": 0.48341614072780986,
"grad_norm": 437.3612060546875,
"learning_rate": 5.508215471495337e-10,
"loss": 39.8825,
"step": 900
},
{
"epoch": 0.48878743118034107,
"grad_norm": 520.9217529296875,
"learning_rate": 5.421568846952822e-10,
"loss": 41.5034,
"step": 910
},
{
"epoch": 0.4941587216328723,
"grad_norm": 488.6954650878906,
"learning_rate": 5.334794518887044e-10,
"loss": 39.5379,
"step": 920
},
{
"epoch": 0.49953001208540354,
"grad_norm": 519.3870849609375,
"learning_rate": 5.247918773366112e-10,
"loss": 39.54,
"step": 930
},
{
"epoch": 0.5049013025379347,
"grad_norm": 497.21746826171875,
"learning_rate": 5.160967927179963e-10,
"loss": 40.0503,
"step": 940
},
{
"epoch": 0.510272592990466,
"grad_norm": 476.4524841308594,
"learning_rate": 5.073968319868361e-10,
"loss": 39.7168,
"step": 950
},
{
"epoch": 0.5156438834429972,
"grad_norm": 484.02777099609375,
"learning_rate": 4.986946305742012e-10,
"loss": 39.6419,
"step": 960
},
{
"epoch": 0.5210151738955284,
"grad_norm": 451.2009582519531,
"learning_rate": 4.899928245899194e-10,
"loss": 40.7173,
"step": 970
},
{
"epoch": 0.5263864643480596,
"grad_norm": 499.43408203125,
"learning_rate": 4.812940500240333e-10,
"loss": 40.2658,
"step": 980
},
{
"epoch": 0.5317577548005908,
"grad_norm": 449.65496826171875,
"learning_rate": 4.72600941948295e-10,
"loss": 40.1894,
"step": 990
},
{
"epoch": 0.5371290452531221,
"grad_norm": 492.1304016113281,
"learning_rate": 4.6391613371793786e-10,
"loss": 40.3246,
"step": 1000
},
{
"epoch": 0.5425003357056533,
"grad_norm": 484.3329772949219,
"learning_rate": 4.5524225617396904e-10,
"loss": 40.9067,
"step": 1010
},
{
"epoch": 0.5478716261581845,
"grad_norm": 447.1062927246094,
"learning_rate": 4.4658193684622293e-10,
"loss": 40.208,
"step": 1020
},
{
"epoch": 0.5532429166107157,
"grad_norm": 466.41522216796875,
"learning_rate": 4.3793779915741885e-10,
"loss": 40.008,
"step": 1030
},
{
"epoch": 0.5586142070632469,
"grad_norm": 472.39013671875,
"learning_rate": 4.293124616284608e-10,
"loss": 40.3332,
"step": 1040
},
{
"epoch": 0.5639854975157782,
"grad_norm": 463.3416748046875,
"learning_rate": 4.2070853708522495e-10,
"loss": 40.8243,
"step": 1050
},
{
"epoch": 0.5693567879683094,
"grad_norm": 516.2078247070312,
"learning_rate": 4.1212863186706943e-10,
"loss": 40.5519,
"step": 1060
},
{
"epoch": 0.5747280784208406,
"grad_norm": 487.50628662109375,
"learning_rate": 4.035753450373111e-10,
"loss": 40.4969,
"step": 1070
},
{
"epoch": 0.5800993688733719,
"grad_norm": 520.0319213867188,
"learning_rate": 3.950512675959052e-10,
"loss": 39.9747,
"step": 1080
},
{
"epoch": 0.585470659325903,
"grad_norm": 543.7863159179688,
"learning_rate": 3.865589816945685e-10,
"loss": 40.0276,
"step": 1090
},
{
"epoch": 0.5908419497784343,
"grad_norm": 486.9960021972656,
"learning_rate": 3.7810105985458137e-10,
"loss": 40.1272,
"step": 1100
},
{
"epoch": 0.5962132402309654,
"grad_norm": 502.4769287109375,
"learning_rate": 3.6968006418751e-10,
"loss": 40.3276,
"step": 1110
},
{
"epoch": 0.6015845306834967,
"grad_norm": 472.21533203125,
"learning_rate": 3.6129854561907786e-10,
"loss": 40.4212,
"step": 1120
},
{
"epoch": 0.606955821136028,
"grad_norm": 434.5205078125,
"learning_rate": 3.5295904311642897e-10,
"loss": 39.5327,
"step": 1130
},
{
"epoch": 0.6123271115885591,
"grad_norm": 511.1942138671875,
"learning_rate": 3.446640829190133e-10,
"loss": 40.5099,
"step": 1140
},
{
"epoch": 0.6176984020410904,
"grad_norm": 479.92901611328125,
"learning_rate": 3.3641617777332523e-10,
"loss": 39.1485,
"step": 1150
},
{
"epoch": 0.6230696924936215,
"grad_norm": 512.5575561523438,
"learning_rate": 3.2821782617173294e-10,
"loss": 41.33,
"step": 1160
},
{
"epoch": 0.6284409829461528,
"grad_norm": 517.29833984375,
"learning_rate": 3.2007151159562237e-10,
"loss": 39.8799,
"step": 1170
},
{
"epoch": 0.6338122733986841,
"grad_norm": 452.1294860839844,
"learning_rate": 3.119797017630914e-10,
"loss": 40.0134,
"step": 1180
},
{
"epoch": 0.6391835638512152,
"grad_norm": 499.8146057128906,
"learning_rate": 3.0394484788141616e-10,
"loss": 40.4734,
"step": 1190
},
{
"epoch": 0.6445548543037465,
"grad_norm": 551.718994140625,
"learning_rate": 2.9596938390452166e-10,
"loss": 39.968,
"step": 1200
},
{
"epoch": 0.6499261447562777,
"grad_norm": 476.2742614746094,
"learning_rate": 2.880557257956763e-10,
"loss": 40.1534,
"step": 1210
},
{
"epoch": 0.6552974352088089,
"grad_norm": 493.28167724609375,
"learning_rate": 2.8020627079563876e-10,
"loss": 39.8795,
"step": 1220
},
{
"epoch": 0.6606687256613402,
"grad_norm": 462.6866149902344,
"learning_rate": 2.7242339669647403e-10,
"loss": 40.3111,
"step": 1230
},
{
"epoch": 0.6660400161138713,
"grad_norm": 509.9576416015625,
"learning_rate": 2.647094611212626e-10,
"loss": 39.3712,
"step": 1240
},
{
"epoch": 0.6714113065664026,
"grad_norm": 483.03619384765625,
"learning_rate": 2.570668008099183e-10,
"loss": 39.4756,
"step": 1250
},
{
"epoch": 0.6767825970189338,
"grad_norm": 505.42071533203125,
"learning_rate": 2.494977309113331e-10,
"loss": 40.5326,
"step": 1260
},
{
"epoch": 0.682153887471465,
"grad_norm": 461.374755859375,
"learning_rate": 2.42004544282061e-10,
"loss": 39.9911,
"step": 1270
},
{
"epoch": 0.6875251779239963,
"grad_norm": 432.3858947753906,
"learning_rate": 2.3458951079175717e-10,
"loss": 40.3153,
"step": 1280
},
{
"epoch": 0.6928964683765274,
"grad_norm": 515.9682006835938,
"learning_rate": 2.2725487663557688e-10,
"loss": 40.6573,
"step": 1290
},
{
"epoch": 0.6982677588290587,
"grad_norm": 476.286865234375,
"learning_rate": 2.2000286365374955e-10,
"loss": 39.9867,
"step": 1300
},
{
"epoch": 0.7036390492815899,
"grad_norm": 472.92083740234375,
"learning_rate": 2.1283566865852822e-10,
"loss": 40.5379,
"step": 1310
},
{
"epoch": 0.7090103397341211,
"grad_norm": 552.19287109375,
"learning_rate": 2.0575546276872166e-10,
"loss": 41.3682,
"step": 1320
},
{
"epoch": 0.7143816301866523,
"grad_norm": 462.6091003417969,
"learning_rate": 1.9876439075200893e-10,
"loss": 41.0671,
"step": 1330
},
{
"epoch": 0.7197529206391836,
"grad_norm": 522.0980224609375,
"learning_rate": 1.9186457037523765e-10,
"loss": 40.3256,
"step": 1340
},
{
"epoch": 0.7251242110917148,
"grad_norm": 495.66510009765625,
"learning_rate": 1.8505809176289958e-10,
"loss": 40.3366,
"step": 1350
},
{
"epoch": 0.730495501544246,
"grad_norm": 536.3059692382812,
"learning_rate": 1.7834701676398057e-10,
"loss": 40.3298,
"step": 1360
},
{
"epoch": 0.7358667919967772,
"grad_norm": 527.6504516601562,
"learning_rate": 1.7173337832737773e-10,
"loss": 39.7742,
"step": 1370
},
{
"epoch": 0.7412380824493084,
"grad_norm": 508.9981689453125,
"learning_rate": 1.6521917988606762e-10,
"loss": 40.0357,
"step": 1380
},
{
"epoch": 0.7466093729018397,
"grad_norm": 546.9842529296875,
"learning_rate": 1.588063947502181e-10,
"loss": 39.8671,
"step": 1390
},
{
"epoch": 0.7519806633543709,
"grad_norm": 514.7166748046875,
"learning_rate": 1.524969655094242e-10,
"loss": 40.3517,
"step": 1400
},
{
"epoch": 0.7573519538069021,
"grad_norm": 458.2544250488281,
"learning_rate": 1.4629280344425106e-10,
"loss": 39.5965,
"step": 1410
},
{
"epoch": 0.7627232442594333,
"grad_norm": 509.53546142578125,
"learning_rate": 1.401957879472583e-10,
"loss": 40.175,
"step": 1420
},
{
"epoch": 0.7680945347119645,
"grad_norm": 522.7682495117188,
"learning_rate": 1.3420776595368834e-10,
"loss": 39.9108,
"step": 1430
},
{
"epoch": 0.7734658251644958,
"grad_norm": 528.9177856445312,
"learning_rate": 1.283305513819827e-10,
"loss": 39.4946,
"step": 1440
},
{
"epoch": 0.778837115617027,
"grad_norm": 460.80340576171875,
"learning_rate": 1.225659245843026e-10,
"loss": 39.8654,
"step": 1450
},
{
"epoch": 0.7842084060695582,
"grad_norm": 555.3992309570312,
"learning_rate": 1.169156318072163e-10,
"loss": 41.5166,
"step": 1460
},
{
"epoch": 0.7895796965220895,
"grad_norm": 549.1808471679688,
"learning_rate": 1.1138138466271913e-10,
"loss": 39.4821,
"step": 1470
},
{
"epoch": 0.7949509869746206,
"grad_norm": 529.6665649414062,
"learning_rate": 1.0596485960974251e-10,
"loss": 40.0072,
"step": 1480
},
{
"epoch": 0.8003222774271519,
"grad_norm": 526.4564819335938,
"learning_rate": 1.0066769744631571e-10,
"loss": 39.7705,
"step": 1490
},
{
"epoch": 0.805693567879683,
"grad_norm": 483.0169372558594,
"learning_rate": 9.549150281252633e-11,
"loss": 39.6957,
"step": 1500
},
{
"epoch": 0.8110648583322143,
"grad_norm": 508.51190185546875,
"learning_rate": 9.043784370443615e-11,
"loss": 39.9077,
"step": 1510
},
{
"epoch": 0.8164361487847456,
"grad_norm": 519.5391845703125,
"learning_rate": 8.550825099909671e-11,
"loss": 38.918,
"step": 1520
},
{
"epoch": 0.8218074392372767,
"grad_norm": 528.9270629882812,
"learning_rate": 8.070421799080951e-11,
"loss": 40.3429,
"step": 1530
},
{
"epoch": 0.827178729689808,
"grad_norm": 435.5723876953125,
"learning_rate": 7.602719993876945e-11,
"loss": 39.9604,
"step": 1540
},
{
"epoch": 0.8325500201423391,
"grad_norm": 500.90625,
"learning_rate": 7.147861362623287e-11,
"loss": 40.3838,
"step": 1550
},
{
"epoch": 0.8379213105948704,
"grad_norm": 495.9076232910156,
"learning_rate": 6.705983693133794e-11,
"loss": 40.1013,
"step": 1560
},
{
"epoch": 0.8432926010474017,
"grad_norm": 527.7721557617188,
"learning_rate": 6.277220840971198e-11,
"loss": 40.5773,
"step": 1570
},
{
"epoch": 0.8486638914999328,
"grad_norm": 552.565185546875,
"learning_rate": 5.861702688899046e-11,
"loss": 39.9742,
"step": 1580
},
{
"epoch": 0.8540351819524641,
"grad_norm": 478.8362731933594,
"learning_rate": 5.459555107537001e-11,
"loss": 40.7994,
"step": 1590
},
{
"epoch": 0.8594064724049953,
"grad_norm": 476.84759521484375,
"learning_rate": 5.0708999172315696e-11,
"loss": 40.5355,
"step": 1600
},
{
"epoch": 0.8647777628575265,
"grad_norm": 463.87127685546875,
"learning_rate": 4.695854851153714e-11,
"loss": 40.8749,
"step": 1610
},
{
"epoch": 0.8701490533100578,
"grad_norm": 455.86065673828125,
"learning_rate": 4.334533519634643e-11,
"loss": 40.2378,
"step": 1620
},
{
"epoch": 0.8755203437625889,
"grad_norm": 479.13995361328125,
"learning_rate": 3.9870453757503865e-11,
"loss": 40.0686,
"step": 1630
},
{
"epoch": 0.8808916342151202,
"grad_norm": 534.9734497070312,
"learning_rate": 3.653495682165842e-11,
"loss": 40.7489,
"step": 1640
},
{
"epoch": 0.8862629246676514,
"grad_norm": 463.552490234375,
"learning_rate": 3.333985479248103e-11,
"loss": 40.446,
"step": 1650
},
{
"epoch": 0.8916342151201826,
"grad_norm": 531.924072265625,
"learning_rate": 3.0286115544588767e-11,
"loss": 39.3065,
"step": 1660
},
{
"epoch": 0.8970055055727139,
"grad_norm": 483.08502197265625,
"learning_rate": 2.737466413035178e-11,
"loss": 40.1395,
"step": 1670
},
{
"epoch": 0.9023767960252451,
"grad_norm": 525.9647827148438,
"learning_rate": 2.460638249967251e-11,
"loss": 40.0024,
"step": 1680
},
{
"epoch": 0.9077480864777763,
"grad_norm": 535.5175170898438,
"learning_rate": 2.198210923282118e-11,
"loss": 39.3654,
"step": 1690
},
{
"epoch": 0.9131193769303075,
"grad_norm": 443.3262634277344,
"learning_rate": 1.9502639286409496e-11,
"loss": 40.2637,
"step": 1700
},
{
"epoch": 0.9184906673828387,
"grad_norm": 461.6935729980469,
"learning_rate": 1.7168723752578776e-11,
"loss": 40.2201,
"step": 1710
},
{
"epoch": 0.92386195783537,
"grad_norm": 450.4540100097656,
"learning_rate": 1.498106963147583e-11,
"loss": 40.5813,
"step": 1720
},
{
"epoch": 0.9292332482879012,
"grad_norm": 414.77166748046875,
"learning_rate": 1.294033961708513e-11,
"loss": 39.9295,
"step": 1730
},
{
"epoch": 0.9346045387404324,
"grad_norm": 539.1185302734375,
"learning_rate": 1.1047151896482754e-11,
"loss": 41.3669,
"step": 1740
},
{
"epoch": 0.9399758291929636,
"grad_norm": 455.1410217285156,
"learning_rate": 9.302079962572375e-12,
"loss": 41.2396,
"step": 1750
},
{
"epoch": 0.9453471196454948,
"grad_norm": 484.1607971191406,
"learning_rate": 7.705652440360033e-12,
"loss": 39.3638,
"step": 1760
},
{
"epoch": 0.950718410098026,
"grad_norm": 456.8821105957031,
"learning_rate": 6.258352926821032e-12,
"loss": 39.8145,
"step": 1770
},
{
"epoch": 0.9560897005505573,
"grad_norm": 504.9481506347656,
"learning_rate": 4.960619844406156e-12,
"loss": 40.1504,
"step": 1780
},
{
"epoch": 0.9614609910030885,
"grad_norm": 500.57025146484375,
"learning_rate": 3.812846308233031e-12,
"loss": 40.1062,
"step": 1790
},
{
"epoch": 0.9668322814556197,
"grad_norm": 494.5524597167969,
"learning_rate": 2.8153800070020444e-12,
"loss": 39.8419,
"step": 1800
},
{
"epoch": 0.972203571908151,
"grad_norm": 500.6388244628906,
"learning_rate": 1.9685230976726477e-12,
"loss": 40.3947,
"step": 1810
},
{
"epoch": 0.9775748623606821,
"grad_norm": 467.13702392578125,
"learning_rate": 1.2725321139326896e-12,
"loss": 40.5521,
"step": 1820
},
{
"epoch": 0.9829461528132134,
"grad_norm": 460.44854736328125,
"learning_rate": 7.276178884882412e-13,
"loss": 40.5297,
"step": 1830
},
{
"epoch": 0.9883174432657446,
"grad_norm": 521.1849365234375,
"learning_rate": 3.3394548919707394e-13,
"loss": 40.9712,
"step": 1840
},
{
"epoch": 0.9936887337182758,
"grad_norm": 527.3604125976562,
"learning_rate": 9.163416906554645e-14,
"loss": 40.1704,
"step": 1850
},
{
"epoch": 0.9990600241708071,
"grad_norm": 505.638427734375,
"learning_rate": 7.573301240570985e-16,
"loss": 41.4237,
"step": 1860
},
{
"epoch": 0.9995971532160601,
"step": 1861,
"total_flos": 0.0,
"train_loss": 40.22760858182174,
"train_runtime": 17617.7876,
"train_samples_per_second": 3.381,
"train_steps_per_second": 0.106
}
],
"logging_steps": 10,
"max_steps": 1861,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}