SWE_Next_14B / trainer_state.json
lllqaq's picture
Upload SWE_Next_14B SFT model
bed2609 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 1232,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.016233766233766232,
"grad_norm": 64.73149845425546,
"learning_rate": 7.258064516129033e-07,
"loss": 3.3198,
"step": 10
},
{
"epoch": 0.032467532467532464,
"grad_norm": 33.62473079554,
"learning_rate": 1.5322580645161292e-06,
"loss": 3.0328,
"step": 20
},
{
"epoch": 0.048701298701298704,
"grad_norm": 10.429545095678938,
"learning_rate": 2.338709677419355e-06,
"loss": 1.8054,
"step": 30
},
{
"epoch": 0.06493506493506493,
"grad_norm": 2.1555639207187185,
"learning_rate": 3.145161290322581e-06,
"loss": 1.2848,
"step": 40
},
{
"epoch": 0.08116883116883117,
"grad_norm": 2.109927665115888,
"learning_rate": 3.951612903225807e-06,
"loss": 0.9412,
"step": 50
},
{
"epoch": 0.09740259740259741,
"grad_norm": 3.835176262683339,
"learning_rate": 4.758064516129033e-06,
"loss": 0.9246,
"step": 60
},
{
"epoch": 0.11363636363636363,
"grad_norm": 2.3665002147902174,
"learning_rate": 5.564516129032258e-06,
"loss": 0.6407,
"step": 70
},
{
"epoch": 0.12987012987012986,
"grad_norm": 2.0758395514553736,
"learning_rate": 6.370967741935485e-06,
"loss": 0.4818,
"step": 80
},
{
"epoch": 0.1461038961038961,
"grad_norm": 1.3197745285431102,
"learning_rate": 7.177419354838711e-06,
"loss": 0.3839,
"step": 90
},
{
"epoch": 0.16233766233766234,
"grad_norm": 1.167725739422115,
"learning_rate": 7.983870967741935e-06,
"loss": 0.3852,
"step": 100
},
{
"epoch": 0.17857142857142858,
"grad_norm": 1.4271468826660396,
"learning_rate": 8.790322580645163e-06,
"loss": 0.4254,
"step": 110
},
{
"epoch": 0.19480519480519481,
"grad_norm": 1.3041783516757963,
"learning_rate": 9.596774193548389e-06,
"loss": 0.3735,
"step": 120
},
{
"epoch": 0.21103896103896103,
"grad_norm": 1.6445704066341653,
"learning_rate": 9.999497549864013e-06,
"loss": 0.3139,
"step": 130
},
{
"epoch": 0.22727272727272727,
"grad_norm": 2.26609564584476,
"learning_rate": 9.995478554650548e-06,
"loss": 0.3639,
"step": 140
},
{
"epoch": 0.2435064935064935,
"grad_norm": 1.3241916296568497,
"learning_rate": 9.987443795012786e-06,
"loss": 0.3085,
"step": 150
},
{
"epoch": 0.2597402597402597,
"grad_norm": 1.8795485094172018,
"learning_rate": 9.975399729931894e-06,
"loss": 0.3401,
"step": 160
},
{
"epoch": 0.275974025974026,
"grad_norm": 1.2518399347916536,
"learning_rate": 9.959356041388799e-06,
"loss": 0.3232,
"step": 170
},
{
"epoch": 0.2922077922077922,
"grad_norm": 1.3254250765286968,
"learning_rate": 9.939325626581032e-06,
"loss": 0.3167,
"step": 180
},
{
"epoch": 0.30844155844155846,
"grad_norm": 1.3464203708686748,
"learning_rate": 9.915324587554933e-06,
"loss": 0.2968,
"step": 190
},
{
"epoch": 0.3246753246753247,
"grad_norm": 0.9391334903099222,
"learning_rate": 9.887372218261547e-06,
"loss": 0.2933,
"step": 200
},
{
"epoch": 0.3409090909090909,
"grad_norm": 1.3224980977395502,
"learning_rate": 9.8554909890466e-06,
"loss": 0.3062,
"step": 210
},
{
"epoch": 0.35714285714285715,
"grad_norm": 2.727455179143666,
"learning_rate": 9.819706528587036e-06,
"loss": 0.2979,
"step": 220
},
{
"epoch": 0.37337662337662336,
"grad_norm": 1.5019191545607462,
"learning_rate": 9.780047603288656e-06,
"loss": 0.2838,
"step": 230
},
{
"epoch": 0.38961038961038963,
"grad_norm": 1.6879557721862124,
"learning_rate": 9.736546094161375e-06,
"loss": 0.2995,
"step": 240
},
{
"epoch": 0.40584415584415584,
"grad_norm": 1.4809220185905931,
"learning_rate": 9.689236971190715e-06,
"loss": 0.2975,
"step": 250
},
{
"epoch": 0.42207792207792205,
"grad_norm": 1.9452778336930967,
"learning_rate": 9.638158265226155e-06,
"loss": 0.2862,
"step": 260
},
{
"epoch": 0.4383116883116883,
"grad_norm": 2.1580249714158084,
"learning_rate": 9.583351037408886e-06,
"loss": 0.2805,
"step": 270
},
{
"epoch": 0.45454545454545453,
"grad_norm": 2.2778945637898516,
"learning_rate": 9.52485934616359e-06,
"loss": 0.2783,
"step": 280
},
{
"epoch": 0.4707792207792208,
"grad_norm": 1.3731917008379857,
"learning_rate": 9.46273021178077e-06,
"loss": 0.2539,
"step": 290
},
{
"epoch": 0.487012987012987,
"grad_norm": 0.9640448637905654,
"learning_rate": 9.397013578618073e-06,
"loss": 0.2732,
"step": 300
},
{
"epoch": 0.5032467532467533,
"grad_norm": 1.879194030041846,
"learning_rate": 9.327762274951042e-06,
"loss": 0.2789,
"step": 310
},
{
"epoch": 0.5194805194805194,
"grad_norm": 2.0274067648187875,
"learning_rate": 9.255031970505518e-06,
"loss": 0.2995,
"step": 320
},
{
"epoch": 0.5357142857142857,
"grad_norm": 1.1439713018352973,
"learning_rate": 9.178881131705882e-06,
"loss": 0.2626,
"step": 330
},
{
"epoch": 0.551948051948052,
"grad_norm": 1.217033674995871,
"learning_rate": 9.099370974675074e-06,
"loss": 0.2437,
"step": 340
},
{
"epoch": 0.5681818181818182,
"grad_norm": 1.1243152390495403,
"learning_rate": 9.016565416024181e-06,
"loss": 0.2676,
"step": 350
},
{
"epoch": 0.5844155844155844,
"grad_norm": 1.4806002470901694,
"learning_rate": 8.930531021471167e-06,
"loss": 0.2656,
"step": 360
},
{
"epoch": 0.6006493506493507,
"grad_norm": 1.1288154615791715,
"learning_rate": 8.841336952330033e-06,
"loss": 0.2906,
"step": 370
},
{
"epoch": 0.6168831168831169,
"grad_norm": 1.492508732730717,
"learning_rate": 8.749054909913439e-06,
"loss": 0.2576,
"step": 380
},
{
"epoch": 0.6331168831168831,
"grad_norm": 1.1458749024369619,
"learning_rate": 8.653759077893453e-06,
"loss": 0.264,
"step": 390
},
{
"epoch": 0.6493506493506493,
"grad_norm": 0.9392039887042416,
"learning_rate": 8.555526062666803e-06,
"loss": 0.2606,
"step": 400
},
{
"epoch": 0.6655844155844156,
"grad_norm": 1.891912924932221,
"learning_rate": 8.454434831772544e-06,
"loss": 0.2685,
"step": 410
},
{
"epoch": 0.6818181818181818,
"grad_norm": 1.313388756424559,
"learning_rate": 8.350566650411633e-06,
"loss": 0.2611,
"step": 420
},
{
"epoch": 0.698051948051948,
"grad_norm": 1.0777638252230697,
"learning_rate": 8.244005016119482e-06,
"loss": 0.2475,
"step": 430
},
{
"epoch": 0.7142857142857143,
"grad_norm": 1.0188317103197562,
"learning_rate": 8.13483559164398e-06,
"loss": 0.2855,
"step": 440
},
{
"epoch": 0.7305194805194806,
"grad_norm": 0.8703924340642463,
"learning_rate": 8.02314613608292e-06,
"loss": 0.2518,
"step": 450
},
{
"epoch": 0.7467532467532467,
"grad_norm": 1.0173301268541866,
"learning_rate": 7.909026434336252e-06,
"loss": 0.2696,
"step": 460
},
{
"epoch": 0.762987012987013,
"grad_norm": 1.2218157167293324,
"learning_rate": 7.792568224929797e-06,
"loss": 0.2612,
"step": 470
},
{
"epoch": 0.7792207792207793,
"grad_norm": 1.1796113115060929,
"learning_rate": 7.673865126268506e-06,
"loss": 0.2506,
"step": 480
},
{
"epoch": 0.7954545454545454,
"grad_norm": 1.376127532504407,
"learning_rate": 7.55301256137851e-06,
"loss": 0.2459,
"step": 490
},
{
"epoch": 0.8116883116883117,
"grad_norm": 0.8520527207533138,
"learning_rate": 7.430107681198477e-06,
"loss": 0.2296,
"step": 500
},
{
"epoch": 0.827922077922078,
"grad_norm": 1.6057696993562458,
"learning_rate": 7.305249286481928e-06,
"loss": 0.2707,
"step": 510
},
{
"epoch": 0.8441558441558441,
"grad_norm": 1.1118264521022472,
"learning_rate": 7.1785377483733045e-06,
"loss": 0.2453,
"step": 520
},
{
"epoch": 0.8603896103896104,
"grad_norm": 0.986033743735511,
"learning_rate": 7.050074927721639e-06,
"loss": 0.2653,
"step": 530
},
{
"epoch": 0.8766233766233766,
"grad_norm": 1.1047991137937976,
"learning_rate": 6.9199640931966615e-06,
"loss": 0.2401,
"step": 540
},
{
"epoch": 0.8928571428571429,
"grad_norm": 0.8182265905464219,
"learning_rate": 6.788309838273211e-06,
"loss": 0.2453,
"step": 550
},
{
"epoch": 0.9090909090909091,
"grad_norm": 1.0242844709669037,
"learning_rate": 6.655217997150642e-06,
"loss": 0.2562,
"step": 560
},
{
"epoch": 0.9253246753246753,
"grad_norm": 1.7261090545065465,
"learning_rate": 6.520795559674851e-06,
"loss": 0.2618,
"step": 570
},
{
"epoch": 0.9415584415584416,
"grad_norm": 1.166838903170947,
"learning_rate": 6.385150585331299e-06,
"loss": 0.2445,
"step": 580
},
{
"epoch": 0.9577922077922078,
"grad_norm": 1.0102156282992951,
"learning_rate": 6.248392116378167e-06,
"loss": 0.2381,
"step": 590
},
{
"epoch": 0.974025974025974,
"grad_norm": 1.2728489163523269,
"learning_rate": 6.110630090189493e-06,
"loss": 0.2495,
"step": 600
},
{
"epoch": 0.9902597402597403,
"grad_norm": 1.0402345685218206,
"learning_rate": 5.971975250878722e-06,
"loss": 0.2607,
"step": 610
},
{
"epoch": 1.0064935064935066,
"grad_norm": 0.9570526081816403,
"learning_rate": 5.832539060273763e-06,
"loss": 0.2594,
"step": 620
},
{
"epoch": 1.0227272727272727,
"grad_norm": 0.8166533778024863,
"learning_rate": 5.692433608315059e-06,
"loss": 0.1734,
"step": 630
},
{
"epoch": 1.0389610389610389,
"grad_norm": 0.8521218142738738,
"learning_rate": 5.5517715229487554e-06,
"loss": 0.1661,
"step": 640
},
{
"epoch": 1.0551948051948052,
"grad_norm": 2.0344524322648225,
"learning_rate": 5.410665879587366e-06,
"loss": 0.1773,
"step": 650
},
{
"epoch": 1.0714285714285714,
"grad_norm": 1.0760586346052228,
"learning_rate": 5.269230110210725e-06,
"loss": 0.1832,
"step": 660
},
{
"epoch": 1.0876623376623376,
"grad_norm": 0.6221075887847101,
"learning_rate": 5.127577912180312e-06,
"loss": 0.171,
"step": 670
},
{
"epoch": 1.103896103896104,
"grad_norm": 2.4289309131257037,
"learning_rate": 4.9858231568402325e-06,
"loss": 0.1869,
"step": 680
},
{
"epoch": 1.12012987012987,
"grad_norm": 0.7117445693553235,
"learning_rate": 4.844079797978345e-06,
"loss": 0.1715,
"step": 690
},
{
"epoch": 1.1363636363636362,
"grad_norm": 2.0791195295672558,
"learning_rate": 4.7024617802211105e-06,
"loss": 0.1918,
"step": 700
},
{
"epoch": 1.1525974025974026,
"grad_norm": 1.3384619147891226,
"learning_rate": 4.5610829474358056e-06,
"loss": 0.1849,
"step": 710
},
{
"epoch": 1.1688311688311688,
"grad_norm": 0.7431163309985721,
"learning_rate": 4.420056951213726e-06,
"loss": 0.1706,
"step": 720
},
{
"epoch": 1.1850649350649352,
"grad_norm": 1.0917320780107198,
"learning_rate": 4.279497159507984e-06,
"loss": 0.1774,
"step": 730
},
{
"epoch": 1.2012987012987013,
"grad_norm": 0.6433212768530576,
"learning_rate": 4.139516565499277e-06,
"loss": 0.1725,
"step": 740
},
{
"epoch": 1.2175324675324675,
"grad_norm": 1.0367698466098962,
"learning_rate": 4.000227696762967e-06,
"loss": 0.2098,
"step": 750
},
{
"epoch": 1.2337662337662338,
"grad_norm": 1.1673609045846502,
"learning_rate": 3.861742524810421e-06,
"loss": 0.1837,
"step": 760
},
{
"epoch": 1.25,
"grad_norm": 1.036708817011231,
"learning_rate": 3.7241723750773812e-06,
"loss": 0.1819,
"step": 770
},
{
"epoch": 1.2662337662337662,
"grad_norm": 2.085452988223766,
"learning_rate": 3.587627837431679e-06,
"loss": 0.168,
"step": 780
},
{
"epoch": 1.2824675324675325,
"grad_norm": 1.1525296924575061,
"learning_rate": 3.4522186772722915e-06,
"loss": 0.1516,
"step": 790
},
{
"epoch": 1.2987012987012987,
"grad_norm": 1.9400938633777363,
"learning_rate": 3.3180537472911334e-06,
"loss": 0.1749,
"step": 800
},
{
"epoch": 1.314935064935065,
"grad_norm": 0.8648790732830897,
"learning_rate": 3.185240899968587e-06,
"loss": 0.1665,
"step": 810
},
{
"epoch": 1.3311688311688312,
"grad_norm": 1.1576768419228283,
"learning_rate": 3.053886900873062e-06,
"loss": 0.1847,
"step": 820
},
{
"epoch": 1.3474025974025974,
"grad_norm": 0.7801672735784758,
"learning_rate": 2.9240973428343135e-06,
"loss": 0.1852,
"step": 830
},
{
"epoch": 1.3636363636363638,
"grad_norm": 0.6679216934793272,
"learning_rate": 2.79597656105949e-06,
"loss": 0.1622,
"step": 840
},
{
"epoch": 1.37987012987013,
"grad_norm": 2.0257365743503892,
"learning_rate": 2.6696275492601726e-06,
"loss": 0.2013,
"step": 850
},
{
"epoch": 1.396103896103896,
"grad_norm": 1.3270716986636226,
"learning_rate": 2.545151876857803e-06,
"loss": 0.1926,
"step": 860
},
{
"epoch": 1.4123376623376624,
"grad_norm": 1.4591313868086215,
"learning_rate": 2.422649607334083e-06,
"loss": 0.1865,
"step": 870
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.718475522026256,
"learning_rate": 2.3022192177919465e-06,
"loss": 0.1704,
"step": 880
},
{
"epoch": 1.4448051948051948,
"grad_norm": 0.8779553819300654,
"learning_rate": 2.1839575197918156e-06,
"loss": 0.1704,
"step": 890
},
{
"epoch": 1.4610389610389611,
"grad_norm": 0.8981166608440325,
"learning_rate": 2.0679595815267395e-06,
"loss": 0.1894,
"step": 900
},
{
"epoch": 1.4772727272727273,
"grad_norm": 1.8819536746716359,
"learning_rate": 1.954318651398977e-06,
"loss": 0.1838,
"step": 910
},
{
"epoch": 1.4935064935064934,
"grad_norm": 1.2305604028991983,
"learning_rate": 1.8431260830595126e-06,
"loss": 0.1667,
"step": 920
},
{
"epoch": 1.5097402597402598,
"grad_norm": 0.8829569169119944,
"learning_rate": 1.7344712619706772e-06,
"loss": 0.1588,
"step": 930
},
{
"epoch": 1.525974025974026,
"grad_norm": 0.7948818738193303,
"learning_rate": 1.6284415335509879e-06,
"loss": 0.1743,
"step": 940
},
{
"epoch": 1.5422077922077921,
"grad_norm": 2.0055570853229376,
"learning_rate": 1.525122132959933e-06,
"loss": 0.2021,
"step": 950
},
{
"epoch": 1.5584415584415585,
"grad_norm": 1.0617311431240737,
"learning_rate": 1.4245961165791344e-06,
"loss": 0.1842,
"step": 960
},
{
"epoch": 1.5746753246753247,
"grad_norm": 1.1336171320027224,
"learning_rate": 1.326944295245009e-06,
"loss": 0.1679,
"step": 970
},
{
"epoch": 1.5909090909090908,
"grad_norm": 0.8275801739941498,
"learning_rate": 1.2322451692865617e-06,
"loss": 0.1649,
"step": 980
},
{
"epoch": 1.6071428571428572,
"grad_norm": 1.133730110796439,
"learning_rate": 1.1405748654205566e-06,
"loss": 0.1455,
"step": 990
},
{
"epoch": 1.6233766233766234,
"grad_norm": 1.4124753406859976,
"learning_rate": 1.052007075554789e-06,
"loss": 0.178,
"step": 1000
},
{
"epoch": 1.6396103896103895,
"grad_norm": 1.4093028955797295,
"learning_rate": 9.666129975486394e-07,
"loss": 0.1811,
"step": 1010
},
{
"epoch": 1.655844155844156,
"grad_norm": 1.0258966054963048,
"learning_rate": 8.844612779785583e-07,
"loss": 0.1714,
"step": 1020
},
{
"epoch": 1.672077922077922,
"grad_norm": 0.8268048059208093,
"learning_rate": 8.056179569544642e-07,
"loss": 0.1684,
"step": 1030
},
{
"epoch": 1.6883116883116882,
"grad_norm": 1.1758327197059883,
"learning_rate": 7.301464150314313e-07,
"loss": 0.1578,
"step": 1040
},
{
"epoch": 1.7045454545454546,
"grad_norm": 1.3848141640091782,
"learning_rate": 6.581073222593442e-07,
"loss": 0.1841,
"step": 1050
},
{
"epoch": 1.7207792207792207,
"grad_norm": 1.6039492944313922,
"learning_rate": 5.89558589411463e-07,
"loss": 0.1711,
"step": 1060
},
{
"epoch": 1.737012987012987,
"grad_norm": 0.9846284823379949,
"learning_rate": 5.245553214311283e-07,
"loss": 0.1839,
"step": 1070
},
{
"epoch": 1.7532467532467533,
"grad_norm": 0.8126959112794907,
"learning_rate": 4.6314977313400065e-07,
"loss": 0.1937,
"step": 1080
},
{
"epoch": 1.7694805194805194,
"grad_norm": 0.912662021903635,
"learning_rate": 4.053913072014748e-07,
"loss": 0.1858,
"step": 1090
},
{
"epoch": 1.7857142857142856,
"grad_norm": 2.2200942828573904,
"learning_rate": 3.513263544990153e-07,
"loss": 0.1668,
"step": 1100
},
{
"epoch": 1.801948051948052,
"grad_norm": 1.9601203935539797,
"learning_rate": 3.0099837675131525e-07,
"loss": 0.1825,
"step": 1110
},
{
"epoch": 1.8181818181818183,
"grad_norm": 1.8341781108000885,
"learning_rate": 2.5444783160429975e-07,
"loss": 0.1628,
"step": 1120
},
{
"epoch": 1.8344155844155843,
"grad_norm": 0.9358762565242741,
"learning_rate": 2.1171214010203723e-07,
"loss": 0.1309,
"step": 1130
},
{
"epoch": 1.8506493506493507,
"grad_norm": 0.5034065948539621,
"learning_rate": 1.7282565660471483e-07,
"loss": 0.1579,
"step": 1140
},
{
"epoch": 1.866883116883117,
"grad_norm": 1.2734857167293319,
"learning_rate": 1.3781964117186743e-07,
"loss": 0.1515,
"step": 1150
},
{
"epoch": 1.883116883116883,
"grad_norm": 0.9554751930775003,
"learning_rate": 1.0672223443304042e-07,
"loss": 0.1615,
"step": 1160
},
{
"epoch": 1.8993506493506493,
"grad_norm": 1.2542362308070503,
"learning_rate": 7.955843496610882e-08,
"loss": 0.1533,
"step": 1170
},
{
"epoch": 1.9155844155844157,
"grad_norm": 1.0939753700107246,
"learning_rate": 5.6350079201422655e-08,
"loss": 0.1799,
"step": 1180
},
{
"epoch": 1.9318181818181817,
"grad_norm": 0.853147045662764,
"learning_rate": 3.711582386794421e-08,
"loss": 0.1704,
"step": 1190
},
{
"epoch": 1.948051948051948,
"grad_norm": 1.278737413341965,
"learning_rate": 2.1871130995476665e-08,
"loss": 0.1924,
"step": 1200
},
{
"epoch": 1.9642857142857144,
"grad_norm": 2.050204404171476,
"learning_rate": 1.0628255485052308e-08,
"loss": 0.1678,
"step": 1210
},
{
"epoch": 1.9805194805194806,
"grad_norm": 0.8404109096247463,
"learning_rate": 3.396235257464575e-09,
"loss": 0.1525,
"step": 1220
},
{
"epoch": 1.9967532467532467,
"grad_norm": 0.7586755640395566,
"learning_rate": 1.8088398786586525e-10,
"loss": 0.1676,
"step": 1230
},
{
"epoch": 2.0,
"step": 1232,
"total_flos": 610243285417984.0,
"train_loss": 0.3112858794266721,
"train_runtime": 22231.3385,
"train_samples_per_second": 0.332,
"train_steps_per_second": 0.055
}
],
"logging_steps": 10,
"max_steps": 1232,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 610243285417984.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}