{
"best_global_step": 1200,
"best_metric": 1.1594480276107788,
"best_model_checkpoint": "/workspace/second_half_run/checkpoint-1200",
"epoch": 0.8658008658008658,
"eval_steps": 300,
"global_step": 1200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007215007215007215,
"grad_norm": 0.20672309398651123,
"learning_rate": 1.0714285714285714e-05,
"loss": 1.2235,
"step": 10
},
{
"epoch": 0.01443001443001443,
"grad_norm": 0.20048637688159943,
"learning_rate": 2.261904761904762e-05,
"loss": 1.1998,
"step": 20
},
{
"epoch": 0.021645021645021644,
"grad_norm": 0.2042124718427658,
"learning_rate": 3.4523809523809526e-05,
"loss": 1.1912,
"step": 30
},
{
"epoch": 0.02886002886002886,
"grad_norm": 0.2033877670764923,
"learning_rate": 4.642857142857143e-05,
"loss": 1.162,
"step": 40
},
{
"epoch": 0.03607503607503607,
"grad_norm": 0.20020800828933716,
"learning_rate": 5.833333333333334e-05,
"loss": 1.1723,
"step": 50
},
{
"epoch": 0.04329004329004329,
"grad_norm": 0.2054547369480133,
"learning_rate": 7.023809523809524e-05,
"loss": 1.1688,
"step": 60
},
{
"epoch": 0.050505050505050504,
"grad_norm": 0.20819878578186035,
"learning_rate": 8.214285714285714e-05,
"loss": 1.1621,
"step": 70
},
{
"epoch": 0.05772005772005772,
"grad_norm": 0.20804323256015778,
"learning_rate": 9.404761904761905e-05,
"loss": 1.169,
"step": 80
},
{
"epoch": 0.06493506493506493,
"grad_norm": 0.2028702348470688,
"learning_rate": 9.999914627107077e-05,
"loss": 1.1745,
"step": 90
},
{
"epoch": 0.07215007215007214,
"grad_norm": 0.20979009568691254,
"learning_rate": 9.999231661456054e-05,
"loss": 1.1829,
"step": 100
},
{
"epoch": 0.07936507936507936,
"grad_norm": 0.20109142363071442,
"learning_rate": 9.997865823444018e-05,
"loss": 1.1669,
"step": 110
},
{
"epoch": 0.08658008658008658,
"grad_norm": 0.21667157113552094,
"learning_rate": 9.995817299638243e-05,
"loss": 1.1771,
"step": 120
},
{
"epoch": 0.09379509379509379,
"grad_norm": 0.21592780947685242,
"learning_rate": 9.99308636985779e-05,
"loss": 1.1784,
"step": 130
},
{
"epoch": 0.10101010101010101,
"grad_norm": 0.21357479691505432,
"learning_rate": 9.989673407135269e-05,
"loss": 1.1827,
"step": 140
},
{
"epoch": 0.10822510822510822,
"grad_norm": 0.220377117395401,
"learning_rate": 9.985578877665905e-05,
"loss": 1.1766,
"step": 150
},
{
"epoch": 0.11544011544011544,
"grad_norm": 0.22202736139297485,
"learning_rate": 9.980803340743843e-05,
"loss": 1.1845,
"step": 160
},
{
"epoch": 0.12265512265512266,
"grad_norm": 0.22033752501010895,
"learning_rate": 9.97534744868576e-05,
"loss": 1.1784,
"step": 170
},
{
"epoch": 0.12987012987012986,
"grad_norm": 0.21248167753219604,
"learning_rate": 9.969211946741755e-05,
"loss": 1.1799,
"step": 180
},
{
"epoch": 0.1370851370851371,
"grad_norm": 0.23226460814476013,
"learning_rate": 9.96239767299355e-05,
"loss": 1.176,
"step": 190
},
{
"epoch": 0.1443001443001443,
"grad_norm": 0.21726985275745392,
"learning_rate": 9.954905558240025e-05,
"loss": 1.1885,
"step": 200
},
{
"epoch": 0.15151515151515152,
"grad_norm": 0.21341446042060852,
"learning_rate": 9.946736625870055e-05,
"loss": 1.1773,
"step": 210
},
{
"epoch": 0.15873015873015872,
"grad_norm": 0.21727606654167175,
"learning_rate": 9.937891991722736e-05,
"loss": 1.18,
"step": 220
},
{
"epoch": 0.16594516594516595,
"grad_norm": 0.2137938290834427,
"learning_rate": 9.928372863934965e-05,
"loss": 1.1852,
"step": 230
},
{
"epoch": 0.17316017316017315,
"grad_norm": 0.20841728150844574,
"learning_rate": 9.918180542776399e-05,
"loss": 1.1768,
"step": 240
},
{
"epoch": 0.18037518037518038,
"grad_norm": 0.2213844656944275,
"learning_rate": 9.907316420471863e-05,
"loss": 1.1858,
"step": 250
},
{
"epoch": 0.18759018759018758,
"grad_norm": 0.221885547041893,
"learning_rate": 9.895781981011169e-05,
"loss": 1.1796,
"step": 260
},
{
"epoch": 0.19480519480519481,
"grad_norm": 0.21279644966125488,
"learning_rate": 9.883578799946409e-05,
"loss": 1.182,
"step": 270
},
{
"epoch": 0.20202020202020202,
"grad_norm": 0.1996086686849594,
"learning_rate": 9.870708544176745e-05,
"loss": 1.1792,
"step": 280
},
{
"epoch": 0.20923520923520925,
"grad_norm": 0.20936301350593567,
"learning_rate": 9.857172971720715e-05,
"loss": 1.1862,
"step": 290
},
{
"epoch": 0.21645021645021645,
"grad_norm": 0.2194942831993103,
"learning_rate": 9.842973931476101e-05,
"loss": 1.1715,
"step": 300
},
{
"epoch": 0.21645021645021645,
"eval_loss": 1.175282597541809,
"eval_runtime": 24.0253,
"eval_samples_per_second": 15.983,
"eval_steps_per_second": 0.499,
"step": 300
},
{
"epoch": 0.22366522366522368,
"grad_norm": 0.20830006897449493,
"learning_rate": 9.828113362967372e-05,
"loss": 1.1837,
"step": 310
},
{
"epoch": 0.23088023088023088,
"grad_norm": 0.19409529864788055,
"learning_rate": 9.812593296080757e-05,
"loss": 1.1749,
"step": 320
},
{
"epoch": 0.23809523809523808,
"grad_norm": 0.21279384195804596,
"learning_rate": 9.796415850786968e-05,
"loss": 1.1752,
"step": 330
},
{
"epoch": 0.2453102453102453,
"grad_norm": 0.20255984365940094,
"learning_rate": 9.779583236851631e-05,
"loss": 1.1772,
"step": 340
},
{
"epoch": 0.25252525252525254,
"grad_norm": 0.20980049669742584,
"learning_rate": 9.76209775353343e-05,
"loss": 1.167,
"step": 350
},
{
"epoch": 0.2597402597402597,
"grad_norm": 0.2113814800977707,
"learning_rate": 9.743961789270047e-05,
"loss": 1.1735,
"step": 360
},
{
"epoch": 0.26695526695526695,
"grad_norm": 0.2063579112291336,
"learning_rate": 9.725177821351907e-05,
"loss": 1.1638,
"step": 370
},
{
"epoch": 0.2741702741702742,
"grad_norm": 0.20841461420059204,
"learning_rate": 9.705748415583797e-05,
"loss": 1.172,
"step": 380
},
{
"epoch": 0.2813852813852814,
"grad_norm": 0.21439822018146515,
"learning_rate": 9.685676225934383e-05,
"loss": 1.1929,
"step": 390
},
{
"epoch": 0.2886002886002886,
"grad_norm": 0.2044590413570404,
"learning_rate": 9.664963994173695e-05,
"loss": 1.1688,
"step": 400
},
{
"epoch": 0.2958152958152958,
"grad_norm": 0.21234646439552307,
"learning_rate": 9.643614549498609e-05,
"loss": 1.1809,
"step": 410
},
{
"epoch": 0.30303030303030304,
"grad_norm": 0.2080051600933075,
"learning_rate": 9.621630808146397e-05,
"loss": 1.1667,
"step": 420
},
{
"epoch": 0.31024531024531027,
"grad_norm": 0.20338976383209229,
"learning_rate": 9.599015772996375e-05,
"loss": 1.1836,
"step": 430
},
{
"epoch": 0.31746031746031744,
"grad_norm": 0.20682169497013092,
"learning_rate": 9.57577253315973e-05,
"loss": 1.1732,
"step": 440
},
{
"epoch": 0.3246753246753247,
"grad_norm": 0.2108171582221985,
"learning_rate": 9.551904263557558e-05,
"loss": 1.1886,
"step": 450
},
{
"epoch": 0.3318903318903319,
"grad_norm": 0.21451614797115326,
"learning_rate": 9.527414224487182e-05,
"loss": 1.1756,
"step": 460
},
{
"epoch": 0.33910533910533913,
"grad_norm": 0.2017257660627365,
"learning_rate": 9.502305761176818e-05,
"loss": 1.1884,
"step": 470
},
{
"epoch": 0.3463203463203463,
"grad_norm": 0.19700396060943604,
"learning_rate": 9.476582303328626e-05,
"loss": 1.1786,
"step": 480
},
{
"epoch": 0.35353535353535354,
"grad_norm": 0.22976048290729523,
"learning_rate": 9.450247364650227e-05,
"loss": 1.1682,
"step": 490
},
{
"epoch": 0.36075036075036077,
"grad_norm": 0.19582359492778778,
"learning_rate": 9.423304542374749e-05,
"loss": 1.1799,
"step": 500
},
{
"epoch": 0.36796536796536794,
"grad_norm": 0.21123509109020233,
"learning_rate": 9.395757516769464e-05,
"loss": 1.1892,
"step": 510
},
{
"epoch": 0.37518037518037517,
"grad_norm": 0.2202620804309845,
"learning_rate": 9.367610050633075e-05,
"loss": 1.1847,
"step": 520
},
{
"epoch": 0.3823953823953824,
"grad_norm": 0.20339728891849518,
"learning_rate": 9.338865988781736e-05,
"loss": 1.1823,
"step": 530
},
{
"epoch": 0.38961038961038963,
"grad_norm": 0.2076507806777954,
"learning_rate": 9.309529257523872e-05,
"loss": 1.1622,
"step": 540
},
{
"epoch": 0.3968253968253968,
"grad_norm": 0.21566936373710632,
"learning_rate": 9.279603864123858e-05,
"loss": 1.192,
"step": 550
},
{
"epoch": 0.40404040404040403,
"grad_norm": 0.21168456971645355,
"learning_rate": 9.249093896254643e-05,
"loss": 1.1725,
"step": 560
},
{
"epoch": 0.41125541125541126,
"grad_norm": 0.2162477672100067,
"learning_rate": 9.218003521439404e-05,
"loss": 1.1688,
"step": 570
},
{
"epoch": 0.4184704184704185,
"grad_norm": 0.20714333653450012,
"learning_rate": 9.186336986482267e-05,
"loss": 1.1738,
"step": 580
},
{
"epoch": 0.42568542568542567,
"grad_norm": 0.1988476812839508,
"learning_rate": 9.154098616888219e-05,
"loss": 1.168,
"step": 590
},
{
"epoch": 0.4329004329004329,
"grad_norm": 0.21605175733566284,
"learning_rate": 9.121292816272267e-05,
"loss": 1.1653,
"step": 600
},
{
"epoch": 0.4329004329004329,
"eval_loss": 1.1736949682235718,
"eval_runtime": 21.5888,
"eval_samples_per_second": 17.787,
"eval_steps_per_second": 0.556,
"step": 600
},
{
"epoch": 0.4401154401154401,
"grad_norm": 0.21150672435760498,
"learning_rate": 9.087924065757919e-05,
"loss": 1.1744,
"step": 610
},
{
"epoch": 0.44733044733044736,
"grad_norm": 0.1942724734544754,
"learning_rate": 9.053996923365084e-05,
"loss": 1.1798,
"step": 620
},
{
"epoch": 0.45454545454545453,
"grad_norm": 0.20731911063194275,
"learning_rate": 9.019516023387473e-05,
"loss": 1.1738,
"step": 630
},
{
"epoch": 0.46176046176046176,
"grad_norm": 0.20489269495010376,
"learning_rate": 8.98448607575956e-05,
"loss": 1.1714,
"step": 640
},
{
"epoch": 0.468975468975469,
"grad_norm": 0.20433245599269867,
"learning_rate": 8.948911865413248e-05,
"loss": 1.1719,
"step": 650
},
{
"epoch": 0.47619047619047616,
"grad_norm": 0.21800757944583893,
"learning_rate": 8.912798251624251e-05,
"loss": 1.1717,
"step": 660
},
{
"epoch": 0.4834054834054834,
"grad_norm": 0.19788488745689392,
"learning_rate": 8.876150167348348e-05,
"loss": 1.1781,
"step": 670
},
{
"epoch": 0.4906204906204906,
"grad_norm": 0.22694332897663116,
"learning_rate": 8.838972618547561e-05,
"loss": 1.1507,
"step": 680
},
{
"epoch": 0.49783549783549785,
"grad_norm": 0.21313263475894928,
"learning_rate": 8.801270683506362e-05,
"loss": 1.1711,
"step": 690
},
{
"epoch": 0.5050505050505051,
"grad_norm": 0.20376580953598022,
"learning_rate": 8.763049512138008e-05,
"loss": 1.1699,
"step": 700
},
{
"epoch": 0.5122655122655123,
"grad_norm": 0.20033618807792664,
"learning_rate": 8.724314325281078e-05,
"loss": 1.1777,
"step": 710
},
{
"epoch": 0.5194805194805194,
"grad_norm": 0.2068224996328354,
"learning_rate": 8.685070413986338e-05,
"loss": 1.1796,
"step": 720
},
{
"epoch": 0.5266955266955267,
"grad_norm": 0.2078939974308014,
"learning_rate": 8.645323138794001e-05,
"loss": 1.1748,
"step": 730
},
{
"epoch": 0.5339105339105339,
"grad_norm": 0.21399526298046112,
"learning_rate": 8.605077929001508e-05,
"loss": 1.1665,
"step": 740
},
{
"epoch": 0.5411255411255411,
"grad_norm": 0.2034529596567154,
"learning_rate": 8.5643402819219e-05,
"loss": 1.1763,
"step": 750
},
{
"epoch": 0.5483405483405484,
"grad_norm": 0.2097165733575821,
"learning_rate": 8.523115762132925e-05,
"loss": 1.1652,
"step": 760
},
{
"epoch": 0.5555555555555556,
"grad_norm": 0.20482751727104187,
"learning_rate": 8.48141000071693e-05,
"loss": 1.1604,
"step": 770
},
{
"epoch": 0.5627705627705628,
"grad_norm": 0.1976885050535202,
"learning_rate": 8.439228694491683e-05,
"loss": 1.1803,
"step": 780
},
{
"epoch": 0.56998556998557,
"grad_norm": 0.21513821184635162,
"learning_rate": 8.39657760523222e-05,
"loss": 1.1726,
"step": 790
},
{
"epoch": 0.5772005772005772,
"grad_norm": 0.20503391325473785,
"learning_rate": 8.353462558883806e-05,
"loss": 1.1788,
"step": 800
},
{
"epoch": 0.5844155844155844,
"grad_norm": 0.2092897891998291,
"learning_rate": 8.309889444766135e-05,
"loss": 1.1754,
"step": 810
},
{
"epoch": 0.5916305916305916,
"grad_norm": 0.21620312333106995,
"learning_rate": 8.265864214768883e-05,
"loss": 1.1653,
"step": 820
},
{
"epoch": 0.5988455988455988,
"grad_norm": 0.20629604160785675,
"learning_rate": 8.221392882538708e-05,
"loss": 1.157,
"step": 830
},
{
"epoch": 0.6060606060606061,
"grad_norm": 0.19690963625907898,
"learning_rate": 8.176481522657801e-05,
"loss": 1.1704,
"step": 840
},
{
"epoch": 0.6132756132756133,
"grad_norm": 0.213628888130188,
"learning_rate": 8.131136269814139e-05,
"loss": 1.1799,
"step": 850
},
{
"epoch": 0.6204906204906205,
"grad_norm": 0.2039552927017212,
"learning_rate": 8.085363317963505e-05,
"loss": 1.1731,
"step": 860
},
{
"epoch": 0.6277056277056277,
"grad_norm": 0.2059933841228485,
"learning_rate": 8.039168919483428e-05,
"loss": 1.1563,
"step": 870
},
{
"epoch": 0.6349206349206349,
"grad_norm": 0.19335155189037323,
"learning_rate": 7.992559384319137e-05,
"loss": 1.1685,
"step": 880
},
{
"epoch": 0.6421356421356421,
"grad_norm": 0.20764502882957458,
"learning_rate": 7.945541079121641e-05,
"loss": 1.162,
"step": 890
},
{
"epoch": 0.6493506493506493,
"grad_norm": 0.20053254067897797,
"learning_rate": 7.898120426378088e-05,
"loss": 1.184,
"step": 900
},
{
"epoch": 0.6493506493506493,
"eval_loss": 1.1676597595214844,
"eval_runtime": 21.5481,
"eval_samples_per_second": 17.821,
"eval_steps_per_second": 0.557,
"step": 900
},
{
"epoch": 0.6565656565656566,
"grad_norm": 0.20835046470165253,
"learning_rate": 7.850303903534473e-05,
"loss": 1.1692,
"step": 910
},
{
"epoch": 0.6637806637806638,
"grad_norm": 0.20102348923683167,
"learning_rate": 7.802098042110846e-05,
"loss": 1.1531,
"step": 920
},
{
"epoch": 0.670995670995671,
"grad_norm": 0.21355856955051422,
"learning_rate": 7.753509426809147e-05,
"loss": 1.1632,
"step": 930
},
{
"epoch": 0.6782106782106783,
"grad_norm": 0.2094719111919403,
"learning_rate": 7.704544694613755e-05,
"loss": 1.1534,
"step": 940
},
{
"epoch": 0.6854256854256854,
"grad_norm": 0.21322083473205566,
"learning_rate": 7.655210533884912e-05,
"loss": 1.1697,
"step": 950
},
{
"epoch": 0.6926406926406926,
"grad_norm": 0.20414821803569794,
"learning_rate": 7.605513683445118e-05,
"loss": 1.1677,
"step": 960
},
{
"epoch": 0.6998556998556998,
"grad_norm": 0.20397794246673584,
"learning_rate": 7.555460931658647e-05,
"loss": 1.1581,
"step": 970
},
{
"epoch": 0.7070707070707071,
"grad_norm": 0.21406404674053192,
"learning_rate": 7.505059115504279e-05,
"loss": 1.1696,
"step": 980
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.1953253448009491,
"learning_rate": 7.454315119641403e-05,
"loss": 1.1626,
"step": 990
},
{
"epoch": 0.7215007215007215,
"grad_norm": 0.2156262844800949,
"learning_rate": 7.403235875469603e-05,
"loss": 1.1674,
"step": 1000
},
{
"epoch": 0.7287157287157288,
"grad_norm": 0.19182373583316803,
"learning_rate": 7.351828360181862e-05,
"loss": 1.1735,
"step": 1010
},
{
"epoch": 0.7359307359307359,
"grad_norm": 0.1975948065519333,
"learning_rate": 7.300099595811506e-05,
"loss": 1.167,
"step": 1020
},
{
"epoch": 0.7431457431457431,
"grad_norm": 0.19967731833457947,
"learning_rate": 7.248056648273034e-05,
"loss": 1.1631,
"step": 1030
},
{
"epoch": 0.7503607503607503,
"grad_norm": 0.19803431630134583,
"learning_rate": 7.19570662639693e-05,
"loss": 1.1638,
"step": 1040
},
{
"epoch": 0.7575757575757576,
"grad_norm": 0.20940466225147247,
"learning_rate": 7.14305668095865e-05,
"loss": 1.1676,
"step": 1050
},
{
"epoch": 0.7647907647907648,
"grad_norm": 0.20945611596107483,
"learning_rate": 7.090114003701838e-05,
"loss": 1.1798,
"step": 1060
},
{
"epoch": 0.772005772005772,
"grad_norm": 0.20469647645950317,
"learning_rate": 7.03688582635598e-05,
"loss": 1.1502,
"step": 1070
},
{
"epoch": 0.7792207792207793,
"grad_norm": 0.19986766576766968,
"learning_rate": 6.983379419648586e-05,
"loss": 1.159,
"step": 1080
},
{
"epoch": 0.7864357864357865,
"grad_norm": 0.20784717798233032,
"learning_rate": 6.929602092312023e-05,
"loss": 1.1593,
"step": 1090
},
{
"epoch": 0.7936507936507936,
"grad_norm": 0.20664818584918976,
"learning_rate": 6.87556119008519e-05,
"loss": 1.17,
"step": 1100
},
{
"epoch": 0.8008658008658008,
"grad_norm": 0.20311331748962402,
"learning_rate": 6.821264094710125e-05,
"loss": 1.1516,
"step": 1110
},
{
"epoch": 0.8080808080808081,
"grad_norm": 0.20795664191246033,
"learning_rate": 6.76671822292368e-05,
"loss": 1.1488,
"step": 1120
},
{
"epoch": 0.8152958152958153,
"grad_norm": 0.1966182142496109,
"learning_rate": 6.711931025444444e-05,
"loss": 1.1633,
"step": 1130
},
{
"epoch": 0.8225108225108225,
"grad_norm": 0.22438089549541473,
"learning_rate": 6.656909985954994e-05,
"loss": 1.1661,
"step": 1140
},
{
"epoch": 0.8297258297258298,
"grad_norm": 0.2064308524131775,
"learning_rate": 6.601662620079669e-05,
"loss": 1.1622,
"step": 1150
},
{
"epoch": 0.836940836940837,
"grad_norm": 0.20877982676029205,
"learning_rate": 6.546196474357961e-05,
"loss": 1.1759,
"step": 1160
},
{
"epoch": 0.8441558441558441,
"grad_norm": 0.21362321078777313,
"learning_rate": 6.490519125213701e-05,
"loss": 1.161,
"step": 1170
},
{
"epoch": 0.8513708513708513,
"grad_norm": 0.20032328367233276,
"learning_rate": 6.434638177920144e-05,
"loss": 1.1633,
"step": 1180
},
{
"epoch": 0.8585858585858586,
"grad_norm": 0.21117402613162994,
"learning_rate": 6.378561265561134e-05,
"loss": 1.1693,
"step": 1190
},
{
"epoch": 0.8658008658008658,
"grad_norm": 0.19220539927482605,
"learning_rate": 6.32229604798845e-05,
"loss": 1.1766,
"step": 1200
},
{
"epoch": 0.8658008658008658,
"eval_loss": 1.1594480276107788,
"eval_runtime": 21.5758,
"eval_samples_per_second": 17.798,
"eval_steps_per_second": 0.556,
"step": 1200
}
],
"logging_steps": 10,
"max_steps": 2772,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.1659221988131471e+19,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}