{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1281,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0234192037470726,
"grad_norm": 2.481647447902812,
"learning_rate": 5e-06,
"loss": 0.7124,
"step": 10
},
{
"epoch": 0.0468384074941452,
"grad_norm": 0.7262736099420435,
"learning_rate": 5e-06,
"loss": 0.6153,
"step": 20
},
{
"epoch": 0.0702576112412178,
"grad_norm": 0.7790978795240339,
"learning_rate": 5e-06,
"loss": 0.5859,
"step": 30
},
{
"epoch": 0.0936768149882904,
"grad_norm": 0.6467962423003429,
"learning_rate": 5e-06,
"loss": 0.5711,
"step": 40
},
{
"epoch": 0.117096018735363,
"grad_norm": 0.5143977117058031,
"learning_rate": 5e-06,
"loss": 0.5668,
"step": 50
},
{
"epoch": 0.1405152224824356,
"grad_norm": 0.600469324649081,
"learning_rate": 5e-06,
"loss": 0.559,
"step": 60
},
{
"epoch": 0.16393442622950818,
"grad_norm": 0.5054683279603859,
"learning_rate": 5e-06,
"loss": 0.555,
"step": 70
},
{
"epoch": 0.1873536299765808,
"grad_norm": 0.520518786315232,
"learning_rate": 5e-06,
"loss": 0.548,
"step": 80
},
{
"epoch": 0.2107728337236534,
"grad_norm": 1.008487324396366,
"learning_rate": 5e-06,
"loss": 0.5481,
"step": 90
},
{
"epoch": 0.234192037470726,
"grad_norm": 0.6357300755902857,
"learning_rate": 5e-06,
"loss": 0.5449,
"step": 100
},
{
"epoch": 0.2576112412177986,
"grad_norm": 0.4471586394958172,
"learning_rate": 5e-06,
"loss": 0.541,
"step": 110
},
{
"epoch": 0.2810304449648712,
"grad_norm": 0.42874334693852684,
"learning_rate": 5e-06,
"loss": 0.5397,
"step": 120
},
{
"epoch": 0.3044496487119438,
"grad_norm": 0.5493546613118618,
"learning_rate": 5e-06,
"loss": 0.5357,
"step": 130
},
{
"epoch": 0.32786885245901637,
"grad_norm": 0.7309908680131434,
"learning_rate": 5e-06,
"loss": 0.535,
"step": 140
},
{
"epoch": 0.351288056206089,
"grad_norm": 0.5231239547569523,
"learning_rate": 5e-06,
"loss": 0.5373,
"step": 150
},
{
"epoch": 0.3747072599531616,
"grad_norm": 0.47995952741525777,
"learning_rate": 5e-06,
"loss": 0.5346,
"step": 160
},
{
"epoch": 0.3981264637002342,
"grad_norm": 0.562861554643795,
"learning_rate": 5e-06,
"loss": 0.5312,
"step": 170
},
{
"epoch": 0.4215456674473068,
"grad_norm": 0.6323157343840412,
"learning_rate": 5e-06,
"loss": 0.5332,
"step": 180
},
{
"epoch": 0.4449648711943794,
"grad_norm": 0.6022835533604722,
"learning_rate": 5e-06,
"loss": 0.5368,
"step": 190
},
{
"epoch": 0.468384074941452,
"grad_norm": 0.5241376216453505,
"learning_rate": 5e-06,
"loss": 0.5348,
"step": 200
},
{
"epoch": 0.4918032786885246,
"grad_norm": 0.4560855929707371,
"learning_rate": 5e-06,
"loss": 0.5298,
"step": 210
},
{
"epoch": 0.5152224824355972,
"grad_norm": 0.4775160386802804,
"learning_rate": 5e-06,
"loss": 0.527,
"step": 220
},
{
"epoch": 0.5386416861826698,
"grad_norm": 0.5062169069603658,
"learning_rate": 5e-06,
"loss": 0.5295,
"step": 230
},
{
"epoch": 0.5620608899297423,
"grad_norm": 0.43311731742710935,
"learning_rate": 5e-06,
"loss": 0.5264,
"step": 240
},
{
"epoch": 0.585480093676815,
"grad_norm": 0.4911544553586758,
"learning_rate": 5e-06,
"loss": 0.5292,
"step": 250
},
{
"epoch": 0.6088992974238876,
"grad_norm": 0.5749030738074241,
"learning_rate": 5e-06,
"loss": 0.5253,
"step": 260
},
{
"epoch": 0.6323185011709602,
"grad_norm": 0.5580576917700846,
"learning_rate": 5e-06,
"loss": 0.5238,
"step": 270
},
{
"epoch": 0.6557377049180327,
"grad_norm": 0.4760038513178861,
"learning_rate": 5e-06,
"loss": 0.522,
"step": 280
},
{
"epoch": 0.6791569086651054,
"grad_norm": 0.5216361788356643,
"learning_rate": 5e-06,
"loss": 0.5243,
"step": 290
},
{
"epoch": 0.702576112412178,
"grad_norm": 0.46276681576162904,
"learning_rate": 5e-06,
"loss": 0.5217,
"step": 300
},
{
"epoch": 0.7259953161592506,
"grad_norm": 0.4747845254798478,
"learning_rate": 5e-06,
"loss": 0.5226,
"step": 310
},
{
"epoch": 0.7494145199063232,
"grad_norm": 0.5960080896982851,
"learning_rate": 5e-06,
"loss": 0.521,
"step": 320
},
{
"epoch": 0.7728337236533958,
"grad_norm": 0.5212901742846952,
"learning_rate": 5e-06,
"loss": 0.5201,
"step": 330
},
{
"epoch": 0.7962529274004684,
"grad_norm": 0.5133380400247927,
"learning_rate": 5e-06,
"loss": 0.5199,
"step": 340
},
{
"epoch": 0.819672131147541,
"grad_norm": 0.4368489322167074,
"learning_rate": 5e-06,
"loss": 0.5204,
"step": 350
},
{
"epoch": 0.8430913348946136,
"grad_norm": 0.5101656467162784,
"learning_rate": 5e-06,
"loss": 0.5221,
"step": 360
},
{
"epoch": 0.8665105386416861,
"grad_norm": 0.5226647593757339,
"learning_rate": 5e-06,
"loss": 0.519,
"step": 370
},
{
"epoch": 0.8899297423887588,
"grad_norm": 0.4265472443674912,
"learning_rate": 5e-06,
"loss": 0.5193,
"step": 380
},
{
"epoch": 0.9133489461358314,
"grad_norm": 0.42184387318773725,
"learning_rate": 5e-06,
"loss": 0.5178,
"step": 390
},
{
"epoch": 0.936768149882904,
"grad_norm": 0.4773759814767807,
"learning_rate": 5e-06,
"loss": 0.5201,
"step": 400
},
{
"epoch": 0.9601873536299765,
"grad_norm": 0.5049194555570107,
"learning_rate": 5e-06,
"loss": 0.5146,
"step": 410
},
{
"epoch": 0.9836065573770492,
"grad_norm": 0.4836846129062704,
"learning_rate": 5e-06,
"loss": 0.514,
"step": 420
},
{
"epoch": 1.0,
"eval_loss": 0.5167434215545654,
"eval_runtime": 41.6336,
"eval_samples_per_second": 276.291,
"eval_steps_per_second": 1.081,
"step": 427
},
{
"epoch": 1.0070257611241218,
"grad_norm": 0.4520838332937754,
"learning_rate": 5e-06,
"loss": 0.51,
"step": 430
},
{
"epoch": 1.0304449648711944,
"grad_norm": 0.4866921792731558,
"learning_rate": 5e-06,
"loss": 0.4977,
"step": 440
},
{
"epoch": 1.053864168618267,
"grad_norm": 0.46612714801971344,
"learning_rate": 5e-06,
"loss": 0.4974,
"step": 450
},
{
"epoch": 1.0772833723653397,
"grad_norm": 0.47487180548015817,
"learning_rate": 5e-06,
"loss": 0.4947,
"step": 460
},
{
"epoch": 1.100702576112412,
"grad_norm": 0.5023694335040125,
"learning_rate": 5e-06,
"loss": 0.4909,
"step": 470
},
{
"epoch": 1.1241217798594847,
"grad_norm": 0.5257140618717152,
"learning_rate": 5e-06,
"loss": 0.4932,
"step": 480
},
{
"epoch": 1.1475409836065573,
"grad_norm": 0.39736116687044365,
"learning_rate": 5e-06,
"loss": 0.4894,
"step": 490
},
{
"epoch": 1.17096018735363,
"grad_norm": 0.42753669693798274,
"learning_rate": 5e-06,
"loss": 0.4943,
"step": 500
},
{
"epoch": 1.1943793911007026,
"grad_norm": 0.4539073963232022,
"learning_rate": 5e-06,
"loss": 0.497,
"step": 510
},
{
"epoch": 1.2177985948477752,
"grad_norm": 0.40221054966287245,
"learning_rate": 5e-06,
"loss": 0.4916,
"step": 520
},
{
"epoch": 1.2412177985948478,
"grad_norm": 0.40071702689512967,
"learning_rate": 5e-06,
"loss": 0.4904,
"step": 530
},
{
"epoch": 1.2646370023419204,
"grad_norm": 0.4914284242936407,
"learning_rate": 5e-06,
"loss": 0.4941,
"step": 540
},
{
"epoch": 1.288056206088993,
"grad_norm": 0.4409757840452587,
"learning_rate": 5e-06,
"loss": 0.4924,
"step": 550
},
{
"epoch": 1.3114754098360657,
"grad_norm": 0.4137810147419586,
"learning_rate": 5e-06,
"loss": 0.4925,
"step": 560
},
{
"epoch": 1.334894613583138,
"grad_norm": 0.48726970931068914,
"learning_rate": 5e-06,
"loss": 0.4943,
"step": 570
},
{
"epoch": 1.3583138173302107,
"grad_norm": 0.5494057601615131,
"learning_rate": 5e-06,
"loss": 0.4935,
"step": 580
},
{
"epoch": 1.3817330210772834,
"grad_norm": 0.41251177208283474,
"learning_rate": 5e-06,
"loss": 0.4956,
"step": 590
},
{
"epoch": 1.405152224824356,
"grad_norm": 0.39118790995394576,
"learning_rate": 5e-06,
"loss": 0.4883,
"step": 600
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.49321602369191925,
"learning_rate": 5e-06,
"loss": 0.496,
"step": 610
},
{
"epoch": 1.4519906323185012,
"grad_norm": 0.5208838513759695,
"learning_rate": 5e-06,
"loss": 0.4953,
"step": 620
},
{
"epoch": 1.4754098360655736,
"grad_norm": 0.43794255642976027,
"learning_rate": 5e-06,
"loss": 0.4951,
"step": 630
},
{
"epoch": 1.4988290398126463,
"grad_norm": 0.5557497245301519,
"learning_rate": 5e-06,
"loss": 0.4929,
"step": 640
},
{
"epoch": 1.5222482435597189,
"grad_norm": 0.5087889678152296,
"learning_rate": 5e-06,
"loss": 0.4896,
"step": 650
},
{
"epoch": 1.5456674473067915,
"grad_norm": 0.38311143724497726,
"learning_rate": 5e-06,
"loss": 0.4928,
"step": 660
},
{
"epoch": 1.5690866510538641,
"grad_norm": 0.48635422306380777,
"learning_rate": 5e-06,
"loss": 0.488,
"step": 670
},
{
"epoch": 1.5925058548009368,
"grad_norm": 0.6136317892186699,
"learning_rate": 5e-06,
"loss": 0.4949,
"step": 680
},
{
"epoch": 1.6159250585480094,
"grad_norm": 0.47270773640049263,
"learning_rate": 5e-06,
"loss": 0.4883,
"step": 690
},
{
"epoch": 1.639344262295082,
"grad_norm": 0.5783159568227515,
"learning_rate": 5e-06,
"loss": 0.4936,
"step": 700
},
{
"epoch": 1.6627634660421546,
"grad_norm": 0.43336426848256004,
"learning_rate": 5e-06,
"loss": 0.4938,
"step": 710
},
{
"epoch": 1.6861826697892273,
"grad_norm": 0.4314026408601509,
"learning_rate": 5e-06,
"loss": 0.4967,
"step": 720
},
{
"epoch": 1.7096018735362999,
"grad_norm": 0.38409075682155797,
"learning_rate": 5e-06,
"loss": 0.4891,
"step": 730
},
{
"epoch": 1.7330210772833725,
"grad_norm": 0.5211936246593988,
"learning_rate": 5e-06,
"loss": 0.491,
"step": 740
},
{
"epoch": 1.756440281030445,
"grad_norm": 0.46808950576809366,
"learning_rate": 5e-06,
"loss": 0.4949,
"step": 750
},
{
"epoch": 1.7798594847775175,
"grad_norm": 0.3970884421781967,
"learning_rate": 5e-06,
"loss": 0.4864,
"step": 760
},
{
"epoch": 1.8032786885245902,
"grad_norm": 0.4338133573209166,
"learning_rate": 5e-06,
"loss": 0.4877,
"step": 770
},
{
"epoch": 1.8266978922716628,
"grad_norm": 0.5302929743875919,
"learning_rate": 5e-06,
"loss": 0.491,
"step": 780
},
{
"epoch": 1.8501170960187352,
"grad_norm": 0.48309786628518675,
"learning_rate": 5e-06,
"loss": 0.4919,
"step": 790
},
{
"epoch": 1.8735362997658078,
"grad_norm": 0.44165745399512696,
"learning_rate": 5e-06,
"loss": 0.4875,
"step": 800
},
{
"epoch": 1.8969555035128804,
"grad_norm": 0.46223180789120477,
"learning_rate": 5e-06,
"loss": 0.4936,
"step": 810
},
{
"epoch": 1.920374707259953,
"grad_norm": 0.48226738955985293,
"learning_rate": 5e-06,
"loss": 0.4888,
"step": 820
},
{
"epoch": 1.9437939110070257,
"grad_norm": 0.39089405316919673,
"learning_rate": 5e-06,
"loss": 0.4893,
"step": 830
},
{
"epoch": 1.9672131147540983,
"grad_norm": 0.3967255297864678,
"learning_rate": 5e-06,
"loss": 0.4915,
"step": 840
},
{
"epoch": 1.990632318501171,
"grad_norm": 0.4053292002238054,
"learning_rate": 5e-06,
"loss": 0.4902,
"step": 850
},
{
"epoch": 2.0,
"eval_loss": 0.5077147483825684,
"eval_runtime": 41.7501,
"eval_samples_per_second": 275.521,
"eval_steps_per_second": 1.078,
"step": 854
},
{
"epoch": 2.0140515222482436,
"grad_norm": 0.5268697371178085,
"learning_rate": 5e-06,
"loss": 0.4752,
"step": 860
},
{
"epoch": 2.037470725995316,
"grad_norm": 0.5447609017298904,
"learning_rate": 5e-06,
"loss": 0.4669,
"step": 870
},
{
"epoch": 2.060889929742389,
"grad_norm": 0.5637527273857,
"learning_rate": 5e-06,
"loss": 0.4614,
"step": 880
},
{
"epoch": 2.0843091334894615,
"grad_norm": 0.3841529769069848,
"learning_rate": 5e-06,
"loss": 0.4659,
"step": 890
},
{
"epoch": 2.107728337236534,
"grad_norm": 0.42444670935905104,
"learning_rate": 5e-06,
"loss": 0.4593,
"step": 900
},
{
"epoch": 2.1311475409836067,
"grad_norm": 0.4111116772372611,
"learning_rate": 5e-06,
"loss": 0.4677,
"step": 910
},
{
"epoch": 2.1545667447306793,
"grad_norm": 0.6373094219643812,
"learning_rate": 5e-06,
"loss": 0.4652,
"step": 920
},
{
"epoch": 2.177985948477752,
"grad_norm": 0.6160202621538906,
"learning_rate": 5e-06,
"loss": 0.4681,
"step": 930
},
{
"epoch": 2.201405152224824,
"grad_norm": 0.37701921381345616,
"learning_rate": 5e-06,
"loss": 0.4586,
"step": 940
},
{
"epoch": 2.2248243559718968,
"grad_norm": 0.4220733688149805,
"learning_rate": 5e-06,
"loss": 0.4669,
"step": 950
},
{
"epoch": 2.2482435597189694,
"grad_norm": 0.5051333977952606,
"learning_rate": 5e-06,
"loss": 0.4665,
"step": 960
},
{
"epoch": 2.271662763466042,
"grad_norm": 0.45928248167044633,
"learning_rate": 5e-06,
"loss": 0.4665,
"step": 970
},
{
"epoch": 2.2950819672131146,
"grad_norm": 0.4495617426425146,
"learning_rate": 5e-06,
"loss": 0.4662,
"step": 980
},
{
"epoch": 2.3185011709601873,
"grad_norm": 0.4517571833465374,
"learning_rate": 5e-06,
"loss": 0.4646,
"step": 990
},
{
"epoch": 2.34192037470726,
"grad_norm": 0.4138564151461986,
"learning_rate": 5e-06,
"loss": 0.4647,
"step": 1000
},
{
"epoch": 2.3653395784543325,
"grad_norm": 0.5738034318601656,
"learning_rate": 5e-06,
"loss": 0.4659,
"step": 1010
},
{
"epoch": 2.388758782201405,
"grad_norm": 0.631953187344588,
"learning_rate": 5e-06,
"loss": 0.4663,
"step": 1020
},
{
"epoch": 2.4121779859484778,
"grad_norm": 0.46549566538492926,
"learning_rate": 5e-06,
"loss": 0.4676,
"step": 1030
},
{
"epoch": 2.4355971896955504,
"grad_norm": 0.43668516137249047,
"learning_rate": 5e-06,
"loss": 0.4673,
"step": 1040
},
{
"epoch": 2.459016393442623,
"grad_norm": 0.42948375636759983,
"learning_rate": 5e-06,
"loss": 0.4654,
"step": 1050
},
{
"epoch": 2.4824355971896956,
"grad_norm": 0.4391102312189122,
"learning_rate": 5e-06,
"loss": 0.469,
"step": 1060
},
{
"epoch": 2.5058548009367683,
"grad_norm": 0.49842008382902114,
"learning_rate": 5e-06,
"loss": 0.47,
"step": 1070
},
{
"epoch": 2.529274004683841,
"grad_norm": 0.5588818151397613,
"learning_rate": 5e-06,
"loss": 0.4685,
"step": 1080
},
{
"epoch": 2.552693208430913,
"grad_norm": 0.4037726802957124,
"learning_rate": 5e-06,
"loss": 0.4677,
"step": 1090
},
{
"epoch": 2.576112412177986,
"grad_norm": 0.48821405764171133,
"learning_rate": 5e-06,
"loss": 0.4695,
"step": 1100
},
{
"epoch": 2.5995316159250583,
"grad_norm": 0.3858402261553971,
"learning_rate": 5e-06,
"loss": 0.4677,
"step": 1110
},
{
"epoch": 2.6229508196721314,
"grad_norm": 0.4203466943287186,
"learning_rate": 5e-06,
"loss": 0.4696,
"step": 1120
},
{
"epoch": 2.6463700234192036,
"grad_norm": 0.48915413911853883,
"learning_rate": 5e-06,
"loss": 0.4664,
"step": 1130
},
{
"epoch": 2.669789227166276,
"grad_norm": 0.43449855854006547,
"learning_rate": 5e-06,
"loss": 0.4693,
"step": 1140
},
{
"epoch": 2.693208430913349,
"grad_norm": 0.5093894446854779,
"learning_rate": 5e-06,
"loss": 0.4692,
"step": 1150
},
{
"epoch": 2.7166276346604215,
"grad_norm": 0.44982206423576104,
"learning_rate": 5e-06,
"loss": 0.4704,
"step": 1160
},
{
"epoch": 2.740046838407494,
"grad_norm": 0.40753147448920485,
"learning_rate": 5e-06,
"loss": 0.4648,
"step": 1170
},
{
"epoch": 2.7634660421545667,
"grad_norm": 0.36811758162537594,
"learning_rate": 5e-06,
"loss": 0.4629,
"step": 1180
},
{
"epoch": 2.7868852459016393,
"grad_norm": 0.3949349460619723,
"learning_rate": 5e-06,
"loss": 0.4704,
"step": 1190
},
{
"epoch": 2.810304449648712,
"grad_norm": 0.41402013254949793,
"learning_rate": 5e-06,
"loss": 0.4707,
"step": 1200
},
{
"epoch": 2.8337236533957846,
"grad_norm": 0.5432806369874894,
"learning_rate": 5e-06,
"loss": 0.4679,
"step": 1210
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.3788514557857583,
"learning_rate": 5e-06,
"loss": 0.4711,
"step": 1220
},
{
"epoch": 2.88056206088993,
"grad_norm": 0.39636258806401947,
"learning_rate": 5e-06,
"loss": 0.4653,
"step": 1230
},
{
"epoch": 2.9039812646370025,
"grad_norm": 0.3936100271064975,
"learning_rate": 5e-06,
"loss": 0.4667,
"step": 1240
},
{
"epoch": 2.927400468384075,
"grad_norm": 0.4271813759224886,
"learning_rate": 5e-06,
"loss": 0.4664,
"step": 1250
},
{
"epoch": 2.9508196721311473,
"grad_norm": 0.4720023108766376,
"learning_rate": 5e-06,
"loss": 0.468,
"step": 1260
},
{
"epoch": 2.9742388758782203,
"grad_norm": 0.42625818049428776,
"learning_rate": 5e-06,
"loss": 0.4654,
"step": 1270
},
{
"epoch": 2.9976580796252925,
"grad_norm": 0.4282654656901964,
"learning_rate": 5e-06,
"loss": 0.47,
"step": 1280
},
{
"epoch": 3.0,
"eval_loss": 0.5069288611412048,
"eval_runtime": 41.6671,
"eval_samples_per_second": 276.069,
"eval_steps_per_second": 1.08,
"step": 1281
},
{
"epoch": 3.0,
"step": 1281,
"total_flos": 2145722711408640.0,
"train_loss": 0.4994005665799587,
"train_runtime": 8118.1452,
"train_samples_per_second": 80.761,
"train_steps_per_second": 0.158
}
],
"logging_steps": 10,
"max_steps": 1281,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2145722711408640.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}