Llama-3.1-8B-math-reasoning / trainer_state.json
pmahdavi's picture
Upload model with optimizer states
1d4abd6 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9998683627921449,
"eval_steps": 500,
"global_step": 2611,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0038294460466953076,
"grad_norm": 5.426074325401742,
"learning_rate": 5.69620253164557e-07,
"loss": 0.596,
"step": 10
},
{
"epoch": 0.007658892093390615,
"grad_norm": 1.7300049140411797,
"learning_rate": 1.2025316455696204e-06,
"loss": 0.4935,
"step": 20
},
{
"epoch": 0.011488338140085923,
"grad_norm": 1.0566614384820672,
"learning_rate": 1.8354430379746838e-06,
"loss": 0.4179,
"step": 30
},
{
"epoch": 0.01531778418678123,
"grad_norm": 1.1239284474445808,
"learning_rate": 2.4683544303797473e-06,
"loss": 0.3786,
"step": 40
},
{
"epoch": 0.019147230233476538,
"grad_norm": 0.8236458977275282,
"learning_rate": 3.10126582278481e-06,
"loss": 0.3645,
"step": 50
},
{
"epoch": 0.022976676280171845,
"grad_norm": 0.939010895376297,
"learning_rate": 3.7341772151898737e-06,
"loss": 0.3472,
"step": 60
},
{
"epoch": 0.026806122326867153,
"grad_norm": 0.7558771041067948,
"learning_rate": 4.367088607594937e-06,
"loss": 0.3234,
"step": 70
},
{
"epoch": 0.03063556837356246,
"grad_norm": 0.8507599742006083,
"learning_rate": 5e-06,
"loss": 0.3273,
"step": 80
},
{
"epoch": 0.03446501442025777,
"grad_norm": 0.9322867483149297,
"learning_rate": 4.999807568225742e-06,
"loss": 0.3269,
"step": 90
},
{
"epoch": 0.038294460466953076,
"grad_norm": 0.8060180679646447,
"learning_rate": 4.999230302526956e-06,
"loss": 0.338,
"step": 100
},
{
"epoch": 0.04212390651364838,
"grad_norm": 0.8178741447804784,
"learning_rate": 4.998268291771053e-06,
"loss": 0.3232,
"step": 110
},
{
"epoch": 0.04595335256034369,
"grad_norm": 0.8264908445738317,
"learning_rate": 4.9969216840551815e-06,
"loss": 0.3239,
"step": 120
},
{
"epoch": 0.049782798607039,
"grad_norm": 0.8185737508740193,
"learning_rate": 4.995190686683432e-06,
"loss": 0.3164,
"step": 130
},
{
"epoch": 0.053612244653734306,
"grad_norm": 0.8016123974519153,
"learning_rate": 4.9930755661349215e-06,
"loss": 0.3227,
"step": 140
},
{
"epoch": 0.057441690700429614,
"grad_norm": 0.8304543418674754,
"learning_rate": 4.990576648022768e-06,
"loss": 0.3136,
"step": 150
},
{
"epoch": 0.06127113674712492,
"grad_norm": 0.7555621229293721,
"learning_rate": 4.98769431704397e-06,
"loss": 0.3033,
"step": 160
},
{
"epoch": 0.06510058279382024,
"grad_norm": 0.8637646507692361,
"learning_rate": 4.984429016920178e-06,
"loss": 0.3231,
"step": 170
},
{
"epoch": 0.06893002884051554,
"grad_norm": 0.7476252312804286,
"learning_rate": 4.980781250329389e-06,
"loss": 0.309,
"step": 180
},
{
"epoch": 0.07275947488721085,
"grad_norm": 0.8589580099130042,
"learning_rate": 4.976751578828562e-06,
"loss": 0.3122,
"step": 190
},
{
"epoch": 0.07658892093390615,
"grad_norm": 0.8355945240964501,
"learning_rate": 4.9723406227671645e-06,
"loss": 0.3109,
"step": 200
},
{
"epoch": 0.08041836698060147,
"grad_norm": 0.7902874463038752,
"learning_rate": 4.967549061191679e-06,
"loss": 0.3118,
"step": 210
},
{
"epoch": 0.08424781302729677,
"grad_norm": 0.8876153745394134,
"learning_rate": 4.962377631741061e-06,
"loss": 0.306,
"step": 220
},
{
"epoch": 0.08807725907399208,
"grad_norm": 0.866679126237035,
"learning_rate": 4.956827130533185e-06,
"loss": 0.3135,
"step": 230
},
{
"epoch": 0.09190670512068738,
"grad_norm": 0.8629644307536233,
"learning_rate": 4.95089841204229e-06,
"loss": 0.302,
"step": 240
},
{
"epoch": 0.0957361511673827,
"grad_norm": 0.7338565663547403,
"learning_rate": 4.9445923889674285e-06,
"loss": 0.303,
"step": 250
},
{
"epoch": 0.099565597214078,
"grad_norm": 0.7305791719581167,
"learning_rate": 4.937910032091968e-06,
"loss": 0.3009,
"step": 260
},
{
"epoch": 0.10339504326077331,
"grad_norm": 0.6695669065750437,
"learning_rate": 4.9308523701341415e-06,
"loss": 0.305,
"step": 270
},
{
"epoch": 0.10722448930746861,
"grad_norm": 0.7251397159069465,
"learning_rate": 4.923420489588677e-06,
"loss": 0.3005,
"step": 280
},
{
"epoch": 0.11105393535416393,
"grad_norm": 0.7899848638296046,
"learning_rate": 4.915615534559545e-06,
"loss": 0.3036,
"step": 290
},
{
"epoch": 0.11488338140085923,
"grad_norm": 0.8159662777762657,
"learning_rate": 4.907438706583818e-06,
"loss": 0.2997,
"step": 300
},
{
"epoch": 0.11871282744755454,
"grad_norm": 0.7496451104541622,
"learning_rate": 4.898891264446709e-06,
"loss": 0.2984,
"step": 310
},
{
"epoch": 0.12254227349424984,
"grad_norm": 0.7090938182919049,
"learning_rate": 4.889974523987784e-06,
"loss": 0.3037,
"step": 320
},
{
"epoch": 0.12637171954094514,
"grad_norm": 0.7534479964128541,
"learning_rate": 4.880689857898392e-06,
"loss": 0.2907,
"step": 330
},
{
"epoch": 0.13020116558764047,
"grad_norm": 0.7967215720335111,
"learning_rate": 4.871038695510347e-06,
"loss": 0.3035,
"step": 340
},
{
"epoch": 0.13403061163433577,
"grad_norm": 0.8589787634591548,
"learning_rate": 4.861022522575892e-06,
"loss": 0.2917,
"step": 350
},
{
"epoch": 0.13786005768103107,
"grad_norm": 0.77962150007173,
"learning_rate": 4.850642881038969e-06,
"loss": 0.3019,
"step": 360
},
{
"epoch": 0.14168950372772637,
"grad_norm": 0.6913568705438217,
"learning_rate": 4.839901368797849e-06,
"loss": 0.2987,
"step": 370
},
{
"epoch": 0.1455189497744217,
"grad_norm": 0.7245824514430528,
"learning_rate": 4.828799639459139e-06,
"loss": 0.2996,
"step": 380
},
{
"epoch": 0.149348395821117,
"grad_norm": 0.7331355223021702,
"learning_rate": 4.817339402083217e-06,
"loss": 0.2958,
"step": 390
},
{
"epoch": 0.1531778418678123,
"grad_norm": 0.684066179750134,
"learning_rate": 4.805522420921132e-06,
"loss": 0.2923,
"step": 400
},
{
"epoch": 0.1570072879145076,
"grad_norm": 0.7273451737034062,
"learning_rate": 4.793350515143007e-06,
"loss": 0.2955,
"step": 410
},
{
"epoch": 0.16083673396120293,
"grad_norm": 0.6922184014896344,
"learning_rate": 4.780825558557981e-06,
"loss": 0.3021,
"step": 420
},
{
"epoch": 0.16466618000789823,
"grad_norm": 0.7901617539245519,
"learning_rate": 4.767949479325749e-06,
"loss": 0.3004,
"step": 430
},
{
"epoch": 0.16849562605459353,
"grad_norm": 0.7269661753924206,
"learning_rate": 4.754724259659727e-06,
"loss": 0.2966,
"step": 440
},
{
"epoch": 0.17232507210128883,
"grad_norm": 0.7893201123959417,
"learning_rate": 4.741151935521906e-06,
"loss": 0.2985,
"step": 450
},
{
"epoch": 0.17615451814798416,
"grad_norm": 0.6946007030633581,
"learning_rate": 4.727234596309417e-06,
"loss": 0.3036,
"step": 460
},
{
"epoch": 0.17998396419467946,
"grad_norm": 0.7322226198234377,
"learning_rate": 4.71297438453288e-06,
"loss": 0.3001,
"step": 470
},
{
"epoch": 0.18381341024137476,
"grad_norm": 0.6823377208042828,
"learning_rate": 4.69837349548658e-06,
"loss": 0.2925,
"step": 480
},
{
"epoch": 0.1876428562880701,
"grad_norm": 0.7135771256714908,
"learning_rate": 4.683434176910503e-06,
"loss": 0.2939,
"step": 490
},
{
"epoch": 0.1914723023347654,
"grad_norm": 0.691020843307318,
"learning_rate": 4.668158728644315e-06,
"loss": 0.2804,
"step": 500
},
{
"epoch": 0.1953017483814607,
"grad_norm": 0.6612550303473294,
"learning_rate": 4.652549502273305e-06,
"loss": 0.2922,
"step": 510
},
{
"epoch": 0.199131194428156,
"grad_norm": 0.7364276724142655,
"learning_rate": 4.636608900766372e-06,
"loss": 0.2891,
"step": 520
},
{
"epoch": 0.20296064047485132,
"grad_norm": 0.6721354413833053,
"learning_rate": 4.620339378106103e-06,
"loss": 0.2809,
"step": 530
},
{
"epoch": 0.20679008652154662,
"grad_norm": 0.6653652380595567,
"learning_rate": 4.6037434389109855e-06,
"loss": 0.2983,
"step": 540
},
{
"epoch": 0.21061953256824192,
"grad_norm": 0.7449501156668481,
"learning_rate": 4.586823638049841e-06,
"loss": 0.2903,
"step": 550
},
{
"epoch": 0.21444897861493722,
"grad_norm": 0.715508013931221,
"learning_rate": 4.569582580248509e-06,
"loss": 0.2923,
"step": 560
},
{
"epoch": 0.21827842466163255,
"grad_norm": 0.6641963861933619,
"learning_rate": 4.552022919688861e-06,
"loss": 0.2924,
"step": 570
},
{
"epoch": 0.22210787070832785,
"grad_norm": 0.7113518461330477,
"learning_rate": 4.534147359600211e-06,
"loss": 0.2819,
"step": 580
},
{
"epoch": 0.22593731675502315,
"grad_norm": 0.7089839605701392,
"learning_rate": 4.515958651843151e-06,
"loss": 0.2939,
"step": 590
},
{
"epoch": 0.22976676280171845,
"grad_norm": 0.677078858734863,
"learning_rate": 4.497459596485924e-06,
"loss": 0.2835,
"step": 600
},
{
"epoch": 0.23359620884841378,
"grad_norm": 0.7703623199956556,
"learning_rate": 4.478653041373371e-06,
"loss": 0.2854,
"step": 610
},
{
"epoch": 0.23742565489510908,
"grad_norm": 0.6407221872558565,
"learning_rate": 4.459541881688501e-06,
"loss": 0.2872,
"step": 620
},
{
"epoch": 0.24125510094180438,
"grad_norm": 0.7625245912179776,
"learning_rate": 4.440129059506808e-06,
"loss": 0.2852,
"step": 630
},
{
"epoch": 0.24508454698849969,
"grad_norm": 0.6533872161632752,
"learning_rate": 4.420417563343347e-06,
"loss": 0.2883,
"step": 640
},
{
"epoch": 0.248913993035195,
"grad_norm": 0.6347508565680315,
"learning_rate": 4.40041042769266e-06,
"loss": 0.2818,
"step": 650
},
{
"epoch": 0.2527434390818903,
"grad_norm": 0.6956608959261769,
"learning_rate": 4.380110732561636e-06,
"loss": 0.2858,
"step": 660
},
{
"epoch": 0.2565728851285856,
"grad_norm": 0.7174236888577228,
"learning_rate": 4.3595216029953575e-06,
"loss": 0.2948,
"step": 670
},
{
"epoch": 0.26040233117528094,
"grad_norm": 0.6538209955447881,
"learning_rate": 4.338646208596009e-06,
"loss": 0.2901,
"step": 680
},
{
"epoch": 0.2642317772219762,
"grad_norm": 0.6777945072272051,
"learning_rate": 4.317487763034936e-06,
"loss": 0.2848,
"step": 690
},
{
"epoch": 0.26806122326867154,
"grad_norm": 0.6915180680337352,
"learning_rate": 4.296049523557917e-06,
"loss": 0.294,
"step": 700
},
{
"epoch": 0.2718906693153669,
"grad_norm": 0.6811198761407046,
"learning_rate": 4.274334790483718e-06,
"loss": 0.2925,
"step": 710
},
{
"epoch": 0.27572011536206215,
"grad_norm": 0.6682149146681646,
"learning_rate": 4.2523469066960295e-06,
"loss": 0.2832,
"step": 720
},
{
"epoch": 0.2795495614087575,
"grad_norm": 0.6343382096231662,
"learning_rate": 4.230089257128842e-06,
"loss": 0.2865,
"step": 730
},
{
"epoch": 0.28337900745545275,
"grad_norm": 0.7142478024296977,
"learning_rate": 4.207565268245356e-06,
"loss": 0.2852,
"step": 740
},
{
"epoch": 0.2872084535021481,
"grad_norm": 0.6678411720839094,
"learning_rate": 4.184778407510484e-06,
"loss": 0.2924,
"step": 750
},
{
"epoch": 0.2910378995488434,
"grad_norm": 0.7689563352484293,
"learning_rate": 4.16173218285706e-06,
"loss": 0.2901,
"step": 760
},
{
"epoch": 0.2948673455955387,
"grad_norm": 0.749864633773232,
"learning_rate": 4.138430142145805e-06,
"loss": 0.2839,
"step": 770
},
{
"epoch": 0.298696791642234,
"grad_norm": 0.6699595192464245,
"learning_rate": 4.114875872619147e-06,
"loss": 0.2951,
"step": 780
},
{
"epoch": 0.30252623768892933,
"grad_norm": 0.691214596956005,
"learning_rate": 4.091073000348989e-06,
"loss": 0.2874,
"step": 790
},
{
"epoch": 0.3063556837356246,
"grad_norm": 0.6554956219244137,
"learning_rate": 4.067025189678485e-06,
"loss": 0.286,
"step": 800
},
{
"epoch": 0.31018512978231993,
"grad_norm": 0.6954151666813602,
"learning_rate": 4.042736142657936e-06,
"loss": 0.2834,
"step": 810
},
{
"epoch": 0.3140145758290152,
"grad_norm": 0.7196715435903528,
"learning_rate": 4.018209598474869e-06,
"loss": 0.284,
"step": 820
},
{
"epoch": 0.31784402187571054,
"grad_norm": 0.7723328668264622,
"learning_rate": 3.9934493328784185e-06,
"loss": 0.2777,
"step": 830
},
{
"epoch": 0.32167346792240586,
"grad_norm": 0.6919247319317805,
"learning_rate": 3.9684591575980546e-06,
"loss": 0.2893,
"step": 840
},
{
"epoch": 0.32550291396910114,
"grad_norm": 0.612288507127871,
"learning_rate": 3.943242919756792e-06,
"loss": 0.2891,
"step": 850
},
{
"epoch": 0.32933236001579647,
"grad_norm": 0.7304106009933916,
"learning_rate": 3.917804501278942e-06,
"loss": 0.2838,
"step": 860
},
{
"epoch": 0.3331618060624918,
"grad_norm": 0.6961425637816675,
"learning_rate": 3.892147818292505e-06,
"loss": 0.2818,
"step": 870
},
{
"epoch": 0.33699125210918707,
"grad_norm": 0.6415090586497654,
"learning_rate": 3.866276820526305e-06,
"loss": 0.2826,
"step": 880
},
{
"epoch": 0.3408206981558824,
"grad_norm": 0.7319097656364029,
"learning_rate": 3.840195490701943e-06,
"loss": 0.2797,
"step": 890
},
{
"epoch": 0.34465014420257767,
"grad_norm": 0.6643906637801983,
"learning_rate": 3.8139078439206755e-06,
"loss": 0.2823,
"step": 900
},
{
"epoch": 0.348479590249273,
"grad_norm": 0.651442028320178,
"learning_rate": 3.787417927045315e-06,
"loss": 0.2845,
"step": 910
},
{
"epoch": 0.3523090362959683,
"grad_norm": 0.703399820267242,
"learning_rate": 3.760729818077224e-06,
"loss": 0.2782,
"step": 920
},
{
"epoch": 0.3561384823426636,
"grad_norm": 0.6339374061657803,
"learning_rate": 3.7338476255285295e-06,
"loss": 0.2809,
"step": 930
},
{
"epoch": 0.3599679283893589,
"grad_norm": 0.6650804278367294,
"learning_rate": 3.7067754877896388e-06,
"loss": 0.288,
"step": 940
},
{
"epoch": 0.36379737443605425,
"grad_norm": 0.6645625939049019,
"learning_rate": 3.6795175724921506e-06,
"loss": 0.2821,
"step": 950
},
{
"epoch": 0.36762682048274953,
"grad_norm": 0.6819651400048093,
"learning_rate": 3.652078075867267e-06,
"loss": 0.2759,
"step": 960
},
{
"epoch": 0.37145626652944486,
"grad_norm": 0.6767426872168217,
"learning_rate": 3.624461222099804e-06,
"loss": 0.28,
"step": 970
},
{
"epoch": 0.3752857125761402,
"grad_norm": 0.7310278431962209,
"learning_rate": 3.596671262677898e-06,
"loss": 0.2883,
"step": 980
},
{
"epoch": 0.37911515862283546,
"grad_norm": 0.6791506792289325,
"learning_rate": 3.5687124757385084e-06,
"loss": 0.2885,
"step": 990
},
{
"epoch": 0.3829446046695308,
"grad_norm": 0.7086896739862765,
"learning_rate": 3.5405891654088154e-06,
"loss": 0.2815,
"step": 1000
},
{
"epoch": 0.38677405071622606,
"grad_norm": 0.6280354477468107,
"learning_rate": 3.5123056611436224e-06,
"loss": 0.2807,
"step": 1010
},
{
"epoch": 0.3906034967629214,
"grad_norm": 0.6538192887744636,
"learning_rate": 3.4838663170588573e-06,
"loss": 0.2723,
"step": 1020
},
{
"epoch": 0.3944329428096167,
"grad_norm": 0.6999366929435851,
"learning_rate": 3.455275511261272e-06,
"loss": 0.2804,
"step": 1030
},
{
"epoch": 0.398262388856312,
"grad_norm": 0.6406720215436563,
"learning_rate": 3.4265376451744564e-06,
"loss": 0.2776,
"step": 1040
},
{
"epoch": 0.4020918349030073,
"grad_norm": 0.6809060571142534,
"learning_rate": 3.3976571428612583e-06,
"loss": 0.2823,
"step": 1050
},
{
"epoch": 0.40592128094970265,
"grad_norm": 0.6506142391402524,
"learning_rate": 3.3686384503427177e-06,
"loss": 0.2785,
"step": 1060
},
{
"epoch": 0.4097507269963979,
"grad_norm": 0.6623408933951855,
"learning_rate": 3.339486034913627e-06,
"loss": 0.2781,
"step": 1070
},
{
"epoch": 0.41358017304309325,
"grad_norm": 0.675859964827601,
"learning_rate": 3.310204384454805e-06,
"loss": 0.2776,
"step": 1080
},
{
"epoch": 0.4174096190897885,
"grad_norm": 0.7354171342985883,
"learning_rate": 3.280798006742213e-06,
"loss": 0.2929,
"step": 1090
},
{
"epoch": 0.42123906513648385,
"grad_norm": 0.6457062459707484,
"learning_rate": 3.2512714287530007e-06,
"loss": 0.2743,
"step": 1100
},
{
"epoch": 0.4250685111831792,
"grad_norm": 0.641329871481826,
"learning_rate": 3.2216291959686007e-06,
"loss": 0.2737,
"step": 1110
},
{
"epoch": 0.42889795722987445,
"grad_norm": 0.6566421664911357,
"learning_rate": 3.191875871674971e-06,
"loss": 0.2838,
"step": 1120
},
{
"epoch": 0.4327274032765698,
"grad_norm": 0.6027701960697498,
"learning_rate": 3.162016036260098e-06,
"loss": 0.2752,
"step": 1130
},
{
"epoch": 0.4365568493232651,
"grad_norm": 0.648907215118306,
"learning_rate": 3.1320542865088695e-06,
"loss": 0.2667,
"step": 1140
},
{
"epoch": 0.4403862953699604,
"grad_norm": 0.6077801768682932,
"learning_rate": 3.1019952348954163e-06,
"loss": 0.2747,
"step": 1150
},
{
"epoch": 0.4442157414166557,
"grad_norm": 0.6576708995713357,
"learning_rate": 3.071843508873046e-06,
"loss": 0.2836,
"step": 1160
},
{
"epoch": 0.448045187463351,
"grad_norm": 0.695646974082748,
"learning_rate": 3.0416037501618676e-06,
"loss": 0.2732,
"step": 1170
},
{
"epoch": 0.4518746335100463,
"grad_norm": 0.616879255204133,
"learning_rate": 3.0112806140342176e-06,
"loss": 0.2759,
"step": 1180
},
{
"epoch": 0.45570407955674164,
"grad_norm": 0.6008817125153927,
"learning_rate": 2.9808787685980054e-06,
"loss": 0.2769,
"step": 1190
},
{
"epoch": 0.4595335256034369,
"grad_norm": 0.5972779982979275,
"learning_rate": 2.9504028940780777e-06,
"loss": 0.2836,
"step": 1200
},
{
"epoch": 0.46336297165013224,
"grad_norm": 0.6541064777899037,
"learning_rate": 2.9198576820957188e-06,
"loss": 0.2678,
"step": 1210
},
{
"epoch": 0.46719241769682757,
"grad_norm": 0.7461980100750918,
"learning_rate": 2.8892478349463987e-06,
"loss": 0.279,
"step": 1220
},
{
"epoch": 0.47102186374352284,
"grad_norm": 0.697749095404112,
"learning_rate": 2.8585780648758745e-06,
"loss": 0.2774,
"step": 1230
},
{
"epoch": 0.47485130979021817,
"grad_norm": 0.6901917627087478,
"learning_rate": 2.827853093354763e-06,
"loss": 0.2731,
"step": 1240
},
{
"epoch": 0.47868075583691344,
"grad_norm": 0.6167921413072504,
"learning_rate": 2.79707765035169e-06,
"loss": 0.2781,
"step": 1250
},
{
"epoch": 0.48251020188360877,
"grad_norm": 0.6877679962661163,
"learning_rate": 2.7662564736051378e-06,
"loss": 0.2779,
"step": 1260
},
{
"epoch": 0.4863396479303041,
"grad_norm": 0.6529732798686552,
"learning_rate": 2.7353943078940876e-06,
"loss": 0.2755,
"step": 1270
},
{
"epoch": 0.49016909397699937,
"grad_norm": 0.573925656257561,
"learning_rate": 2.7044959043075815e-06,
"loss": 0.2781,
"step": 1280
},
{
"epoch": 0.4939985400236947,
"grad_norm": 0.7060010201146872,
"learning_rate": 2.67356601951332e-06,
"loss": 0.2885,
"step": 1290
},
{
"epoch": 0.49782798607039,
"grad_norm": 0.6706570115076154,
"learning_rate": 2.64260941502539e-06,
"loss": 0.2823,
"step": 1300
},
{
"epoch": 0.5016574321170854,
"grad_norm": 0.6816750561081921,
"learning_rate": 2.611630856471252e-06,
"loss": 0.2734,
"step": 1310
},
{
"epoch": 0.5054868781637806,
"grad_norm": 0.6503962781113031,
"learning_rate": 2.5806351128580963e-06,
"loss": 0.2775,
"step": 1320
},
{
"epoch": 0.5093163242104759,
"grad_norm": 0.6770280496930409,
"learning_rate": 2.549626955838673e-06,
"loss": 0.2805,
"step": 1330
},
{
"epoch": 0.5131457702571712,
"grad_norm": 0.6025144898257462,
"learning_rate": 2.5186111589767187e-06,
"loss": 0.2715,
"step": 1340
},
{
"epoch": 0.5169752163038666,
"grad_norm": 0.6707574673231309,
"learning_rate": 2.487592497012089e-06,
"loss": 0.2763,
"step": 1350
},
{
"epoch": 0.5208046623505619,
"grad_norm": 0.6063466305966893,
"learning_rate": 2.456575745125713e-06,
"loss": 0.2845,
"step": 1360
},
{
"epoch": 0.5246341083972571,
"grad_norm": 0.6060316435488056,
"learning_rate": 2.4255656782044644e-06,
"loss": 0.2772,
"step": 1370
},
{
"epoch": 0.5284635544439524,
"grad_norm": 0.6023582378786108,
"learning_rate": 2.3945670701061033e-06,
"loss": 0.267,
"step": 1380
},
{
"epoch": 0.5322930004906478,
"grad_norm": 0.672852399998615,
"learning_rate": 2.3635846929243536e-06,
"loss": 0.2757,
"step": 1390
},
{
"epoch": 0.5361224465373431,
"grad_norm": 0.671673917828738,
"learning_rate": 2.3326233162542655e-06,
"loss": 0.2772,
"step": 1400
},
{
"epoch": 0.5399518925840384,
"grad_norm": 0.6908191182674716,
"learning_rate": 2.3016877064579564e-06,
"loss": 0.2752,
"step": 1410
},
{
"epoch": 0.5437813386307337,
"grad_norm": 0.6483112345732687,
"learning_rate": 2.2707826259308493e-06,
"loss": 0.2773,
"step": 1420
},
{
"epoch": 0.547610784677429,
"grad_norm": 0.6527686650275873,
"learning_rate": 2.2399128323685287e-06,
"loss": 0.2711,
"step": 1430
},
{
"epoch": 0.5514402307241243,
"grad_norm": 0.6402708780992856,
"learning_rate": 2.2090830780343116e-06,
"loss": 0.2774,
"step": 1440
},
{
"epoch": 0.5552696767708196,
"grad_norm": 0.7121133583105477,
"learning_rate": 2.178298109027659e-06,
"loss": 0.2789,
"step": 1450
},
{
"epoch": 0.559099122817515,
"grad_norm": 0.6936411537357103,
"learning_rate": 2.147562664553537e-06,
"loss": 0.2744,
"step": 1460
},
{
"epoch": 0.5629285688642103,
"grad_norm": 0.6126488504445933,
"learning_rate": 2.116881476192834e-06,
"loss": 0.2698,
"step": 1470
},
{
"epoch": 0.5667580149109055,
"grad_norm": 0.6045796996159061,
"learning_rate": 2.086259267173961e-06,
"loss": 0.2756,
"step": 1480
},
{
"epoch": 0.5705874609576008,
"grad_norm": 0.6331045988234285,
"learning_rate": 2.0557007516457287e-06,
"loss": 0.2813,
"step": 1490
},
{
"epoch": 0.5744169070042962,
"grad_norm": 0.6250063298087053,
"learning_rate": 2.025210633951627e-06,
"loss": 0.2659,
"step": 1500
},
{
"epoch": 0.5782463530509915,
"grad_norm": 0.6244702166810576,
"learning_rate": 1.9947936079056118e-06,
"loss": 0.2691,
"step": 1510
},
{
"epoch": 0.5820757990976868,
"grad_norm": 0.6645304037867747,
"learning_rate": 1.964454356069514e-06,
"loss": 0.2653,
"step": 1520
},
{
"epoch": 0.5859052451443821,
"grad_norm": 0.6422648719414517,
"learning_rate": 1.934197549032183e-06,
"loss": 0.2753,
"step": 1530
},
{
"epoch": 0.5897346911910774,
"grad_norm": 0.6796801504650317,
"learning_rate": 1.904027844690468e-06,
"loss": 0.2756,
"step": 1540
},
{
"epoch": 0.5935641372377727,
"grad_norm": 0.612092392514174,
"learning_rate": 1.8739498875321563e-06,
"loss": 0.2781,
"step": 1550
},
{
"epoch": 0.597393583284468,
"grad_norm": 0.6072776324810985,
"learning_rate": 1.8439683079209789e-06,
"loss": 0.2762,
"step": 1560
},
{
"epoch": 0.6012230293311633,
"grad_norm": 0.6756861517531914,
"learning_rate": 1.8140877213837823e-06,
"loss": 0.2671,
"step": 1570
},
{
"epoch": 0.6050524753778587,
"grad_norm": 0.6297494292950692,
"learning_rate": 1.7843127278999944e-06,
"loss": 0.2656,
"step": 1580
},
{
"epoch": 0.6088819214245539,
"grad_norm": 0.6384980188983074,
"learning_rate": 1.7546479111934733e-06,
"loss": 0.2742,
"step": 1590
},
{
"epoch": 0.6127113674712492,
"grad_norm": 0.6354445299662702,
"learning_rate": 1.7250978380268696e-06,
"loss": 0.2703,
"step": 1600
},
{
"epoch": 0.6165408135179445,
"grad_norm": 0.6499360638842633,
"learning_rate": 1.6956670574985909e-06,
"loss": 0.2778,
"step": 1610
},
{
"epoch": 0.6203702595646399,
"grad_norm": 0.6611350047172592,
"learning_rate": 1.6663601003424884e-06,
"loss": 0.2751,
"step": 1620
},
{
"epoch": 0.6241997056113352,
"grad_norm": 0.6676095929381155,
"learning_rate": 1.6371814782303723e-06,
"loss": 0.2697,
"step": 1630
},
{
"epoch": 0.6280291516580304,
"grad_norm": 0.6627616428191541,
"learning_rate": 1.6081356830774625e-06,
"loss": 0.2728,
"step": 1640
},
{
"epoch": 0.6318585977047257,
"grad_norm": 0.6297343768461555,
"learning_rate": 1.5792271863508751e-06,
"loss": 0.2725,
"step": 1650
},
{
"epoch": 0.6356880437514211,
"grad_norm": 0.6164109978910287,
"learning_rate": 1.5504604383812646e-06,
"loss": 0.2665,
"step": 1660
},
{
"epoch": 0.6395174897981164,
"grad_norm": 0.6163778395985405,
"learning_rate": 1.5218398676777103e-06,
"loss": 0.2676,
"step": 1670
},
{
"epoch": 0.6433469358448117,
"grad_norm": 0.7111914438547673,
"learning_rate": 1.493369880245973e-06,
"loss": 0.2682,
"step": 1680
},
{
"epoch": 0.6471763818915071,
"grad_norm": 0.5951988021270335,
"learning_rate": 1.4650548589102092e-06,
"loss": 0.2725,
"step": 1690
},
{
"epoch": 0.6510058279382023,
"grad_norm": 0.615884409391351,
"learning_rate": 1.436899162638255e-06,
"loss": 0.2693,
"step": 1700
},
{
"epoch": 0.6548352739848976,
"grad_norm": 0.5615244588845161,
"learning_rate": 1.4089071258705782e-06,
"loss": 0.2717,
"step": 1710
},
{
"epoch": 0.6586647200315929,
"grad_norm": 0.613522242640938,
"learning_rate": 1.3810830578530226e-06,
"loss": 0.2645,
"step": 1720
},
{
"epoch": 0.6624941660782883,
"grad_norm": 0.6255188764708021,
"learning_rate": 1.3534312419734066e-06,
"loss": 0.2619,
"step": 1730
},
{
"epoch": 0.6663236121249836,
"grad_norm": 0.6219089714484455,
"learning_rate": 1.3259559351021249e-06,
"loss": 0.2706,
"step": 1740
},
{
"epoch": 0.6701530581716788,
"grad_norm": 0.7068243904113033,
"learning_rate": 1.2986613669368159e-06,
"loss": 0.2724,
"step": 1750
},
{
"epoch": 0.6739825042183741,
"grad_norm": 0.6239904301630281,
"learning_rate": 1.2715517393512239e-06,
"loss": 0.2699,
"step": 1760
},
{
"epoch": 0.6778119502650695,
"grad_norm": 0.636573560785032,
"learning_rate": 1.2446312257483358e-06,
"loss": 0.2606,
"step": 1770
},
{
"epoch": 0.6816413963117648,
"grad_norm": 0.5822319215351535,
"learning_rate": 1.2179039704179119e-06,
"loss": 0.2671,
"step": 1780
},
{
"epoch": 0.6854708423584601,
"grad_norm": 0.6414100583030806,
"learning_rate": 1.1913740878984818e-06,
"loss": 0.2728,
"step": 1790
},
{
"epoch": 0.6893002884051553,
"grad_norm": 0.6019132802768392,
"learning_rate": 1.1650456623439368e-06,
"loss": 0.2684,
"step": 1800
},
{
"epoch": 0.6931297344518507,
"grad_norm": 0.6936171991583808,
"learning_rate": 1.1389227468947905e-06,
"loss": 0.271,
"step": 1810
},
{
"epoch": 0.696959180498546,
"grad_norm": 0.621045757650744,
"learning_rate": 1.11300936305422e-06,
"loss": 0.2657,
"step": 1820
},
{
"epoch": 0.7007886265452413,
"grad_norm": 0.6554333537644303,
"learning_rate": 1.0873095000689676e-06,
"loss": 0.2666,
"step": 1830
},
{
"epoch": 0.7046180725919367,
"grad_norm": 0.6286228416556564,
"learning_rate": 1.0618271143152185e-06,
"loss": 0.2714,
"step": 1840
},
{
"epoch": 0.708447518638632,
"grad_norm": 0.6221920366356362,
"learning_rate": 1.0365661286895364e-06,
"loss": 0.2672,
"step": 1850
},
{
"epoch": 0.7122769646853272,
"grad_norm": 0.6038564143991549,
"learning_rate": 1.011530432004948e-06,
"loss": 0.2639,
"step": 1860
},
{
"epoch": 0.7161064107320225,
"grad_norm": 0.6754360798564671,
"learning_rate": 9.86723878392279e-07,
"loss": 0.2675,
"step": 1870
},
{
"epoch": 0.7199358567787179,
"grad_norm": 0.5875424854805807,
"learning_rate": 9.621502867068286e-07,
"loss": 0.2592,
"step": 1880
},
{
"epoch": 0.7237653028254132,
"grad_norm": 0.6032756771398841,
"learning_rate": 9.378134399404768e-07,
"loss": 0.2676,
"step": 1890
},
{
"epoch": 0.7275947488721085,
"grad_norm": 0.597275111662435,
"learning_rate": 9.137170846393054e-07,
"loss": 0.268,
"step": 1900
},
{
"epoch": 0.7314241949188037,
"grad_norm": 0.610745344226226,
"learning_rate": 8.898649303268373e-07,
"loss": 0.2752,
"step": 1910
},
{
"epoch": 0.7352536409654991,
"grad_norm": 0.7020431210410311,
"learning_rate": 8.662606489329712e-07,
"loss": 0.2793,
"step": 1920
},
{
"epoch": 0.7390830870121944,
"grad_norm": 0.6300673226887155,
"learning_rate": 8.429078742287072e-07,
"loss": 0.2673,
"step": 1930
},
{
"epoch": 0.7429125330588897,
"grad_norm": 0.6440907203937188,
"learning_rate": 8.198102012667409e-07,
"loss": 0.2662,
"step": 1940
},
{
"epoch": 0.746741979105585,
"grad_norm": 0.6258159414377766,
"learning_rate": 7.969711858280251e-07,
"loss": 0.2712,
"step": 1950
},
{
"epoch": 0.7505714251522804,
"grad_norm": 0.5989574886855157,
"learning_rate": 7.743943438743676e-07,
"loss": 0.2634,
"step": 1960
},
{
"epoch": 0.7544008711989756,
"grad_norm": 0.7834580590666472,
"learning_rate": 7.520831510071744e-07,
"loss": 0.2632,
"step": 1970
},
{
"epoch": 0.7582303172456709,
"grad_norm": 0.6728771166977499,
"learning_rate": 7.30041041932387e-07,
"loss": 0.2756,
"step": 1980
},
{
"epoch": 0.7620597632923662,
"grad_norm": 0.5993928197071544,
"learning_rate": 7.082714099317334e-07,
"loss": 0.2664,
"step": 1990
},
{
"epoch": 0.7658892093390616,
"grad_norm": 0.5848853677834179,
"learning_rate": 6.867776063403411e-07,
"loss": 0.2628,
"step": 2000
},
{
"epoch": 0.7697186553857569,
"grad_norm": 0.629330394414076,
"learning_rate": 6.655629400308191e-07,
"loss": 0.2658,
"step": 2010
},
{
"epoch": 0.7735481014324521,
"grad_norm": 0.5924525230054234,
"learning_rate": 6.44630676903869e-07,
"loss": 0.2669,
"step": 2020
},
{
"epoch": 0.7773775474791474,
"grad_norm": 0.6760063055600575,
"learning_rate": 6.239840393855185e-07,
"loss": 0.2692,
"step": 2030
},
{
"epoch": 0.7812069935258428,
"grad_norm": 0.6602651610212074,
"learning_rate": 6.036262059310383e-07,
"loss": 0.2629,
"step": 2040
},
{
"epoch": 0.7850364395725381,
"grad_norm": 0.6327130212823728,
"learning_rate": 5.835603105356396e-07,
"loss": 0.2678,
"step": 2050
},
{
"epoch": 0.7888658856192334,
"grad_norm": 0.68617187633267,
"learning_rate": 5.637894422520027e-07,
"loss": 0.268,
"step": 2060
},
{
"epoch": 0.7926953316659286,
"grad_norm": 0.6854582170736043,
"learning_rate": 5.443166447147392e-07,
"loss": 0.2652,
"step": 2070
},
{
"epoch": 0.796524777712624,
"grad_norm": 0.6454661983770266,
"learning_rate": 5.251449156718313e-07,
"loss": 0.2616,
"step": 2080
},
{
"epoch": 0.8003542237593193,
"grad_norm": 0.6204018065147047,
"learning_rate": 5.062772065231492e-07,
"loss": 0.2664,
"step": 2090
},
{
"epoch": 0.8041836698060146,
"grad_norm": 0.6500266139560574,
"learning_rate": 4.877164218660901e-07,
"loss": 0.2656,
"step": 2100
},
{
"epoch": 0.80801311585271,
"grad_norm": 0.6419744527723766,
"learning_rate": 4.694654190484327e-07,
"loss": 0.2612,
"step": 2110
},
{
"epoch": 0.8118425618994053,
"grad_norm": 0.6238307107139875,
"learning_rate": 4.5152700772845947e-07,
"loss": 0.2676,
"step": 2120
},
{
"epoch": 0.8156720079461005,
"grad_norm": 0.6514460031994411,
"learning_rate": 4.339039494424263e-07,
"loss": 0.2755,
"step": 2130
},
{
"epoch": 0.8195014539927958,
"grad_norm": 0.5974777353730542,
"learning_rate": 4.16598957179431e-07,
"loss": 0.2597,
"step": 2140
},
{
"epoch": 0.8233309000394912,
"grad_norm": 0.6009135110985171,
"learning_rate": 3.9961469496376584e-07,
"loss": 0.2592,
"step": 2150
},
{
"epoch": 0.8271603460861865,
"grad_norm": 0.6454195389454723,
"learning_rate": 3.829537774448e-07,
"loss": 0.2714,
"step": 2160
},
{
"epoch": 0.8309897921328818,
"grad_norm": 0.5913764986372555,
"learning_rate": 3.6661876949447006e-07,
"loss": 0.2637,
"step": 2170
},
{
"epoch": 0.834819238179577,
"grad_norm": 0.6714610867965414,
"learning_rate": 3.506121858124253e-07,
"loss": 0.2652,
"step": 2180
},
{
"epoch": 0.8386486842262724,
"grad_norm": 0.6260094170090917,
"learning_rate": 3.3493649053890325e-07,
"loss": 0.2642,
"step": 2190
},
{
"epoch": 0.8424781302729677,
"grad_norm": 0.5797985857131752,
"learning_rate": 3.1959409687538854e-07,
"loss": 0.2632,
"step": 2200
},
{
"epoch": 0.846307576319663,
"grad_norm": 0.6189077791942977,
"learning_rate": 3.04587366713108e-07,
"loss": 0.2648,
"step": 2210
},
{
"epoch": 0.8501370223663584,
"grad_norm": 0.6122894183484023,
"learning_rate": 2.8991861026943015e-07,
"loss": 0.2741,
"step": 2220
},
{
"epoch": 0.8539664684130536,
"grad_norm": 0.6661899719747609,
"learning_rate": 2.755900857322172e-07,
"loss": 0.2645,
"step": 2230
},
{
"epoch": 0.8577959144597489,
"grad_norm": 0.5568358628059588,
"learning_rate": 2.616039989121899e-07,
"loss": 0.2546,
"step": 2240
},
{
"epoch": 0.8616253605064442,
"grad_norm": 0.6975565699445695,
"learning_rate": 2.479625029033489e-07,
"loss": 0.2774,
"step": 2250
},
{
"epoch": 0.8654548065531396,
"grad_norm": 0.6137153739809891,
"learning_rate": 2.3466769775151887e-07,
"loss": 0.266,
"step": 2260
},
{
"epoch": 0.8692842525998349,
"grad_norm": 0.6065270965865103,
"learning_rate": 2.21721630131054e-07,
"loss": 0.2717,
"step": 2270
},
{
"epoch": 0.8731136986465302,
"grad_norm": 0.6194228612620867,
"learning_rate": 2.0912629302976494e-07,
"loss": 0.2656,
"step": 2280
},
{
"epoch": 0.8769431446932254,
"grad_norm": 0.6721041148274991,
"learning_rate": 1.968836254421036e-07,
"loss": 0.2653,
"step": 2290
},
{
"epoch": 0.8807725907399208,
"grad_norm": 0.664610505857682,
"learning_rate": 1.849955120706673e-07,
"loss": 0.2677,
"step": 2300
},
{
"epoch": 0.8846020367866161,
"grad_norm": 0.6321996875856193,
"learning_rate": 1.734637830360536e-07,
"loss": 0.2645,
"step": 2310
},
{
"epoch": 0.8884314828333114,
"grad_norm": 0.6365093137379149,
"learning_rate": 1.6229021359512626e-07,
"loss": 0.2658,
"step": 2320
},
{
"epoch": 0.8922609288800067,
"grad_norm": 0.6539770975459238,
"learning_rate": 1.514765238677185e-07,
"loss": 0.259,
"step": 2330
},
{
"epoch": 0.896090374926702,
"grad_norm": 0.5990361778489132,
"learning_rate": 1.4102437857183155e-07,
"loss": 0.265,
"step": 2340
},
{
"epoch": 0.8999198209733973,
"grad_norm": 0.6484099138497742,
"learning_rate": 1.30935386767356e-07,
"loss": 0.2667,
"step": 2350
},
{
"epoch": 0.9037492670200926,
"grad_norm": 0.5867117908179056,
"learning_rate": 1.2121110160836697e-07,
"loss": 0.2634,
"step": 2360
},
{
"epoch": 0.907578713066788,
"grad_norm": 0.5796597988618597,
"learning_rate": 1.1185302010402105e-07,
"loss": 0.2719,
"step": 2370
},
{
"epoch": 0.9114081591134833,
"grad_norm": 0.6237999983224305,
"learning_rate": 1.0286258288810108e-07,
"loss": 0.2627,
"step": 2380
},
{
"epoch": 0.9152376051601786,
"grad_norm": 0.603524669140342,
"learning_rate": 9.424117399723432e-08,
"loss": 0.262,
"step": 2390
},
{
"epoch": 0.9190670512068738,
"grad_norm": 0.620014776939978,
"learning_rate": 8.599012065782924e-08,
"loss": 0.271,
"step": 2400
},
{
"epoch": 0.9228964972535691,
"grad_norm": 0.6276616849770017,
"learning_rate": 7.811069308175156e-08,
"loss": 0.2692,
"step": 2410
},
{
"epoch": 0.9267259433002645,
"grad_norm": 0.6160811115395115,
"learning_rate": 7.060410427078473e-08,
"loss": 0.2674,
"step": 2420
},
{
"epoch": 0.9305553893469598,
"grad_norm": 0.5848887503471439,
"learning_rate": 6.347150982989159e-08,
"loss": 0.2625,
"step": 2430
},
{
"epoch": 0.9343848353936551,
"grad_norm": 0.5883046146508241,
"learning_rate": 5.6714007789314686e-08,
"loss": 0.2621,
"step": 2440
},
{
"epoch": 0.9382142814403504,
"grad_norm": 0.6940261485803617,
"learning_rate": 5.033263843554015e-08,
"loss": 0.2646,
"step": 2450
},
{
"epoch": 0.9420437274870457,
"grad_norm": 0.6682532368250712,
"learning_rate": 4.4328384151149094e-08,
"loss": 0.2667,
"step": 2460
},
{
"epoch": 0.945873173533741,
"grad_norm": 0.6267836020326658,
"learning_rate": 3.870216926358555e-08,
"loss": 0.2643,
"step": 2470
},
{
"epoch": 0.9497026195804363,
"grad_norm": 0.5575161118815479,
"learning_rate": 3.3454859902860295e-08,
"loss": 0.2641,
"step": 2480
},
{
"epoch": 0.9535320656271317,
"grad_norm": 0.5714423960482532,
"learning_rate": 2.858726386821359e-08,
"loss": 0.2707,
"step": 2490
},
{
"epoch": 0.9573615116738269,
"grad_norm": 0.6111083062299648,
"learning_rate": 2.410013050375859e-08,
"loss": 0.2709,
"step": 2500
},
{
"epoch": 0.9611909577205222,
"grad_norm": 0.642938384488523,
"learning_rate": 1.999415058312276e-08,
"loss": 0.271,
"step": 2510
},
{
"epoch": 0.9650204037672175,
"grad_norm": 0.6112837503032897,
"learning_rate": 1.6269956203107117e-08,
"loss": 0.2512,
"step": 2520
},
{
"epoch": 0.9688498498139129,
"grad_norm": 0.5889833725999716,
"learning_rate": 1.2928120686377388e-08,
"loss": 0.2661,
"step": 2530
},
{
"epoch": 0.9726792958606082,
"grad_norm": 0.5773801445867635,
"learning_rate": 9.969158493204067e-09,
"loss": 0.2653,
"step": 2540
},
{
"epoch": 0.9765087419073035,
"grad_norm": 0.6062759480472552,
"learning_rate": 7.393525142262992e-09,
"loss": 0.2691,
"step": 2550
},
{
"epoch": 0.9803381879539987,
"grad_norm": 0.5770795227743729,
"learning_rate": 5.201617140510318e-09,
"loss": 0.2694,
"step": 2560
},
{
"epoch": 0.9841676340006941,
"grad_norm": 0.6054174692665726,
"learning_rate": 3.3937719221427413e-09,
"loss": 0.2592,
"step": 2570
},
{
"epoch": 0.9879970800473894,
"grad_norm": 0.5960623948954498,
"learning_rate": 1.9702677966507157e-09,
"loss": 0.2641,
"step": 2580
},
{
"epoch": 0.9918265260940847,
"grad_norm": 0.6702560926553222,
"learning_rate": 9.31323905974113e-10,
"loss": 0.2621,
"step": 2590
},
{
"epoch": 0.99565597214078,
"grad_norm": 0.623171089038422,
"learning_rate": 2.7710019076532257e-10,
"loss": 0.2672,
"step": 2600
},
{
"epoch": 0.9994854181874753,
"grad_norm": 0.6269492106617217,
"learning_rate": 7.697365768943865e-12,
"loss": 0.2736,
"step": 2610
}
],
"logging_steps": 10,
"max_steps": 2611,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 600,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 624569162006528.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}