aaaa / checkpoint-1767 /trainer_state.json
Chang-Hoo's picture
Upload folder using huggingface_hub
3fb9526 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.999081207152449,
"eval_steps": 500,
"global_step": 1767,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.016962329493250407,
"grad_norm": 3.53125,
"learning_rate": 1.6666666666666667e-05,
"loss": 2.1453,
"mean_token_accuracy": 0.5800132380177577,
"step": 10
},
{
"epoch": 0.033924658986500815,
"grad_norm": 2.53125,
"learning_rate": 3.518518518518519e-05,
"loss": 1.713,
"mean_token_accuracy": 0.6173386509219806,
"step": 20
},
{
"epoch": 0.05088698847975122,
"grad_norm": 1.671875,
"learning_rate": 5.370370370370371e-05,
"loss": 1.5855,
"mean_token_accuracy": 0.6299800969660282,
"step": 30
},
{
"epoch": 0.06784931797300163,
"grad_norm": 1.90625,
"learning_rate": 7.222222222222222e-05,
"loss": 1.5199,
"mean_token_accuracy": 0.6394588612020016,
"step": 40
},
{
"epoch": 0.08481164746625203,
"grad_norm": 1.5546875,
"learning_rate": 9.074074074074075e-05,
"loss": 1.5192,
"mean_token_accuracy": 0.6397132301082213,
"step": 50
},
{
"epoch": 0.10177397695950244,
"grad_norm": 1.6875,
"learning_rate": 9.999789785826972e-05,
"loss": 1.5276,
"mean_token_accuracy": 0.6391662692030271,
"step": 60
},
{
"epoch": 0.11873630645275285,
"grad_norm": 1.7734375,
"learning_rate": 9.998108178497258e-05,
"loss": 1.5183,
"mean_token_accuracy": 0.6404511784513791,
"step": 70
},
{
"epoch": 0.13569863594600326,
"grad_norm": 6.3125,
"learning_rate": 9.99474552942225e-05,
"loss": 1.5112,
"mean_token_accuracy": 0.6418155938386917,
"step": 80
},
{
"epoch": 0.15266096543925367,
"grad_norm": 1.4765625,
"learning_rate": 9.989702969580565e-05,
"loss": 1.5161,
"mean_token_accuracy": 0.6407449401915073,
"step": 90
},
{
"epoch": 0.16962329493250405,
"grad_norm": 1.421875,
"learning_rate": 9.982982194964625e-05,
"loss": 1.4945,
"mean_token_accuracy": 0.6452280322710673,
"step": 100
},
{
"epoch": 0.18658562442575446,
"grad_norm": 1.3203125,
"learning_rate": 9.974585466010236e-05,
"loss": 1.4927,
"mean_token_accuracy": 0.6472473913182815,
"step": 110
},
{
"epoch": 0.20354795391900488,
"grad_norm": 1.390625,
"learning_rate": 9.964515606836321e-05,
"loss": 1.4876,
"mean_token_accuracy": 0.646234019100666,
"step": 120
},
{
"epoch": 0.2205102834122553,
"grad_norm": 1.390625,
"learning_rate": 9.952776004295077e-05,
"loss": 1.4768,
"mean_token_accuracy": 0.6493511145313581,
"step": 130
},
{
"epoch": 0.2374726129055057,
"grad_norm": 1.4375,
"learning_rate": 9.939370606832841e-05,
"loss": 1.4627,
"mean_token_accuracy": 0.6511956502993902,
"step": 140
},
{
"epoch": 0.2544349423987561,
"grad_norm": 1.3515625,
"learning_rate": 9.924303923162097e-05,
"loss": 1.4855,
"mean_token_accuracy": 0.650639555354913,
"step": 150
},
{
"epoch": 0.2713972718920065,
"grad_norm": 1.1875,
"learning_rate": 9.907581020745037e-05,
"loss": 1.4492,
"mean_token_accuracy": 0.6547389343380928,
"step": 160
},
{
"epoch": 0.2883596013852569,
"grad_norm": 1.2734375,
"learning_rate": 9.889207524089187e-05,
"loss": 1.4298,
"mean_token_accuracy": 0.65942026724418,
"step": 170
},
{
"epoch": 0.30532193087850734,
"grad_norm": 1.2109375,
"learning_rate": 9.869189612855685e-05,
"loss": 1.4457,
"mean_token_accuracy": 0.6557608020802339,
"step": 180
},
{
"epoch": 0.3222842603717577,
"grad_norm": 1.1328125,
"learning_rate": 9.847534019780848e-05,
"loss": 1.4136,
"mean_token_accuracy": 0.6603276548286279,
"step": 190
},
{
"epoch": 0.3392465898650081,
"grad_norm": 1.2421875,
"learning_rate": 9.824248028411703e-05,
"loss": 1.4263,
"mean_token_accuracy": 0.6607817115883032,
"step": 200
},
{
"epoch": 0.35620891935825855,
"grad_norm": 1.1015625,
"learning_rate": 9.79933947065628e-05,
"loss": 1.4148,
"mean_token_accuracy": 0.6610926086703937,
"step": 210
},
{
"epoch": 0.37317124885150893,
"grad_norm": 1.046875,
"learning_rate": 9.772816724149459e-05,
"loss": 1.3812,
"mean_token_accuracy": 0.6686675310134887,
"step": 220
},
{
"epoch": 0.39013357834475937,
"grad_norm": 1.1328125,
"learning_rate": 9.744688709435268e-05,
"loss": 1.4174,
"mean_token_accuracy": 0.6620935648679733,
"step": 230
},
{
"epoch": 0.40709590783800975,
"grad_norm": 1.125,
"learning_rate": 9.714964886966598e-05,
"loss": 1.4102,
"mean_token_accuracy": 0.6628513303895791,
"step": 240
},
{
"epoch": 0.42405823733126013,
"grad_norm": 1.1484375,
"learning_rate": 9.6836552539233e-05,
"loss": 1.3909,
"mean_token_accuracy": 0.6663404104610284,
"step": 250
},
{
"epoch": 0.4410205668245106,
"grad_norm": 1.03125,
"learning_rate": 9.650770340849796e-05,
"loss": 1.4062,
"mean_token_accuracy": 0.6653375633060932,
"step": 260
},
{
"epoch": 0.45798289631776096,
"grad_norm": 1.265625,
"learning_rate": 9.616321208113262e-05,
"loss": 1.4091,
"mean_token_accuracy": 0.6641857360800107,
"step": 270
},
{
"epoch": 0.4749452258110114,
"grad_norm": 1.078125,
"learning_rate": 9.580319442183654e-05,
"loss": 1.3828,
"mean_token_accuracy": 0.6680692491432031,
"step": 280
},
{
"epoch": 0.4919075553042618,
"grad_norm": 1.1484375,
"learning_rate": 9.542777151736746e-05,
"loss": 1.3675,
"mean_token_accuracy": 0.6707451656460762,
"step": 290
},
{
"epoch": 0.5088698847975122,
"grad_norm": 1.0625,
"learning_rate": 9.503706963581562e-05,
"loss": 1.3782,
"mean_token_accuracy": 0.6665912042061488,
"step": 300
},
{
"epoch": 0.5258322142907625,
"grad_norm": 1.171875,
"learning_rate": 9.463122018413532e-05,
"loss": 1.3536,
"mean_token_accuracy": 0.6734739691019058,
"step": 310
},
{
"epoch": 0.542794543784013,
"grad_norm": 1.015625,
"learning_rate": 9.4210359663948e-05,
"loss": 1.3887,
"mean_token_accuracy": 0.668233826259772,
"step": 320
},
{
"epoch": 0.5597568732772634,
"grad_norm": 1.0078125,
"learning_rate": 9.377462962563195e-05,
"loss": 1.3819,
"mean_token_accuracy": 0.6673479390641054,
"step": 330
},
{
"epoch": 0.5767192027705138,
"grad_norm": 1.015625,
"learning_rate": 9.332417662071385e-05,
"loss": 1.3679,
"mean_token_accuracy": 0.6706842251121998,
"step": 340
},
{
"epoch": 0.5936815322637642,
"grad_norm": 0.9609375,
"learning_rate": 9.285915215257828e-05,
"loss": 1.3615,
"mean_token_accuracy": 0.6692039887110393,
"step": 350
},
{
"epoch": 0.6106438617570147,
"grad_norm": 1.078125,
"learning_rate": 9.237971262551175e-05,
"loss": 1.3566,
"mean_token_accuracy": 0.6739495868484179,
"step": 360
},
{
"epoch": 0.6276061912502651,
"grad_norm": 1.1875,
"learning_rate": 9.188601929209835e-05,
"loss": 1.3103,
"mean_token_accuracy": 0.6818199207385381,
"step": 370
},
{
"epoch": 0.6445685207435154,
"grad_norm": 1.0,
"learning_rate": 9.137823819898477e-05,
"loss": 1.3527,
"mean_token_accuracy": 0.6732211743791898,
"step": 380
},
{
"epoch": 0.6615308502367658,
"grad_norm": 0.9453125,
"learning_rate": 9.08565401310329e-05,
"loss": 1.3332,
"mean_token_accuracy": 0.6759632855653763,
"step": 390
},
{
"epoch": 0.6784931797300162,
"grad_norm": 0.98046875,
"learning_rate": 9.03211005538788e-05,
"loss": 1.3457,
"mean_token_accuracy": 0.6737116026381652,
"step": 400
},
{
"epoch": 0.6954555092232667,
"grad_norm": 1.078125,
"learning_rate": 8.977209955491739e-05,
"loss": 1.3713,
"mean_token_accuracy": 0.6700817617277305,
"step": 410
},
{
"epoch": 0.7124178387165171,
"grad_norm": 1.03125,
"learning_rate": 8.920972178273257e-05,
"loss": 1.3243,
"mean_token_accuracy": 0.6765588760375977,
"step": 420
},
{
"epoch": 0.7293801682097675,
"grad_norm": 0.98046875,
"learning_rate": 8.863415638499341e-05,
"loss": 1.3272,
"mean_token_accuracy": 0.6768454472223918,
"step": 430
},
{
"epoch": 0.7463424977030179,
"grad_norm": 1.015625,
"learning_rate": 8.8045596944837e-05,
"loss": 1.3411,
"mean_token_accuracy": 0.6759745722015699,
"step": 440
},
{
"epoch": 0.7633048271962682,
"grad_norm": 1.0,
"learning_rate": 8.744424141575959e-05,
"loss": 1.3138,
"mean_token_accuracy": 0.6793065622448922,
"step": 450
},
{
"epoch": 0.7802671566895187,
"grad_norm": 0.9765625,
"learning_rate": 8.683029205503773e-05,
"loss": 1.3562,
"mean_token_accuracy": 0.6727576293051243,
"step": 460
},
{
"epoch": 0.7972294861827691,
"grad_norm": 1.0546875,
"learning_rate": 8.620395535570198e-05,
"loss": 1.347,
"mean_token_accuracy": 0.6732823781669139,
"step": 470
},
{
"epoch": 0.8141918156760195,
"grad_norm": 0.93359375,
"learning_rate": 8.556544197708596e-05,
"loss": 1.3011,
"mean_token_accuracy": 0.6819205803175767,
"step": 480
},
{
"epoch": 0.8311541451692699,
"grad_norm": 0.96484375,
"learning_rate": 8.491496667397408e-05,
"loss": 1.3051,
"mean_token_accuracy": 0.6827595402797063,
"step": 490
},
{
"epoch": 0.8481164746625203,
"grad_norm": 1.1015625,
"learning_rate": 8.42527482243719e-05,
"loss": 1.2991,
"mean_token_accuracy": 0.6841210166613261,
"step": 500
},
{
"epoch": 0.8650788041557708,
"grad_norm": 0.9375,
"learning_rate": 8.357900935592327e-05,
"loss": 1.3055,
"mean_token_accuracy": 0.679613892485698,
"step": 510
},
{
"epoch": 0.8820411336490211,
"grad_norm": 0.95703125,
"learning_rate": 8.289397667099909e-05,
"loss": 1.3155,
"mean_token_accuracy": 0.6812887417773406,
"step": 520
},
{
"epoch": 0.8990034631422715,
"grad_norm": 0.9453125,
"learning_rate": 8.219788057048286e-05,
"loss": 1.3154,
"mean_token_accuracy": 0.6792417210837205,
"step": 530
},
{
"epoch": 0.9159657926355219,
"grad_norm": 0.9375,
"learning_rate": 8.149095517627871e-05,
"loss": 1.3034,
"mean_token_accuracy": 0.6806264075140158,
"step": 540
},
{
"epoch": 0.9329281221287723,
"grad_norm": 0.8984375,
"learning_rate": 8.077343825256783e-05,
"loss": 1.3126,
"mean_token_accuracy": 0.6810529338816802,
"step": 550
},
{
"epoch": 0.9498904516220228,
"grad_norm": 0.9609375,
"learning_rate": 8.004557112583986e-05,
"loss": 1.3134,
"mean_token_accuracy": 0.6798362337052822,
"step": 560
},
{
"epoch": 0.9668527811152732,
"grad_norm": 0.921875,
"learning_rate": 7.930759860372628e-05,
"loss": 1.2856,
"mean_token_accuracy": 0.683349988112847,
"step": 570
},
{
"epoch": 0.9838151106085236,
"grad_norm": 0.88671875,
"learning_rate": 7.855976889266288e-05,
"loss": 1.2901,
"mean_token_accuracy": 0.6849393486976624,
"step": 580
},
{
"epoch": 1.001696232949325,
"grad_norm": 2.609375,
"learning_rate": 7.780233351440903e-05,
"loss": 1.395,
"mean_token_accuracy": 0.6899135421733467,
"step": 590
},
{
"epoch": 1.0186585624425755,
"grad_norm": 0.93359375,
"learning_rate": 7.703554722145201e-05,
"loss": 1.1017,
"mean_token_accuracy": 0.7215727421144644,
"step": 600
},
{
"epoch": 1.0356208919358258,
"grad_norm": 0.9609375,
"learning_rate": 7.625966791132468e-05,
"loss": 1.0907,
"mean_token_accuracy": 0.7208627772827944,
"step": 610
},
{
"epoch": 1.0525832214290762,
"grad_norm": 0.99609375,
"learning_rate": 7.547495653986536e-05,
"loss": 1.0893,
"mean_token_accuracy": 0.7245491112271945,
"step": 620
},
{
"epoch": 1.0695455509223266,
"grad_norm": 1.015625,
"learning_rate": 7.468167703344902e-05,
"loss": 1.0853,
"mean_token_accuracy": 0.7234922610223293,
"step": 630
},
{
"epoch": 1.086507880415577,
"grad_norm": 0.92578125,
"learning_rate": 7.388009620021959e-05,
"loss": 1.1004,
"mean_token_accuracy": 0.7199652560055256,
"step": 640
},
{
"epoch": 1.1034702099088274,
"grad_norm": 0.94921875,
"learning_rate": 7.307048364035266e-05,
"loss": 1.1206,
"mean_token_accuracy": 0.719177692135175,
"step": 650
},
{
"epoch": 1.120432539402078,
"grad_norm": 0.9375,
"learning_rate": 7.225311165537956e-05,
"loss": 1.0905,
"mean_token_accuracy": 0.7229047452410062,
"step": 660
},
{
"epoch": 1.1373948688953284,
"grad_norm": 0.90234375,
"learning_rate": 7.142825515660259e-05,
"loss": 1.1184,
"mean_token_accuracy": 0.7195753792921702,
"step": 670
},
{
"epoch": 1.1543571983885788,
"grad_norm": 0.9453125,
"learning_rate": 7.059619157263245e-05,
"loss": 1.1152,
"mean_token_accuracy": 0.7191205089290936,
"step": 680
},
{
"epoch": 1.1713195278818291,
"grad_norm": 0.984375,
"learning_rate": 6.975720075607927e-05,
"loss": 1.1029,
"mean_token_accuracy": 0.7206906000773112,
"step": 690
},
{
"epoch": 1.1882818573750795,
"grad_norm": 0.953125,
"learning_rate": 6.891156488942811e-05,
"loss": 1.0929,
"mean_token_accuracy": 0.7221428496142228,
"step": 700
},
{
"epoch": 1.20524418686833,
"grad_norm": 1.03125,
"learning_rate": 6.805956839013107e-05,
"loss": 1.1047,
"mean_token_accuracy": 0.7189865835011006,
"step": 710
},
{
"epoch": 1.2222065163615803,
"grad_norm": 0.97265625,
"learning_rate": 6.720149781494738e-05,
"loss": 1.1151,
"mean_token_accuracy": 0.7195135744909446,
"step": 720
},
{
"epoch": 1.2391688458548307,
"grad_norm": 1.0,
"learning_rate": 6.633764176356434e-05,
"loss": 1.0717,
"mean_token_accuracy": 0.7264206613103549,
"step": 730
},
{
"epoch": 1.2561311753480813,
"grad_norm": 1.0625,
"learning_rate": 6.546829078153086e-05,
"loss": 1.109,
"mean_token_accuracy": 0.718661529570818,
"step": 740
},
{
"epoch": 1.2730935048413317,
"grad_norm": 0.94140625,
"learning_rate": 6.459373726253672e-05,
"loss": 1.0936,
"mean_token_accuracy": 0.7220857585469882,
"step": 750
},
{
"epoch": 1.290055834334582,
"grad_norm": 0.9453125,
"learning_rate": 6.371427535007008e-05,
"loss": 1.0909,
"mean_token_accuracy": 0.7229609449704488,
"step": 760
},
{
"epoch": 1.3070181638278324,
"grad_norm": 0.8984375,
"learning_rate": 6.283020083848661e-05,
"loss": 1.1011,
"mean_token_accuracy": 0.7228824739654859,
"step": 770
},
{
"epoch": 1.3239804933210828,
"grad_norm": 0.91015625,
"learning_rate": 6.194181107352331e-05,
"loss": 1.0762,
"mean_token_accuracy": 0.7260710549851258,
"step": 780
},
{
"epoch": 1.3409428228143332,
"grad_norm": 0.88671875,
"learning_rate": 6.104940485229054e-05,
"loss": 1.097,
"mean_token_accuracy": 0.7203553736209869,
"step": 790
},
{
"epoch": 1.3579051523075836,
"grad_norm": 1.0234375,
"learning_rate": 6.015328232277593e-05,
"loss": 1.1041,
"mean_token_accuracy": 0.722166525820891,
"step": 800
},
{
"epoch": 1.374867481800834,
"grad_norm": 0.97265625,
"learning_rate": 5.925374488289388e-05,
"loss": 1.096,
"mean_token_accuracy": 0.7233567799131075,
"step": 810
},
{
"epoch": 1.3918298112940843,
"grad_norm": 0.96875,
"learning_rate": 5.8351095079114745e-05,
"loss": 1.103,
"mean_token_accuracy": 0.721570813159148,
"step": 820
},
{
"epoch": 1.4087921407873347,
"grad_norm": 0.95703125,
"learning_rate": 5.74456365047077e-05,
"loss": 1.1058,
"mean_token_accuracy": 0.7204289863506953,
"step": 830
},
{
"epoch": 1.425754470280585,
"grad_norm": 1.0859375,
"learning_rate": 5.653767369763148e-05,
"loss": 1.0895,
"mean_token_accuracy": 0.7256171715756258,
"step": 840
},
{
"epoch": 1.4427167997738355,
"grad_norm": 0.953125,
"learning_rate": 5.562751203810742e-05,
"loss": 1.0696,
"mean_token_accuracy": 0.72752467567722,
"step": 850
},
{
"epoch": 1.459679129267086,
"grad_norm": 0.95703125,
"learning_rate": 5.471545764590924e-05,
"loss": 1.0586,
"mean_token_accuracy": 0.7292891172071297,
"step": 860
},
{
"epoch": 1.4766414587603365,
"grad_norm": 0.92578125,
"learning_rate": 5.3801817277404066e-05,
"loss": 1.0876,
"mean_token_accuracy": 0.7224949277937412,
"step": 870
},
{
"epoch": 1.4936037882535869,
"grad_norm": 0.921875,
"learning_rate": 5.28868982223793e-05,
"loss": 1.0675,
"mean_token_accuracy": 0.7286781263848146,
"step": 880
},
{
"epoch": 1.5105661177468372,
"grad_norm": 0.94140625,
"learning_rate": 5.197100820069016e-05,
"loss": 1.0845,
"mean_token_accuracy": 0.7255125172436238,
"step": 890
},
{
"epoch": 1.5275284472400876,
"grad_norm": 0.94140625,
"learning_rate": 5.1054455258762535e-05,
"loss": 1.0998,
"mean_token_accuracy": 0.7214603280027707,
"step": 900
},
{
"epoch": 1.544490776733338,
"grad_norm": 0.99609375,
"learning_rate": 5.0137547665985985e-05,
"loss": 1.1018,
"mean_token_accuracy": 0.7225766807794571,
"step": 910
},
{
"epoch": 1.5614531062265886,
"grad_norm": 0.9453125,
"learning_rate": 4.9220593811031786e-05,
"loss": 1.0482,
"mean_token_accuracy": 0.7321401789784432,
"step": 920
},
{
"epoch": 1.578415435719839,
"grad_norm": 0.9921875,
"learning_rate": 4.83039020981308e-05,
"loss": 1.0618,
"mean_token_accuracy": 0.7308160757025083,
"step": 930
},
{
"epoch": 1.5953777652130894,
"grad_norm": 0.9453125,
"learning_rate": 4.738778084334625e-05,
"loss": 1.079,
"mean_token_accuracy": 0.7259436552723249,
"step": 940
},
{
"epoch": 1.6123400947063398,
"grad_norm": 0.9921875,
"learning_rate": 4.6472538170875924e-05,
"loss": 1.054,
"mean_token_accuracy": 0.7302558933695157,
"step": 950
},
{
"epoch": 1.6293024241995901,
"grad_norm": 0.984375,
"learning_rate": 4.5558481909419095e-05,
"loss": 1.0522,
"mean_token_accuracy": 0.7308388692637284,
"step": 960
},
{
"epoch": 1.6462647536928405,
"grad_norm": 0.93359375,
"learning_rate": 4.46459194886428e-05,
"loss": 1.0886,
"mean_token_accuracy": 0.7265370438496271,
"step": 970
},
{
"epoch": 1.663227083186091,
"grad_norm": 0.9375,
"learning_rate": 4.373515783578226e-05,
"loss": 1.0657,
"mean_token_accuracy": 0.7278412433962027,
"step": 980
},
{
"epoch": 1.6801894126793413,
"grad_norm": 0.90234375,
"learning_rate": 4.2826503272410304e-05,
"loss": 1.0636,
"mean_token_accuracy": 0.7297646810611089,
"step": 990
},
{
"epoch": 1.6971517421725917,
"grad_norm": 0.90625,
"learning_rate": 4.1920261411410536e-05,
"loss": 1.0802,
"mean_token_accuracy": 0.7267571208377679,
"step": 1000
},
{
"epoch": 1.714114071665842,
"grad_norm": 0.9921875,
"learning_rate": 4.101673705418888e-05,
"loss": 1.0609,
"mean_token_accuracy": 0.7287999058763186,
"step": 1010
},
{
"epoch": 1.7310764011590924,
"grad_norm": 1.046875,
"learning_rate": 4.011623408815799e-05,
"loss": 1.0515,
"mean_token_accuracy": 0.7311748243868351,
"step": 1020
},
{
"epoch": 1.7480387306523428,
"grad_norm": 0.8984375,
"learning_rate": 3.9219055384529e-05,
"loss": 1.0815,
"mean_token_accuracy": 0.7260699895521004,
"step": 1030
},
{
"epoch": 1.7650010601455932,
"grad_norm": 0.95703125,
"learning_rate": 3.83255026964453e-05,
"loss": 1.0436,
"mean_token_accuracy": 0.7335223399102688,
"step": 1040
},
{
"epoch": 1.7819633896388436,
"grad_norm": 0.89453125,
"learning_rate": 3.7435876557492156e-05,
"loss": 1.0713,
"mean_token_accuracy": 0.7298086928824584,
"step": 1050
},
{
"epoch": 1.798925719132094,
"grad_norm": 1.0,
"learning_rate": 3.655047618061648e-05,
"loss": 1.0633,
"mean_token_accuracy": 0.7291242313881715,
"step": 1060
},
{
"epoch": 1.8158880486253446,
"grad_norm": 0.96875,
"learning_rate": 3.566959935749101e-05,
"loss": 1.062,
"mean_token_accuracy": 0.7298740123709043,
"step": 1070
},
{
"epoch": 1.832850378118595,
"grad_norm": 0.94921875,
"learning_rate": 3.479354235835622e-05,
"loss": 1.041,
"mean_token_accuracy": 0.7348680111269156,
"step": 1080
},
{
"epoch": 1.8498127076118454,
"grad_norm": 0.9375,
"learning_rate": 3.3922599832374226e-05,
"loss": 1.0601,
"mean_token_accuracy": 0.7297768058876196,
"step": 1090
},
{
"epoch": 1.8667750371050957,
"grad_norm": 0.9296875,
"learning_rate": 3.3057064708527686e-05,
"loss": 1.0516,
"mean_token_accuracy": 0.7326766779025395,
"step": 1100
},
{
"epoch": 1.8837373665983461,
"grad_norm": 0.90234375,
"learning_rate": 3.2197228097097346e-05,
"loss": 1.0737,
"mean_token_accuracy": 0.7286487720906735,
"step": 1110
},
{
"epoch": 1.9006996960915967,
"grad_norm": 0.90625,
"learning_rate": 3.1343379191751364e-05,
"loss": 1.0685,
"mean_token_accuracy": 0.7287176544467608,
"step": 1120
},
{
"epoch": 1.917662025584847,
"grad_norm": 0.83203125,
"learning_rate": 3.0495805172279167e-05,
"loss": 1.0279,
"mean_token_accuracy": 0.7362240366637707,
"step": 1130
},
{
"epoch": 1.9346243550780975,
"grad_norm": 0.9765625,
"learning_rate": 2.9654791108002567e-05,
"loss": 1.0658,
"mean_token_accuracy": 0.7271889204780261,
"step": 1140
},
{
"epoch": 1.9515866845713479,
"grad_norm": 0.96875,
"learning_rate": 2.8820619861896907e-05,
"loss": 1.0521,
"mean_token_accuracy": 0.7311085325976213,
"step": 1150
},
{
"epoch": 1.9685490140645983,
"grad_norm": 0.875,
"learning_rate": 2.7993571995454126e-05,
"loss": 1.0457,
"mean_token_accuracy": 0.7344075481096903,
"step": 1160
},
{
"epoch": 1.9855113435578486,
"grad_norm": 0.92578125,
"learning_rate": 2.7173925674319957e-05,
"loss": 1.0498,
"mean_token_accuracy": 0.7325132201115291,
"step": 1170
},
{
"epoch": 2.00339246589865,
"grad_norm": 1.40625,
"learning_rate": 2.6361956574736868e-05,
"loss": 1.139,
"mean_token_accuracy": 0.7362643013195116,
"step": 1180
},
{
"epoch": 2.0203547953919005,
"grad_norm": 1.0546875,
"learning_rate": 2.5557937790824382e-05,
"loss": 0.8593,
"mean_token_accuracy": 0.774844840914011,
"step": 1190
},
{
"epoch": 2.037317124885151,
"grad_norm": 1.0234375,
"learning_rate": 2.4762139742727797e-05,
"loss": 0.8657,
"mean_token_accuracy": 0.7745203738411267,
"step": 1200
},
{
"epoch": 2.0542794543784013,
"grad_norm": 0.9765625,
"learning_rate": 2.3974830085666237e-05,
"loss": 0.8389,
"mean_token_accuracy": 0.7786116595069568,
"step": 1210
},
{
"epoch": 2.0712417838716517,
"grad_norm": 1.0234375,
"learning_rate": 2.3196273619910637e-05,
"loss": 0.8757,
"mean_token_accuracy": 0.7714692994952201,
"step": 1220
},
{
"epoch": 2.088204113364902,
"grad_norm": 1.03125,
"learning_rate": 2.242673220172209e-05,
"loss": 0.869,
"mean_token_accuracy": 0.7752320093413194,
"step": 1230
},
{
"epoch": 2.1051664428581525,
"grad_norm": 1.0078125,
"learning_rate": 2.1666464655280133e-05,
"loss": 0.8593,
"mean_token_accuracy": 0.7770325655738513,
"step": 1240
},
{
"epoch": 2.122128772351403,
"grad_norm": 0.9765625,
"learning_rate": 2.0915726685631075e-05,
"loss": 0.8809,
"mean_token_accuracy": 0.7723904815812905,
"step": 1250
},
{
"epoch": 2.1390911018446532,
"grad_norm": 1.0625,
"learning_rate": 2.0174770792685243e-05,
"loss": 0.8523,
"mean_token_accuracy": 0.7762084101637204,
"step": 1260
},
{
"epoch": 2.1560534313379036,
"grad_norm": 1.015625,
"learning_rate": 1.9443846186292202e-05,
"loss": 0.8619,
"mean_token_accuracy": 0.7747002402941386,
"step": 1270
},
{
"epoch": 2.173015760831154,
"grad_norm": 0.9453125,
"learning_rate": 1.8723198702422724e-05,
"loss": 0.8542,
"mean_token_accuracy": 0.7765256710350513,
"step": 1280
},
{
"epoch": 2.1899780903244044,
"grad_norm": 0.921875,
"learning_rate": 1.8013070720485354e-05,
"loss": 0.8496,
"mean_token_accuracy": 0.7781426074604193,
"step": 1290
},
{
"epoch": 2.2069404198176548,
"grad_norm": 0.96875,
"learning_rate": 1.7313701081805506e-05,
"loss": 0.8945,
"mean_token_accuracy": 0.7692260307570299,
"step": 1300
},
{
"epoch": 2.2239027493109056,
"grad_norm": 0.984375,
"learning_rate": 1.6625325009294774e-05,
"loss": 0.8587,
"mean_token_accuracy": 0.7756107933819294,
"step": 1310
},
{
"epoch": 2.240865078804156,
"grad_norm": 1.0078125,
"learning_rate": 1.594817402833693e-05,
"loss": 0.8394,
"mean_token_accuracy": 0.7796109855175019,
"step": 1320
},
{
"epoch": 2.2578274082974064,
"grad_norm": 1.0078125,
"learning_rate": 1.5282475888917835e-05,
"loss": 0.8728,
"mean_token_accuracy": 0.7744500560065111,
"step": 1330
},
{
"epoch": 2.2747897377906567,
"grad_norm": 1.0,
"learning_rate": 1.4628454489024934e-05,
"loss": 0.8518,
"mean_token_accuracy": 0.777750201523304,
"step": 1340
},
{
"epoch": 2.291752067283907,
"grad_norm": 1.0859375,
"learning_rate": 1.398632979934235e-05,
"loss": 0.8548,
"mean_token_accuracy": 0.7758344347278278,
"step": 1350
},
{
"epoch": 2.3087143967771575,
"grad_norm": 1.0390625,
"learning_rate": 1.335631778926702e-05,
"loss": 0.8651,
"mean_token_accuracy": 0.7755650137861569,
"step": 1360
},
{
"epoch": 2.325676726270408,
"grad_norm": 1.0078125,
"learning_rate": 1.2738630354270437e-05,
"loss": 0.8697,
"mean_token_accuracy": 0.7738447397947311,
"step": 1370
},
{
"epoch": 2.3426390557636583,
"grad_norm": 0.984375,
"learning_rate": 1.2133475244630615e-05,
"loss": 0.8657,
"mean_token_accuracy": 0.7746792284150918,
"step": 1380
},
{
"epoch": 2.3596013852569087,
"grad_norm": 0.984375,
"learning_rate": 1.154105599555837e-05,
"loss": 0.8817,
"mean_token_accuracy": 0.7732328993578752,
"step": 1390
},
{
"epoch": 2.376563714750159,
"grad_norm": 0.9921875,
"learning_rate": 1.0961571858741088e-05,
"loss": 0.861,
"mean_token_accuracy": 0.7760703690350056,
"step": 1400
},
{
"epoch": 2.3935260442434094,
"grad_norm": 1.0625,
"learning_rate": 1.0395217735327362e-05,
"loss": 0.8615,
"mean_token_accuracy": 0.775514493137598,
"step": 1410
},
{
"epoch": 2.41048837373666,
"grad_norm": 0.984375,
"learning_rate": 9.84218411037477e-06,
"loss": 0.874,
"mean_token_accuracy": 0.7736226240793864,
"step": 1420
},
{
"epoch": 2.42745070322991,
"grad_norm": 1.0,
"learning_rate": 9.30265698878291e-06,
"loss": 0.8676,
"mean_token_accuracy": 0.7738830464581649,
"step": 1430
},
{
"epoch": 2.4444130327231606,
"grad_norm": 1.5,
"learning_rate": 8.776817832733436e-06,
"loss": 0.8733,
"mean_token_accuracy": 0.7751010999083519,
"step": 1440
},
{
"epoch": 2.461375362216411,
"grad_norm": 0.98046875,
"learning_rate": 8.264843500657799e-06,
"loss": 0.8521,
"mean_token_accuracy": 0.7777772823969523,
"step": 1450
},
{
"epoch": 2.4783376917096613,
"grad_norm": 0.9765625,
"learning_rate": 7.766906187753442e-06,
"loss": 0.8635,
"mean_token_accuracy": 0.773188495139281,
"step": 1460
},
{
"epoch": 2.4953000212029117,
"grad_norm": 0.96875,
"learning_rate": 7.283173368068497e-06,
"loss": 0.844,
"mean_token_accuracy": 0.7778890219827493,
"step": 1470
},
{
"epoch": 2.5122623506961625,
"grad_norm": 1.0546875,
"learning_rate": 6.8138077381742e-06,
"loss": 0.8645,
"mean_token_accuracy": 0.7747706746061643,
"step": 1480
},
{
"epoch": 2.5292246801894125,
"grad_norm": 1.015625,
"learning_rate": 6.3589671624443405e-06,
"loss": 0.868,
"mean_token_accuracy": 0.774104047069947,
"step": 1490
},
{
"epoch": 2.5461870096826633,
"grad_norm": 1.0234375,
"learning_rate": 5.918804619959806e-06,
"loss": 0.8732,
"mean_token_accuracy": 0.7745006288091342,
"step": 1500
},
{
"epoch": 2.5631493391759133,
"grad_norm": 0.98828125,
"learning_rate": 5.493468153056236e-06,
"loss": 0.8811,
"mean_token_accuracy": 0.7711024150252342,
"step": 1510
},
{
"epoch": 2.580111668669164,
"grad_norm": 0.96484375,
"learning_rate": 5.083100817532177e-06,
"loss": 0.8709,
"mean_token_accuracy": 0.7753738241891066,
"step": 1520
},
{
"epoch": 2.5970739981624145,
"grad_norm": 0.98046875,
"learning_rate": 4.687840634534302e-06,
"loss": 0.8596,
"mean_token_accuracy": 0.7756442760427793,
"step": 1530
},
{
"epoch": 2.614036327655665,
"grad_norm": 1.0234375,
"learning_rate": 4.307820544135937e-06,
"loss": 0.8553,
"mean_token_accuracy": 0.7786166049540043,
"step": 1540
},
{
"epoch": 2.6309986571489152,
"grad_norm": 1.03125,
"learning_rate": 3.943168360624672e-06,
"loss": 0.8614,
"mean_token_accuracy": 0.7758901623388131,
"step": 1550
},
{
"epoch": 2.6479609866421656,
"grad_norm": 1.0234375,
"learning_rate": 3.594006729513771e-06,
"loss": 0.8507,
"mean_token_accuracy": 0.7781767211854458,
"step": 1560
},
{
"epoch": 2.664923316135416,
"grad_norm": 1.0078125,
"learning_rate": 3.260453086292187e-06,
"loss": 0.8734,
"mean_token_accuracy": 0.7742785550653934,
"step": 1570
},
{
"epoch": 2.6818856456286664,
"grad_norm": 1.078125,
"learning_rate": 2.942619616926806e-06,
"loss": 0.8792,
"mean_token_accuracy": 0.772778149942557,
"step": 1580
},
{
"epoch": 2.6988479751219168,
"grad_norm": 1.0,
"learning_rate": 2.640613220130278e-06,
"loss": 0.8412,
"mean_token_accuracy": 0.7816825255751609,
"step": 1590
},
{
"epoch": 2.715810304615167,
"grad_norm": 0.96875,
"learning_rate": 2.3545354714072264e-06,
"loss": 0.8622,
"mean_token_accuracy": 0.7764831451078256,
"step": 1600
},
{
"epoch": 2.7327726341084175,
"grad_norm": 0.9296875,
"learning_rate": 2.0844825888907738e-06,
"loss": 0.8662,
"mean_token_accuracy": 0.7740148122111956,
"step": 1610
},
{
"epoch": 2.749734963601668,
"grad_norm": 1.0078125,
"learning_rate": 1.8305454009809097e-06,
"loss": 0.8801,
"mean_token_accuracy": 0.7726842557390531,
"step": 1620
},
{
"epoch": 2.7666972930949183,
"grad_norm": 1.0234375,
"learning_rate": 1.5928093157957403e-06,
"loss": 0.8601,
"mean_token_accuracy": 0.7756263218820095,
"step": 1630
},
{
"epoch": 2.7836596225881687,
"grad_norm": 0.97265625,
"learning_rate": 1.3713542924456479e-06,
"loss": 0.8507,
"mean_token_accuracy": 0.777658429245154,
"step": 1640
},
{
"epoch": 2.800621952081419,
"grad_norm": 1.03125,
"learning_rate": 1.1662548141402163e-06,
"loss": 0.8511,
"mean_token_accuracy": 0.7770614944398403,
"step": 1650
},
{
"epoch": 2.8175842815746694,
"grad_norm": 1.015625,
"learning_rate": 9.775798631368626e-07,
"loss": 0.8463,
"mean_token_accuracy": 0.7784749428431194,
"step": 1660
},
{
"epoch": 2.83454661106792,
"grad_norm": 1.03125,
"learning_rate": 8.053928975396418e-07,
"loss": 0.8785,
"mean_token_accuracy": 0.7732983765502771,
"step": 1670
},
{
"epoch": 2.85150894056117,
"grad_norm": 1.03125,
"learning_rate": 6.497518299560634e-07,
"loss": 0.8695,
"mean_token_accuracy": 0.7760962655146917,
"step": 1680
},
{
"epoch": 2.868471270054421,
"grad_norm": 0.9453125,
"learning_rate": 5.107090080189725e-07,
"loss": 0.844,
"mean_token_accuracy": 0.7783816205958526,
"step": 1690
},
{
"epoch": 2.885433599547671,
"grad_norm": 1.0234375,
"learning_rate": 3.883111967802111e-07,
"loss": 0.8782,
"mean_token_accuracy": 0.7730122750004133,
"step": 1700
},
{
"epoch": 2.902395929040922,
"grad_norm": 1.046875,
"learning_rate": 2.8259956298185566e-07,
"loss": 0.8752,
"mean_token_accuracy": 0.7732014996310075,
"step": 1710
},
{
"epoch": 2.919358258534172,
"grad_norm": 1.0078125,
"learning_rate": 1.9360966121035329e-07,
"loss": 0.8592,
"mean_token_accuracy": 0.7777578723927339,
"step": 1720
},
{
"epoch": 2.9363205880274226,
"grad_norm": 1.015625,
"learning_rate": 1.213714219382933e-07,
"loss": 0.8657,
"mean_token_accuracy": 0.7750948662559192,
"step": 1730
},
{
"epoch": 2.953282917520673,
"grad_norm": 1.015625,
"learning_rate": 6.59091414576929e-08,
"loss": 0.8642,
"mean_token_accuracy": 0.7749369906882445,
"step": 1740
},
{
"epoch": 2.9702452470139233,
"grad_norm": 1.03125,
"learning_rate": 2.7241473708283783e-08,
"loss": 0.8584,
"mean_token_accuracy": 0.7779290979107221,
"step": 1750
},
{
"epoch": 2.9872075765071737,
"grad_norm": 0.94921875,
"learning_rate": 5.381424003553237e-09,
"loss": 0.8599,
"mean_token_accuracy": 0.7751329804460207,
"step": 1760
}
],
"logging_steps": 10,
"max_steps": 1767,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.981901778352865e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}