gate_instruct / trainer_state.json
IvenWu123's picture
Upload folder using huggingface_hub
881c70f verified
Raw
History Blame Contribute Delete
54.4 kB
{
"best_global_step": 12000,
"best_metric": 1.75405061,
"best_model_checkpoint": "/scratch/prj0000000267/yuefan/UnifyTrajLLM/output_rope_instruct_gate_llmlow5_5e-4/v3-20251108-210106/checkpoint-12000",
"epoch": 1.5774944130406205,
"eval_steps": 2000,
"global_step": 12000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00013145786775338504,
"grad_norm": 268.0,
"learning_rate": 1.314060446780552e-07,
"loss": 30.039020538330078,
"step": 1,
"token_acc": 0.0
},
{
"epoch": 0.006572893387669252,
"grad_norm": 47.5,
"learning_rate": 6.5703022339027605e-06,
"loss": 24.82280123963648,
"step": 50,
"token_acc": 0.036200167198766994
},
{
"epoch": 0.013145786775338505,
"grad_norm": 59.5,
"learning_rate": 1.3140604467805521e-05,
"loss": 14.215625,
"step": 100,
"token_acc": 0.43851580932023954
},
{
"epoch": 0.019718680163007755,
"grad_norm": 10.1875,
"learning_rate": 1.9710906701708278e-05,
"loss": 6.50881591796875,
"step": 150,
"token_acc": 0.6614513467387406
},
{
"epoch": 0.02629157355067701,
"grad_norm": 32.25,
"learning_rate": 2.6281208935611042e-05,
"loss": 6.06468505859375,
"step": 200,
"token_acc": 0.6807562909100595
},
{
"epoch": 0.03286446693834626,
"grad_norm": 10.9375,
"learning_rate": 3.2851511169513796e-05,
"loss": 5.703701171875,
"step": 250,
"token_acc": 0.6826512527189237
},
{
"epoch": 0.03943736032601551,
"grad_norm": 14.3125,
"learning_rate": 3.9421813403416556e-05,
"loss": 5.412930908203125,
"step": 300,
"token_acc": 0.6959136632453732
},
{
"epoch": 0.046010253713684765,
"grad_norm": 7.03125,
"learning_rate": 4.5992115637319317e-05,
"loss": 5.1832080078125,
"step": 350,
"token_acc": 0.6958016739273748
},
{
"epoch": 0.05258314710135402,
"grad_norm": 6.125,
"learning_rate": 5.2562417871222084e-05,
"loss": 5.019949340820313,
"step": 400,
"token_acc": 0.7036372985827332
},
{
"epoch": 0.059156040489023266,
"grad_norm": 5.09375,
"learning_rate": 5.913272010512484e-05,
"loss": 4.91180908203125,
"step": 450,
"token_acc": 0.7090584023609108
},
{
"epoch": 0.06572893387669251,
"grad_norm": 6.90625,
"learning_rate": 6.570302233902759e-05,
"loss": 4.86035888671875,
"step": 500,
"token_acc": 0.7103483463054483
},
{
"epoch": 0.07230182726436177,
"grad_norm": 4.21875,
"learning_rate": 7.227332457293036e-05,
"loss": 4.782819213867188,
"step": 550,
"token_acc": 0.711807526156599
},
{
"epoch": 0.07887472065203102,
"grad_norm": 4.3125,
"learning_rate": 7.884362680683311e-05,
"loss": 4.707298889160156,
"step": 600,
"token_acc": 0.7184096746149201
},
{
"epoch": 0.08544761403970028,
"grad_norm": 5.25,
"learning_rate": 8.541392904073588e-05,
"loss": 4.594347534179687,
"step": 650,
"token_acc": 0.7216674015037796
},
{
"epoch": 0.09202050742736953,
"grad_norm": 3.609375,
"learning_rate": 9.198423127463863e-05,
"loss": 4.58609375,
"step": 700,
"token_acc": 0.714123727321825
},
{
"epoch": 0.09859340081503878,
"grad_norm": 3.421875,
"learning_rate": 9.85545335085414e-05,
"loss": 4.494258422851562,
"step": 750,
"token_acc": 0.7178661247995335
},
{
"epoch": 0.10516629420270804,
"grad_norm": 4.75,
"learning_rate": 9.999820340427517e-05,
"loss": 4.393697509765625,
"step": 800,
"token_acc": 0.719993257471585
},
{
"epoch": 0.11173918759037728,
"grad_norm": 3.28125,
"learning_rate": 9.999064399990964e-05,
"loss": 4.324550170898437,
"step": 850,
"token_acc": 0.7193472614965004
},
{
"epoch": 0.11831208097804653,
"grad_norm": 3.359375,
"learning_rate": 9.997717975457807e-05,
"loss": 4.1653680419921875,
"step": 900,
"token_acc": 0.7247381121504013
},
{
"epoch": 0.12488497436571579,
"grad_norm": 1.5546875,
"learning_rate": 9.995781225866254e-05,
"loss": 4.05832275390625,
"step": 950,
"token_acc": 0.7224337955208558
},
{
"epoch": 0.13145786775338503,
"grad_norm": 2.21875,
"learning_rate": 9.993254379983084e-05,
"loss": 3.977420654296875,
"step": 1000,
"token_acc": 0.7213713367669359
},
{
"epoch": 0.13803076114105428,
"grad_norm": 1.59375,
"learning_rate": 9.990137736276604e-05,
"loss": 3.8511199951171875,
"step": 1050,
"token_acc": 0.7259079629296982
},
{
"epoch": 0.14460365452872354,
"grad_norm": 2.109375,
"learning_rate": 9.98643166288141e-05,
"loss": 3.7238555908203126,
"step": 1100,
"token_acc": 0.729174537368435
},
{
"epoch": 0.1511765479163928,
"grad_norm": 3.484375,
"learning_rate": 9.982136597554896e-05,
"loss": 3.6605801391601562,
"step": 1150,
"token_acc": 0.7267878333802646
},
{
"epoch": 0.15774944130406204,
"grad_norm": 1.765625,
"learning_rate": 9.977253047625546e-05,
"loss": 3.586345520019531,
"step": 1200,
"token_acc": 0.7296032337886851
},
{
"epoch": 0.1643223346917313,
"grad_norm": 1.875,
"learning_rate": 9.971781589933012e-05,
"loss": 3.5275897216796874,
"step": 1250,
"token_acc": 0.7274431196530496
},
{
"epoch": 0.17089522807940055,
"grad_norm": 5.34375,
"learning_rate": 9.965722870759977e-05,
"loss": 3.4567681884765626,
"step": 1300,
"token_acc": 0.7260510593991347
},
{
"epoch": 0.1774681214670698,
"grad_norm": 2.984375,
"learning_rate": 9.959077605755818e-05,
"loss": 3.35341064453125,
"step": 1350,
"token_acc": 0.7315002945127964
},
{
"epoch": 0.18404101485473906,
"grad_norm": 2.640625,
"learning_rate": 9.951846579852069e-05,
"loss": 3.2548678588867186,
"step": 1400,
"token_acc": 0.7334485568361279
},
{
"epoch": 0.19061390824240831,
"grad_norm": 2.328125,
"learning_rate": 9.944030647169715e-05,
"loss": 3.1699752807617188,
"step": 1450,
"token_acc": 0.7346018069265517
},
{
"epoch": 0.19718680163007757,
"grad_norm": 3.75,
"learning_rate": 9.935630730918297e-05,
"loss": 3.123944091796875,
"step": 1500,
"token_acc": 0.7325574233567774
},
{
"epoch": 0.20375969501774682,
"grad_norm": 2.328125,
"learning_rate": 9.926647823286865e-05,
"loss": 3.031203308105469,
"step": 1550,
"token_acc": 0.7343244664345068
},
{
"epoch": 0.21033258840541608,
"grad_norm": 2.09375,
"learning_rate": 9.917082985326782e-05,
"loss": 2.9396633911132812,
"step": 1600,
"token_acc": 0.736209056167852
},
{
"epoch": 0.2169054817930853,
"grad_norm": 1.921875,
"learning_rate": 9.906937346826395e-05,
"loss": 2.8921356201171875,
"step": 1650,
"token_acc": 0.7373535529118604
},
{
"epoch": 0.22347837518075456,
"grad_norm": 2.015625,
"learning_rate": 9.896212106177583e-05,
"loss": 2.8311395263671875,
"step": 1700,
"token_acc": 0.7392403929710977
},
{
"epoch": 0.2300512685684238,
"grad_norm": 2.015625,
"learning_rate": 9.884908530234208e-05,
"loss": 2.7363882446289063,
"step": 1750,
"token_acc": 0.7410795625843831
},
{
"epoch": 0.23662416195609307,
"grad_norm": 1.5546875,
"learning_rate": 9.873027954162471e-05,
"loss": 2.6730242919921876,
"step": 1800,
"token_acc": 0.7443422077792354
},
{
"epoch": 0.24319705534376232,
"grad_norm": 1.84375,
"learning_rate": 9.860571781283208e-05,
"loss": 2.6252935791015624,
"step": 1850,
"token_acc": 0.7444647858608681
},
{
"epoch": 0.24976994873143157,
"grad_norm": 1.6640625,
"learning_rate": 9.847541482906129e-05,
"loss": 2.5712957763671875,
"step": 1900,
"token_acc": 0.7508503287266872
},
{
"epoch": 0.25634284211910086,
"grad_norm": 1.265625,
"learning_rate": 9.833938598156025e-05,
"loss": 2.5640655517578126,
"step": 1950,
"token_acc": 0.7425811658922213
},
{
"epoch": 0.26291573550677005,
"grad_norm": 1.1953125,
"learning_rate": 9.819764733790979e-05,
"loss": 2.5158842468261717,
"step": 2000,
"token_acc": 0.7452882362784471
},
{
"epoch": 0.26291573550677005,
"eval_loss": 2.518305778503418,
"eval_runtime": 236.0729,
"eval_samples_per_second": 186.692,
"eval_steps_per_second": 2.919,
"eval_token_acc": 0.749429720552148,
"step": 2000
},
{
"epoch": 0.2694886288944393,
"grad_norm": 1.4765625,
"learning_rate": 9.805021564012564e-05,
"loss": 2.4857614135742185,
"step": 2050,
"token_acc": 0.7441553323650091
},
{
"epoch": 0.27606152228210856,
"grad_norm": 2.421875,
"learning_rate": 9.789710830268099e-05,
"loss": 2.450667724609375,
"step": 2100,
"token_acc": 0.7500234051090009
},
{
"epoch": 0.2826344156697778,
"grad_norm": 1.046875,
"learning_rate": 9.773834341044944e-05,
"loss": 2.4290037536621094,
"step": 2150,
"token_acc": 0.7513961437248966
},
{
"epoch": 0.28920730905744707,
"grad_norm": 1.8203125,
"learning_rate": 9.757393971656888e-05,
"loss": 2.413728942871094,
"step": 2200,
"token_acc": 0.7452505502003465
},
{
"epoch": 0.2957802024451163,
"grad_norm": 1.7578125,
"learning_rate": 9.740391664022633e-05,
"loss": 2.3729684448242185,
"step": 2250,
"token_acc": 0.752083845606853
},
{
"epoch": 0.3023530958327856,
"grad_norm": 1.2421875,
"learning_rate": 9.722829426436427e-05,
"loss": 2.3652894592285154,
"step": 2300,
"token_acc": 0.7517237172802866
},
{
"epoch": 0.30892598922045483,
"grad_norm": 0.8125,
"learning_rate": 9.704709333330836e-05,
"loss": 2.356060791015625,
"step": 2350,
"token_acc": 0.7475131194646989
},
{
"epoch": 0.3154988826081241,
"grad_norm": 0.90625,
"learning_rate": 9.686033525031719e-05,
"loss": 2.3459547424316405,
"step": 2400,
"token_acc": 0.7465522261190833
},
{
"epoch": 0.32207177599579334,
"grad_norm": 0.86328125,
"learning_rate": 9.666804207505414e-05,
"loss": 2.34242919921875,
"step": 2450,
"token_acc": 0.7503000046993743
},
{
"epoch": 0.3286446693834626,
"grad_norm": 1.203125,
"learning_rate": 9.647023652098174e-05,
"loss": 2.30553955078125,
"step": 2500,
"token_acc": 0.7505765418279411
},
{
"epoch": 0.33521756277113185,
"grad_norm": 1.296875,
"learning_rate": 9.626694195267876e-05,
"loss": 2.2867636108398437,
"step": 2550,
"token_acc": 0.7495737322589445
},
{
"epoch": 0.3417904561588011,
"grad_norm": 1.3046875,
"learning_rate": 9.605818238308038e-05,
"loss": 2.2838902282714844,
"step": 2600,
"token_acc": 0.7510741453019647
},
{
"epoch": 0.34836334954647036,
"grad_norm": 1.2578125,
"learning_rate": 9.584398247064188e-05,
"loss": 2.2479782104492188,
"step": 2650,
"token_acc": 0.7525476660092044
},
{
"epoch": 0.3549362429341396,
"grad_norm": 0.98046875,
"learning_rate": 9.562436751642593e-05,
"loss": 2.2379521179199218,
"step": 2700,
"token_acc": 0.7535541690112872
},
{
"epoch": 0.36150913632180887,
"grad_norm": 1.2109375,
"learning_rate": 9.539936346111416e-05,
"loss": 2.25480712890625,
"step": 2750,
"token_acc": 0.7502481934500133
},
{
"epoch": 0.3680820297094781,
"grad_norm": 0.73046875,
"learning_rate": 9.516899688194294e-05,
"loss": 2.1890530395507812,
"step": 2800,
"token_acc": 0.7559071920628226
},
{
"epoch": 0.3746549230971474,
"grad_norm": 0.99609375,
"learning_rate": 9.493329498956421e-05,
"loss": 2.2252967834472654,
"step": 2850,
"token_acc": 0.7540685282249956
},
{
"epoch": 0.38122781648481663,
"grad_norm": 0.875,
"learning_rate": 9.469228562483132e-05,
"loss": 2.211038818359375,
"step": 2900,
"token_acc": 0.7534921970366274
},
{
"epoch": 0.3878007098724859,
"grad_norm": 1.0625,
"learning_rate": 9.444599725551061e-05,
"loss": 2.1635357666015627,
"step": 2950,
"token_acc": 0.7530642715579734
},
{
"epoch": 0.39437360326015514,
"grad_norm": 1.265625,
"learning_rate": 9.419445897291867e-05,
"loss": 2.1792333984375,
"step": 3000,
"token_acc": 0.7530076526518494
},
{
"epoch": 0.4009464966478244,
"grad_norm": 1.0390625,
"learning_rate": 9.393770048848622e-05,
"loss": 2.168623352050781,
"step": 3050,
"token_acc": 0.7545220973858319
},
{
"epoch": 0.40751939003549364,
"grad_norm": 0.703125,
"learning_rate": 9.367575213024861e-05,
"loss": 2.1656561279296875,
"step": 3100,
"token_acc": 0.7529528081537318
},
{
"epoch": 0.4140922834231629,
"grad_norm": 0.69921875,
"learning_rate": 9.340864483926343e-05,
"loss": 2.147900390625,
"step": 3150,
"token_acc": 0.7535216548028473
},
{
"epoch": 0.42066517681083215,
"grad_norm": 0.703125,
"learning_rate": 9.313641016595588e-05,
"loss": 2.1436308288574217,
"step": 3200,
"token_acc": 0.754756994891503
},
{
"epoch": 0.4272380701985014,
"grad_norm": 0.91796875,
"learning_rate": 9.285908026639207e-05,
"loss": 2.1488153076171876,
"step": 3250,
"token_acc": 0.7516223648809727
},
{
"epoch": 0.4338109635861706,
"grad_norm": 0.80859375,
"learning_rate": 9.257668789848067e-05,
"loss": 2.1125421142578125,
"step": 3300,
"token_acc": 0.7542638775798605
},
{
"epoch": 0.44038385697383986,
"grad_norm": 0.94921875,
"learning_rate": 9.228926641810367e-05,
"loss": 2.127976379394531,
"step": 3350,
"token_acc": 0.7534171662400647
},
{
"epoch": 0.4469567503615091,
"grad_norm": 0.90234375,
"learning_rate": 9.199684977517645e-05,
"loss": 2.117357025146484,
"step": 3400,
"token_acc": 0.754085423576444
},
{
"epoch": 0.45352964374917837,
"grad_norm": 0.82421875,
"learning_rate": 9.169947250963753e-05,
"loss": 2.1096246337890623,
"step": 3450,
"token_acc": 0.7573267065803522
},
{
"epoch": 0.4601025371368476,
"grad_norm": 0.68359375,
"learning_rate": 9.139716974736889e-05,
"loss": 2.08451904296875,
"step": 3500,
"token_acc": 0.7560729910200322
},
{
"epoch": 0.4666754305245169,
"grad_norm": 0.94140625,
"learning_rate": 9.108997719604687e-05,
"loss": 2.0704086303710936,
"step": 3550,
"token_acc": 0.758234912558547
},
{
"epoch": 0.47324832391218613,
"grad_norm": 0.94921875,
"learning_rate": 9.077793114092435e-05,
"loss": 2.0744793701171873,
"step": 3600,
"token_acc": 0.7553985593878892
},
{
"epoch": 0.4798212172998554,
"grad_norm": 0.87109375,
"learning_rate": 9.046106844054491e-05,
"loss": 2.055031433105469,
"step": 3650,
"token_acc": 0.7576514412150057
},
{
"epoch": 0.48639411068752464,
"grad_norm": 0.6875,
"learning_rate": 9.013942652238908e-05,
"loss": 2.0411907958984377,
"step": 3700,
"token_acc": 0.7550962098257407
},
{
"epoch": 0.4929670040751939,
"grad_norm": 0.75390625,
"learning_rate": 8.981304337845337e-05,
"loss": 2.0571356201171875,
"step": 3750,
"token_acc": 0.7560834470136957
},
{
"epoch": 0.49953989746286315,
"grad_norm": 0.67578125,
"learning_rate": 8.948195756076285e-05,
"loss": 2.074111022949219,
"step": 3800,
"token_acc": 0.7541681281518444
},
{
"epoch": 0.5061127908505324,
"grad_norm": 0.69140625,
"learning_rate": 8.914620817681729e-05,
"loss": 2.0392041015625,
"step": 3850,
"token_acc": 0.7577290307595985
},
{
"epoch": 0.5126856842382017,
"grad_norm": 1.0703125,
"learning_rate": 8.880583488497192e-05,
"loss": 2.0631610107421876,
"step": 3900,
"token_acc": 0.7556020800972513
},
{
"epoch": 0.5192585776258709,
"grad_norm": 0.69921875,
"learning_rate": 8.846087788975292e-05,
"loss": 2.0386505126953125,
"step": 3950,
"token_acc": 0.7588939592425994
},
{
"epoch": 0.5258314710135401,
"grad_norm": 0.9375,
"learning_rate": 8.811137793710863e-05,
"loss": 2.0313320922851563,
"step": 4000,
"token_acc": 0.7567113841020556
},
{
"epoch": 0.5258314710135401,
"eval_loss": 2.076582908630371,
"eval_runtime": 235.7989,
"eval_samples_per_second": 186.909,
"eval_steps_per_second": 2.922,
"eval_token_acc": 0.7596436572643577,
"step": 4000
},
{
"epoch": 0.5324043644012094,
"grad_norm": 0.82421875,
"learning_rate": 8.775737630959662e-05,
"loss": 2.0022723388671877,
"step": 4050,
"token_acc": 0.7579540762299285
},
{
"epoch": 0.5389772577888786,
"grad_norm": 0.93359375,
"learning_rate": 8.739891482150741e-05,
"loss": 2.0245912170410154,
"step": 4100,
"token_acc": 0.7564656900076155
},
{
"epoch": 0.5455501511765479,
"grad_norm": 0.76171875,
"learning_rate": 8.703603581392546e-05,
"loss": 2.0100286865234374,
"step": 4150,
"token_acc": 0.7567689057436112
},
{
"epoch": 0.5521230445642171,
"grad_norm": 0.8359375,
"learning_rate": 8.666878214972783e-05,
"loss": 2.021333923339844,
"step": 4200,
"token_acc": 0.7570281681148902
},
{
"epoch": 0.5586959379518864,
"grad_norm": 0.8046875,
"learning_rate": 8.629719720852138e-05,
"loss": 2.0115155029296874,
"step": 4250,
"token_acc": 0.7570202860215485
},
{
"epoch": 0.5652688313395556,
"grad_norm": 0.80859375,
"learning_rate": 8.59213248815187e-05,
"loss": 2.014189910888672,
"step": 4300,
"token_acc": 0.7566674687681726
},
{
"epoch": 0.5718417247272249,
"grad_norm": 0.69140625,
"learning_rate": 8.554120956635375e-05,
"loss": 1.9902659606933595,
"step": 4350,
"token_acc": 0.7575061325565429
},
{
"epoch": 0.5784146181148941,
"grad_norm": 0.7421875,
"learning_rate": 8.515689616183769e-05,
"loss": 1.9776287841796876,
"step": 4400,
"token_acc": 0.7585736758424619
},
{
"epoch": 0.5849875115025635,
"grad_norm": 0.609375,
"learning_rate": 8.476843006265545e-05,
"loss": 1.97283447265625,
"step": 4450,
"token_acc": 0.754149127932067
},
{
"epoch": 0.5915604048902326,
"grad_norm": 0.79296875,
"learning_rate": 8.437585715400384e-05,
"loss": 1.9807916259765626,
"step": 4500,
"token_acc": 0.7596238589600954
},
{
"epoch": 0.598133298277902,
"grad_norm": 0.84765625,
"learning_rate": 8.39792238061715e-05,
"loss": 1.9481539916992188,
"step": 4550,
"token_acc": 0.7577328978855814
},
{
"epoch": 0.6047061916655712,
"grad_norm": 0.875,
"learning_rate": 8.357857686906182e-05,
"loss": 1.9619242858886718,
"step": 4600,
"token_acc": 0.7588518194659313
},
{
"epoch": 0.6112790850532405,
"grad_norm": 0.66796875,
"learning_rate": 8.317396366665899e-05,
"loss": 1.9576710510253905,
"step": 4650,
"token_acc": 0.7596985079347987
},
{
"epoch": 0.6178519784409097,
"grad_norm": 0.6953125,
"learning_rate": 8.27654319914382e-05,
"loss": 1.9588572692871093,
"step": 4700,
"token_acc": 0.757741116751269
},
{
"epoch": 0.624424871828579,
"grad_norm": 0.6328125,
"learning_rate": 8.235303009872043e-05,
"loss": 1.954942626953125,
"step": 4750,
"token_acc": 0.761324026042421
},
{
"epoch": 0.6309977652162482,
"grad_norm": 0.94140625,
"learning_rate": 8.193680670097257e-05,
"loss": 1.9273374938964845,
"step": 4800,
"token_acc": 0.7592756976720991
},
{
"epoch": 0.6375706586039175,
"grad_norm": 0.73828125,
"learning_rate": 8.151681096205356e-05,
"loss": 1.94022216796875,
"step": 4850,
"token_acc": 0.7566143414515606
},
{
"epoch": 0.6441435519915867,
"grad_norm": 0.90234375,
"learning_rate": 8.109309249140721e-05,
"loss": 1.9436038208007813,
"step": 4900,
"token_acc": 0.7617455006768156
},
{
"epoch": 0.650716445379256,
"grad_norm": 0.83203125,
"learning_rate": 8.06657013382024e-05,
"loss": 1.937064208984375,
"step": 4950,
"token_acc": 0.7591963945791783
},
{
"epoch": 0.6572893387669252,
"grad_norm": 0.9140625,
"learning_rate": 8.023468798542127e-05,
"loss": 1.9416938781738282,
"step": 5000,
"token_acc": 0.7584784429628924
},
{
"epoch": 0.6638622321545945,
"grad_norm": 0.87890625,
"learning_rate": 7.980010334389636e-05,
"loss": 1.9161361694335937,
"step": 5050,
"token_acc": 0.7541581670687739
},
{
"epoch": 0.6704351255422637,
"grad_norm": 0.67578125,
"learning_rate": 7.936199874629689e-05,
"loss": 1.9216696166992187,
"step": 5100,
"token_acc": 0.7606859291730552
},
{
"epoch": 0.677008018929933,
"grad_norm": 0.69921875,
"learning_rate": 7.892042594106555e-05,
"loss": 1.9201712036132812,
"step": 5150,
"token_acc": 0.7614792261222095
},
{
"epoch": 0.6835809123176022,
"grad_norm": 0.6640625,
"learning_rate": 7.847543708630593e-05,
"loss": 1.8924771118164063,
"step": 5200,
"token_acc": 0.7622566250217598
},
{
"epoch": 0.6901538057052714,
"grad_norm": 0.69921875,
"learning_rate": 7.80270847436218e-05,
"loss": 1.8791021728515624,
"step": 5250,
"token_acc": 0.7611149879556877
},
{
"epoch": 0.6967266990929407,
"grad_norm": 0.63671875,
"learning_rate": 7.757542187190838e-05,
"loss": 1.8818046569824218,
"step": 5300,
"token_acc": 0.7627985225662821
},
{
"epoch": 0.7032995924806099,
"grad_norm": 0.78515625,
"learning_rate": 7.712050182109711e-05,
"loss": 1.9103680419921876,
"step": 5350,
"token_acc": 0.7583346132272023
},
{
"epoch": 0.7098724858682792,
"grad_norm": 0.61328125,
"learning_rate": 7.666237832585382e-05,
"loss": 1.8824064636230469,
"step": 5400,
"token_acc": 0.7605793230321414
},
{
"epoch": 0.7164453792559484,
"grad_norm": 0.78515625,
"learning_rate": 7.620110549923181e-05,
"loss": 1.877305450439453,
"step": 5450,
"token_acc": 0.7612452387234705
},
{
"epoch": 0.7230182726436177,
"grad_norm": 0.7578125,
"learning_rate": 7.573673782628e-05,
"loss": 1.904554443359375,
"step": 5500,
"token_acc": 0.7584822432750704
},
{
"epoch": 0.7295911660312869,
"grad_norm": 0.86328125,
"learning_rate": 7.526933015760717e-05,
"loss": 1.8621942138671874,
"step": 5550,
"token_acc": 0.7622160103275084
},
{
"epoch": 0.7361640594189562,
"grad_norm": 0.76171875,
"learning_rate": 7.479893770290321e-05,
"loss": 1.8591368103027344,
"step": 5600,
"token_acc": 0.7647176822254823
},
{
"epoch": 0.7427369528066254,
"grad_norm": 0.76171875,
"learning_rate": 7.43256160244176e-05,
"loss": 1.860885009765625,
"step": 5650,
"token_acc": 0.7656942497192635
},
{
"epoch": 0.7493098461942947,
"grad_norm": 0.62890625,
"learning_rate": 7.38494210303967e-05,
"loss": 1.8532620239257813,
"step": 5700,
"token_acc": 0.7606205964388161
},
{
"epoch": 0.755882739581964,
"grad_norm": 0.765625,
"learning_rate": 7.337040896847967e-05,
"loss": 1.8677340698242189,
"step": 5750,
"token_acc": 0.7610945995293353
},
{
"epoch": 0.7624556329696333,
"grad_norm": 0.63671875,
"learning_rate": 7.288863641905481e-05,
"loss": 1.8541110229492188,
"step": 5800,
"token_acc": 0.7597710414081623
},
{
"epoch": 0.7690285263573025,
"grad_norm": 0.7265625,
"learning_rate": 7.240416028857617e-05,
"loss": 1.8557376098632812,
"step": 5850,
"token_acc": 0.7620176547719397
},
{
"epoch": 0.7756014197449718,
"grad_norm": 0.75390625,
"learning_rate": 7.191703780284187e-05,
"loss": 1.8637747192382812,
"step": 5900,
"token_acc": 0.7612350143995713
},
{
"epoch": 0.782174313132641,
"grad_norm": 0.7421875,
"learning_rate": 7.14273265002347e-05,
"loss": 1.8630572509765626,
"step": 5950,
"token_acc": 0.762536667545336
},
{
"epoch": 0.7887472065203103,
"grad_norm": 0.76171875,
"learning_rate": 7.093508422492568e-05,
"loss": 1.8501144409179688,
"step": 6000,
"token_acc": 0.7612453038222803
},
{
"epoch": 0.7887472065203103,
"eval_loss": 1.9278579950332642,
"eval_runtime": 236.8225,
"eval_samples_per_second": 186.101,
"eval_steps_per_second": 2.909,
"eval_token_acc": 0.7643631778364091,
"step": 6000
},
{
"epoch": 0.7953200999079795,
"grad_norm": 0.9609375,
"learning_rate": 7.044036912004159e-05,
"loss": 1.8581178283691406,
"step": 6050,
"token_acc": 0.7597096150613548
},
{
"epoch": 0.8018929932956488,
"grad_norm": 0.66015625,
"learning_rate": 6.99432396207972e-05,
"loss": 1.85968994140625,
"step": 6100,
"token_acc": 0.760983003636121
},
{
"epoch": 0.808465886683318,
"grad_norm": 0.8984375,
"learning_rate": 6.94437544475929e-05,
"loss": 1.8418545532226562,
"step": 6150,
"token_acc": 0.7633178669389553
},
{
"epoch": 0.8150387800709873,
"grad_norm": 0.640625,
"learning_rate": 6.894197259907879e-05,
"loss": 1.8265931701660156,
"step": 6200,
"token_acc": 0.7654604394264699
},
{
"epoch": 0.8216116734586565,
"grad_norm": 0.6953125,
"learning_rate": 6.843795334518576e-05,
"loss": 1.8271298217773437,
"step": 6250,
"token_acc": 0.7637430879196561
},
{
"epoch": 0.8281845668463258,
"grad_norm": 0.8203125,
"learning_rate": 6.79317562201246e-05,
"loss": 1.8288789367675782,
"step": 6300,
"token_acc": 0.7608325763635624
},
{
"epoch": 0.834757460233995,
"grad_norm": 0.6953125,
"learning_rate": 6.742344101535394e-05,
"loss": 1.8086236572265626,
"step": 6350,
"token_acc": 0.7633120515197936
},
{
"epoch": 0.8413303536216643,
"grad_norm": 0.55859375,
"learning_rate": 6.691306777251762e-05,
"loss": 1.7932760620117187,
"step": 6400,
"token_acc": 0.7665305845357507
},
{
"epoch": 0.8479032470093335,
"grad_norm": 0.6484375,
"learning_rate": 6.640069677635282e-05,
"loss": 1.8109786987304688,
"step": 6450,
"token_acc": 0.7631320021044253
},
{
"epoch": 0.8544761403970028,
"grad_norm": 0.5625,
"learning_rate": 6.58863885475691e-05,
"loss": 1.7983740234375,
"step": 6500,
"token_acc": 0.7650729466919018
},
{
"epoch": 0.861049033784672,
"grad_norm": 0.625,
"learning_rate": 6.537020383569988e-05,
"loss": 1.8120062255859375,
"step": 6550,
"token_acc": 0.7633286718136009
},
{
"epoch": 0.8676219271723412,
"grad_norm": 0.68359375,
"learning_rate": 6.485220361192677e-05,
"loss": 1.826031951904297,
"step": 6600,
"token_acc": 0.7621315968131627
},
{
"epoch": 0.8741948205600105,
"grad_norm": 0.7421875,
"learning_rate": 6.433244906187763e-05,
"loss": 1.8025027465820314,
"step": 6650,
"token_acc": 0.763683785326105
},
{
"epoch": 0.8807677139476797,
"grad_norm": 0.61328125,
"learning_rate": 6.381100157839948e-05,
"loss": 1.8083682250976563,
"step": 6700,
"token_acc": 0.7638322653360178
},
{
"epoch": 0.887340607335349,
"grad_norm": 0.55078125,
"learning_rate": 6.328792275430682e-05,
"loss": 1.8106515502929688,
"step": 6750,
"token_acc": 0.7645292486420634
},
{
"epoch": 0.8939135007230182,
"grad_norm": 0.6171875,
"learning_rate": 6.276327437510636e-05,
"loss": 1.7926376342773438,
"step": 6800,
"token_acc": 0.7669320628731736
},
{
"epoch": 0.9004863941106875,
"grad_norm": 0.7578125,
"learning_rate": 6.22371184116989e-05,
"loss": 1.7977276611328126,
"step": 6850,
"token_acc": 0.7638069323509712
},
{
"epoch": 0.9070592874983567,
"grad_norm": 0.609375,
"learning_rate": 6.170951701305951e-05,
"loss": 1.8151174926757812,
"step": 6900,
"token_acc": 0.7588274415858517
},
{
"epoch": 0.913632180886026,
"grad_norm": 0.64453125,
"learning_rate": 6.118053249889652e-05,
"loss": 1.7749380493164062,
"step": 6950,
"token_acc": 0.7635055545232533
},
{
"epoch": 0.9202050742736952,
"grad_norm": 0.71875,
"learning_rate": 6.0650227352290345e-05,
"loss": 1.7828396606445311,
"step": 7000,
"token_acc": 0.7631919048643416
},
{
"epoch": 0.9267779676613646,
"grad_norm": 0.55859375,
"learning_rate": 6.011866421231309e-05,
"loss": 1.7750047302246095,
"step": 7050,
"token_acc": 0.7658884744785802
},
{
"epoch": 0.9333508610490338,
"grad_norm": 0.703125,
"learning_rate": 5.9585905866629687e-05,
"loss": 1.7743110656738281,
"step": 7100,
"token_acc": 0.764956263144499
},
{
"epoch": 0.9399237544367031,
"grad_norm": 0.70703125,
"learning_rate": 5.905201524408148e-05,
"loss": 1.7661270141601562,
"step": 7150,
"token_acc": 0.7625542988555213
},
{
"epoch": 0.9464966478243723,
"grad_norm": 0.62890625,
"learning_rate": 5.8517055407253115e-05,
"loss": 1.7674331665039062,
"step": 7200,
"token_acc": 0.7658641448139589
},
{
"epoch": 0.9530695412120416,
"grad_norm": 0.7265625,
"learning_rate": 5.798108954502368e-05,
"loss": 1.76580810546875,
"step": 7250,
"token_acc": 0.764363801032948
},
{
"epoch": 0.9596424345997108,
"grad_norm": 0.6953125,
"learning_rate": 5.7444180965102936e-05,
"loss": 1.764315185546875,
"step": 7300,
"token_acc": 0.765748932533409
},
{
"epoch": 0.9662153279873801,
"grad_norm": 0.5390625,
"learning_rate": 5.69063930865534e-05,
"loss": 1.7479220581054689,
"step": 7350,
"token_acc": 0.7640023759364084
},
{
"epoch": 0.9727882213750493,
"grad_norm": 0.6796875,
"learning_rate": 5.63677894322994e-05,
"loss": 1.783513641357422,
"step": 7400,
"token_acc": 0.7652048454713193
},
{
"epoch": 0.9793611147627186,
"grad_norm": 0.6484375,
"learning_rate": 5.5828433621623845e-05,
"loss": 1.7546864318847657,
"step": 7450,
"token_acc": 0.7662415623916118
},
{
"epoch": 0.9859340081503878,
"grad_norm": 0.60546875,
"learning_rate": 5.5288389362653484e-05,
"loss": 1.7443992614746093,
"step": 7500,
"token_acc": 0.7674998687827633
},
{
"epoch": 0.9925069015380571,
"grad_norm": 0.625,
"learning_rate": 5.474772044483391e-05,
"loss": 1.7637782287597656,
"step": 7550,
"token_acc": 0.7650653153454292
},
{
"epoch": 0.9990797949257263,
"grad_norm": 0.57421875,
"learning_rate": 5.420649073139469e-05,
"loss": 1.744835205078125,
"step": 7600,
"token_acc": 0.7651500535549154
},
{
"epoch": 1.0056526883133956,
"grad_norm": 0.57421875,
"learning_rate": 5.366476415180599e-05,
"loss": 1.7677224731445313,
"step": 7650,
"token_acc": 0.7611829185169189
},
{
"epoch": 1.0122255817010648,
"grad_norm": 0.546875,
"learning_rate": 5.3122604694227265e-05,
"loss": 1.731588134765625,
"step": 7700,
"token_acc": 0.7665744247751762
},
{
"epoch": 1.018798475088734,
"grad_norm": 0.515625,
"learning_rate": 5.258007639794907e-05,
"loss": 1.7428884887695313,
"step": 7750,
"token_acc": 0.76526092110062
},
{
"epoch": 1.0253713684764034,
"grad_norm": 0.5390625,
"learning_rate": 5.203724334582875e-05,
"loss": 1.724066162109375,
"step": 7800,
"token_acc": 0.7678043302715252
},
{
"epoch": 1.0319442618640726,
"grad_norm": 0.578125,
"learning_rate": 5.1494169656721104e-05,
"loss": 1.7163406372070313,
"step": 7850,
"token_acc": 0.7682152833871836
},
{
"epoch": 1.0385171552517418,
"grad_norm": 0.734375,
"learning_rate": 5.095091947790472e-05,
"loss": 1.7165689086914062,
"step": 7900,
"token_acc": 0.7668357289737444
},
{
"epoch": 1.045090048639411,
"grad_norm": 0.58984375,
"learning_rate": 5.040755697750496e-05,
"loss": 1.7132667541503905,
"step": 7950,
"token_acc": 0.767668712380105
},
{
"epoch": 1.0516629420270802,
"grad_norm": 0.53515625,
"learning_rate": 4.9864146336914465e-05,
"loss": 1.7193359375,
"step": 8000,
"token_acc": 0.7657205294292607
},
{
"epoch": 1.0516629420270802,
"eval_loss": 1.837268590927124,
"eval_runtime": 235.8095,
"eval_samples_per_second": 186.901,
"eval_steps_per_second": 2.922,
"eval_token_acc": 0.7689604263199924,
"step": 8000
},
{
"epoch": 1.0582358354147496,
"grad_norm": 0.64453125,
"learning_rate": 4.9320751743212176e-05,
"loss": 1.7165196228027344,
"step": 8050,
"token_acc": 0.7653232432176571
},
{
"epoch": 1.0648087288024188,
"grad_norm": 0.59765625,
"learning_rate": 4.877743738158155e-05,
"loss": 1.7286593627929687,
"step": 8100,
"token_acc": 0.7641165060152924
},
{
"epoch": 1.071381622190088,
"grad_norm": 0.51171875,
"learning_rate": 4.823426742772917e-05,
"loss": 1.695826873779297,
"step": 8150,
"token_acc": 0.7685074226887527
},
{
"epoch": 1.0779545155777572,
"grad_norm": 0.5703125,
"learning_rate": 4.7691306040304306e-05,
"loss": 1.7172344970703124,
"step": 8200,
"token_acc": 0.7652489671997466
},
{
"epoch": 1.0845274089654267,
"grad_norm": 0.7109375,
"learning_rate": 4.714861735332058e-05,
"loss": 1.6970980834960938,
"step": 8250,
"token_acc": 0.7678146398472478
},
{
"epoch": 1.0911003023530959,
"grad_norm": 0.55859375,
"learning_rate": 4.6606265468580516e-05,
"loss": 1.6961888122558593,
"step": 8300,
"token_acc": 0.7714541070556682
},
{
"epoch": 1.097673195740765,
"grad_norm": 0.546875,
"learning_rate": 4.6064314448103974e-05,
"loss": 1.6937094116210938,
"step": 8350,
"token_acc": 0.7699465368393305
},
{
"epoch": 1.1042460891284342,
"grad_norm": 0.56640625,
"learning_rate": 4.5522828306561085e-05,
"loss": 1.6934506225585937,
"step": 8400,
"token_acc": 0.7680296355759598
},
{
"epoch": 1.1108189825161037,
"grad_norm": 0.734375,
"learning_rate": 4.498187100371105e-05,
"loss": 1.703126220703125,
"step": 8450,
"token_acc": 0.7660449721996203
},
{
"epoch": 1.1173918759037729,
"grad_norm": 0.57421875,
"learning_rate": 4.4441506436847194e-05,
"loss": 1.6976077270507812,
"step": 8500,
"token_acc": 0.7660062252481052
},
{
"epoch": 1.123964769291442,
"grad_norm": 0.58203125,
"learning_rate": 4.390179843324947e-05,
"loss": 1.6787896728515626,
"step": 8550,
"token_acc": 0.7675875080012128
},
{
"epoch": 1.1305376626791113,
"grad_norm": 0.5546875,
"learning_rate": 4.3362810742645344e-05,
"loss": 1.671527099609375,
"step": 8600,
"token_acc": 0.7726572467785786
},
{
"epoch": 1.1371105560667807,
"grad_norm": 0.58984375,
"learning_rate": 4.282460702967962e-05,
"loss": 1.6855081176757813,
"step": 8650,
"token_acc": 0.7697742729365689
},
{
"epoch": 1.1436834494544499,
"grad_norm": 0.52734375,
"learning_rate": 4.228725086639458e-05,
"loss": 1.6703143310546875,
"step": 8700,
"token_acc": 0.7704307602426076
},
{
"epoch": 1.150256342842119,
"grad_norm": 0.56640625,
"learning_rate": 4.175080572472082e-05,
"loss": 1.6878749084472657,
"step": 8750,
"token_acc": 0.7682984359233098
},
{
"epoch": 1.1568292362297883,
"grad_norm": 0.59375,
"learning_rate": 4.121533496898002e-05,
"loss": 1.69472412109375,
"step": 8800,
"token_acc": 0.7653907536202668
},
{
"epoch": 1.1634021296174577,
"grad_norm": 0.5703125,
"learning_rate": 4.068090184840047e-05,
"loss": 1.6784718322753907,
"step": 8850,
"token_acc": 0.7674758636654547
},
{
"epoch": 1.169975023005127,
"grad_norm": 0.53515625,
"learning_rate": 4.0147569489646135e-05,
"loss": 1.6566871643066405,
"step": 8900,
"token_acc": 0.7731254763401035
},
{
"epoch": 1.176547916392796,
"grad_norm": 0.55078125,
"learning_rate": 3.9615400889360146e-05,
"loss": 1.6721833801269532,
"step": 8950,
"token_acc": 0.7681256368871351
},
{
"epoch": 1.1831208097804653,
"grad_norm": 0.609375,
"learning_rate": 3.908445890672373e-05,
"loss": 1.6834414672851563,
"step": 9000,
"token_acc": 0.7689145513676909
},
{
"epoch": 1.1896937031681345,
"grad_norm": 0.5,
"learning_rate": 3.855480625603142e-05,
"loss": 1.6795899963378906,
"step": 9050,
"token_acc": 0.7689981482717203
},
{
"epoch": 1.196266596555804,
"grad_norm": 0.515625,
"learning_rate": 3.8026505499283184e-05,
"loss": 1.6775094604492187,
"step": 9100,
"token_acc": 0.7678651418602145
},
{
"epoch": 1.2028394899434731,
"grad_norm": 0.57421875,
"learning_rate": 3.749961903879477e-05,
"loss": 1.6525213623046875,
"step": 9150,
"token_acc": 0.7686804131998874
},
{
"epoch": 1.2094123833311423,
"grad_norm": 0.53515625,
"learning_rate": 3.6974209109826726e-05,
"loss": 1.6840940856933593,
"step": 9200,
"token_acc": 0.7642316750821783
},
{
"epoch": 1.2159852767188117,
"grad_norm": 0.494140625,
"learning_rate": 3.645033777323339e-05,
"loss": 1.6511599731445312,
"step": 9250,
"token_acc": 0.7737184378932712
},
{
"epoch": 1.222558170106481,
"grad_norm": 0.5234375,
"learning_rate": 3.5928066908132144e-05,
"loss": 1.6393515014648437,
"step": 9300,
"token_acc": 0.7703170812446798
},
{
"epoch": 1.2291310634941501,
"grad_norm": 0.5234375,
"learning_rate": 3.5407458204594426e-05,
"loss": 1.6625300598144532,
"step": 9350,
"token_acc": 0.7698629299985743
},
{
"epoch": 1.2357039568818193,
"grad_norm": 0.5859375,
"learning_rate": 3.488857315635893e-05,
"loss": 1.6773255920410157,
"step": 9400,
"token_acc": 0.7693895226882114
},
{
"epoch": 1.2422768502694885,
"grad_norm": 0.54296875,
"learning_rate": 3.437147305356807e-05,
"loss": 1.6579641723632812,
"step": 9450,
"token_acc": 0.7706150398406374
},
{
"epoch": 1.248849743657158,
"grad_norm": 0.5078125,
"learning_rate": 3.3856218975528434e-05,
"loss": 1.6528695678710938,
"step": 9500,
"token_acc": 0.7691562115211821
},
{
"epoch": 1.2554226370448271,
"grad_norm": 0.5234375,
"learning_rate": 3.334287178349611e-05,
"loss": 1.6566635131835938,
"step": 9550,
"token_acc": 0.7694686428681645
},
{
"epoch": 1.2619955304324963,
"grad_norm": 0.53515625,
"learning_rate": 3.2831492113487904e-05,
"loss": 1.6695437622070313,
"step": 9600,
"token_acc": 0.7692787363152542
},
{
"epoch": 1.2685684238201658,
"grad_norm": 0.50390625,
"learning_rate": 3.2322140369119045e-05,
"loss": 1.670698699951172,
"step": 9650,
"token_acc": 0.7669365404642725
},
{
"epoch": 1.275141317207835,
"grad_norm": 0.52734375,
"learning_rate": 3.181487671446836e-05,
"loss": 1.646166534423828,
"step": 9700,
"token_acc": 0.7694847008702814
},
{
"epoch": 1.2817142105955042,
"grad_norm": 0.53515625,
"learning_rate": 3.130976106697174e-05,
"loss": 1.6512442016601563,
"step": 9750,
"token_acc": 0.7689283254541929
},
{
"epoch": 1.2882871039831734,
"grad_norm": 0.484375,
"learning_rate": 3.080685309034487e-05,
"loss": 1.6418820190429688,
"step": 9800,
"token_acc": 0.7712326161812989
},
{
"epoch": 1.2948599973708426,
"grad_norm": 0.55078125,
"learning_rate": 3.0306212187535653e-05,
"loss": 1.6504058837890625,
"step": 9850,
"token_acc": 0.7678108326514603
},
{
"epoch": 1.301432890758512,
"grad_norm": 0.52734375,
"learning_rate": 2.9807897493707703e-05,
"loss": 1.6451980590820312,
"step": 9900,
"token_acc": 0.7685139265616777
},
{
"epoch": 1.3080057841461812,
"grad_norm": 0.48046875,
"learning_rate": 2.9311967869255324e-05,
"loss": 1.6350534057617188,
"step": 9950,
"token_acc": 0.7734443379837148
},
{
"epoch": 1.3145786775338504,
"grad_norm": 0.5390625,
"learning_rate": 2.881848189285105e-05,
"loss": 1.6401956176757813,
"step": 10000,
"token_acc": 0.7687987154069689
},
{
"epoch": 1.3145786775338504,
"eval_loss": 1.7793248891830444,
"eval_runtime": 236.4249,
"eval_samples_per_second": 186.414,
"eval_steps_per_second": 2.914,
"eval_token_acc": 0.7717742013781854,
"step": 10000
},
{
"epoch": 1.3211515709215196,
"grad_norm": 0.51171875,
"learning_rate": 2.8327497854526276e-05,
"loss": 1.6347137451171876,
"step": 10050,
"token_acc": 0.7699386839582517
},
{
"epoch": 1.3277244643091888,
"grad_norm": 0.546875,
"learning_rate": 2.783907374878623e-05,
"loss": 1.6400608825683594,
"step": 10100,
"token_acc": 0.7713103096257532
},
{
"epoch": 1.3342973576968582,
"grad_norm": 0.54296875,
"learning_rate": 2.7353267267759587e-05,
"loss": 1.6206582641601563,
"step": 10150,
"token_acc": 0.7717408144832179
},
{
"epoch": 1.3408702510845274,
"grad_norm": 0.5234375,
"learning_rate": 2.6870135794384084e-05,
"loss": 1.6268215942382813,
"step": 10200,
"token_acc": 0.7686959485978778
},
{
"epoch": 1.3474431444721966,
"grad_norm": 0.48828125,
"learning_rate": 2.63897363956284e-05,
"loss": 1.626171875,
"step": 10250,
"token_acc": 0.7716723626001355
},
{
"epoch": 1.354016037859866,
"grad_norm": 0.5234375,
"learning_rate": 2.591212581575153e-05,
"loss": 1.6198342895507813,
"step": 10300,
"token_acc": 0.7731112873028669
},
{
"epoch": 1.3605889312475352,
"grad_norm": 0.498046875,
"learning_rate": 2.543736046960019e-05,
"loss": 1.6164779663085938,
"step": 10350,
"token_acc": 0.769608566595914
},
{
"epoch": 1.3671618246352044,
"grad_norm": 0.490234375,
"learning_rate": 2.4965496435945106e-05,
"loss": 1.641104736328125,
"step": 10400,
"token_acc": 0.7738014461806595
},
{
"epoch": 1.3737347180228736,
"grad_norm": 0.486328125,
"learning_rate": 2.449658945085718e-05,
"loss": 1.6213189697265624,
"step": 10450,
"token_acc": 0.768932852837636
},
{
"epoch": 1.3803076114105428,
"grad_norm": 0.494140625,
"learning_rate": 2.4030694901123825e-05,
"loss": 1.6324661254882813,
"step": 10500,
"token_acc": 0.7704852495864681
},
{
"epoch": 1.3868805047982122,
"grad_norm": 0.453125,
"learning_rate": 2.3567867817706974e-05,
"loss": 1.6237179565429687,
"step": 10550,
"token_acc": 0.770884503460658
},
{
"epoch": 1.3934533981858814,
"grad_norm": 0.60546875,
"learning_rate": 2.310816286924261e-05,
"loss": 1.6309237670898438,
"step": 10600,
"token_acc": 0.7696517291192732
},
{
"epoch": 1.4000262915735506,
"grad_norm": 0.51953125,
"learning_rate": 2.2651634355583606e-05,
"loss": 1.6198001098632813,
"step": 10650,
"token_acc": 0.7706210696067582
},
{
"epoch": 1.40659918496122,
"grad_norm": 0.462890625,
"learning_rate": 2.2198336201385674e-05,
"loss": 1.637750244140625,
"step": 10700,
"token_acc": 0.768926374214257
},
{
"epoch": 1.4131720783488892,
"grad_norm": 0.52734375,
"learning_rate": 2.1748321949738088e-05,
"loss": 1.6378421020507812,
"step": 10750,
"token_acc": 0.767715041979971
},
{
"epoch": 1.4197449717365584,
"grad_norm": 0.482421875,
"learning_rate": 2.130164475583896e-05,
"loss": 1.6255686950683594,
"step": 10800,
"token_acc": 0.7689490174738093
},
{
"epoch": 1.4263178651242276,
"grad_norm": 0.47265625,
"learning_rate": 2.0858357380716826e-05,
"loss": 1.6103607177734376,
"step": 10850,
"token_acc": 0.7713349007488887
},
{
"epoch": 1.4328907585118968,
"grad_norm": 0.51171875,
"learning_rate": 2.041851218499844e-05,
"loss": 1.6049491882324218,
"step": 10900,
"token_acc": 0.7700552131398218
},
{
"epoch": 1.4394636518995663,
"grad_norm": 0.50390625,
"learning_rate": 1.998216112272407e-05,
"loss": 1.6167376708984376,
"step": 10950,
"token_acc": 0.770873658059624
},
{
"epoch": 1.4460365452872355,
"grad_norm": 0.56640625,
"learning_rate": 1.9549355735210663e-05,
"loss": 1.607739715576172,
"step": 11000,
"token_acc": 0.7714620438930024
},
{
"epoch": 1.4526094386749047,
"grad_norm": 0.5390625,
"learning_rate": 1.9120147144963918e-05,
"loss": 1.6082345581054687,
"step": 11050,
"token_acc": 0.7727137897326997
},
{
"epoch": 1.459182332062574,
"grad_norm": 0.51171875,
"learning_rate": 1.869458604963973e-05,
"loss": 1.6116729736328126,
"step": 11100,
"token_acc": 0.7693582542656137
},
{
"epoch": 1.465755225450243,
"grad_norm": 0.482421875,
"learning_rate": 1.827272271605581e-05,
"loss": 1.6167997741699218,
"step": 11150,
"token_acc": 0.7716410540840345
},
{
"epoch": 1.4723281188379125,
"grad_norm": 0.4609375,
"learning_rate": 1.785460697425422e-05,
"loss": 1.6175106811523436,
"step": 11200,
"token_acc": 0.7678629010587413
},
{
"epoch": 1.4789010122255817,
"grad_norm": 0.50390625,
"learning_rate": 1.7440288211615553e-05,
"loss": 1.6349208068847656,
"step": 11250,
"token_acc": 0.767691050989249
},
{
"epoch": 1.4854739056132509,
"grad_norm": 0.470703125,
"learning_rate": 1.7029815367025304e-05,
"loss": 1.5947479248046874,
"step": 11300,
"token_acc": 0.7744744172137812
},
{
"epoch": 1.4920467990009203,
"grad_norm": 0.46484375,
"learning_rate": 1.6623236925093293e-05,
"loss": 1.6072760009765625,
"step": 11350,
"token_acc": 0.7699475558805804
},
{
"epoch": 1.4986196923885895,
"grad_norm": 0.48046875,
"learning_rate": 1.622060091042666e-05,
"loss": 1.6194985961914063,
"step": 11400,
"token_acc": 0.771240942645837
},
{
"epoch": 1.5051925857762587,
"grad_norm": 0.48828125,
"learning_rate": 1.582195488195731e-05,
"loss": 1.5986326599121095,
"step": 11450,
"token_acc": 0.7726320047065744
},
{
"epoch": 1.5117654791639281,
"grad_norm": 0.458984375,
"learning_rate": 1.5427345927324305e-05,
"loss": 1.6070111083984375,
"step": 11500,
"token_acc": 0.773540255743937
},
{
"epoch": 1.518338372551597,
"grad_norm": 0.4921875,
"learning_rate": 1.5036820657311839e-05,
"loss": 1.6082412719726562,
"step": 11550,
"token_acc": 0.769586028833374
},
{
"epoch": 1.5249112659392665,
"grad_norm": 0.4609375,
"learning_rate": 1.4650425200343732e-05,
"loss": 1.6070376586914064,
"step": 11600,
"token_acc": 0.7711919617970434
},
{
"epoch": 1.5314841593269357,
"grad_norm": 0.455078125,
"learning_rate": 1.4268205197034717e-05,
"loss": 1.6123899841308593,
"step": 11650,
"token_acc": 0.7703268714219823
},
{
"epoch": 1.538057052714605,
"grad_norm": 0.455078125,
"learning_rate": 1.3890205794799476e-05,
"loss": 1.615906982421875,
"step": 11700,
"token_acc": 0.7700723973792927
},
{
"epoch": 1.5446299461022743,
"grad_norm": 0.4453125,
"learning_rate": 1.3516471642519784e-05,
"loss": 1.5967388916015626,
"step": 11750,
"token_acc": 0.772738502438927
},
{
"epoch": 1.5512028394899433,
"grad_norm": 0.462890625,
"learning_rate": 1.3147046885270736e-05,
"loss": 1.5981399536132812,
"step": 11800,
"token_acc": 0.7728519929058771
},
{
"epoch": 1.5577757328776127,
"grad_norm": 0.45703125,
"learning_rate": 1.2781975159106319e-05,
"loss": 1.6243299865722656,
"step": 11850,
"token_acc": 0.7696561945272465
},
{
"epoch": 1.564348626265282,
"grad_norm": 0.474609375,
"learning_rate": 1.24212995859052e-05,
"loss": 1.6280474853515625,
"step": 11900,
"token_acc": 0.7673742244402482
},
{
"epoch": 1.5709215196529511,
"grad_norm": 0.50390625,
"learning_rate": 1.2065062768277135e-05,
"loss": 1.5971218872070312,
"step": 11950,
"token_acc": 0.7735671474840562
},
{
"epoch": 1.5774944130406205,
"grad_norm": 0.427734375,
"learning_rate": 1.171330678453097e-05,
"loss": 1.591236572265625,
"step": 12000,
"token_acc": 0.7735620511595184
},
{
"epoch": 1.5774944130406205,
"eval_loss": 1.754050612449646,
"eval_runtime": 236.7394,
"eval_samples_per_second": 186.167,
"eval_steps_per_second": 2.91,
"eval_token_acc": 0.7731684672723433,
"step": 12000
}
],
"logging_steps": 50,
"max_steps": 15214,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.831322460026044e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}