{ "best_global_step": 12000, "best_metric": 1.75405061, "best_model_checkpoint": "/scratch/prj0000000267/yuefan/UnifyTrajLLM/output_rope_instruct_gate_llmlow5_5e-4/v3-20251108-210106/checkpoint-12000", "epoch": 1.5774944130406205, "eval_steps": 2000, "global_step": 12000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00013145786775338504, "grad_norm": 268.0, "learning_rate": 1.314060446780552e-07, "loss": 30.039020538330078, "step": 1, "token_acc": 0.0 }, { "epoch": 0.006572893387669252, "grad_norm": 47.5, "learning_rate": 6.5703022339027605e-06, "loss": 24.82280123963648, "step": 50, "token_acc": 0.036200167198766994 }, { "epoch": 0.013145786775338505, "grad_norm": 59.5, "learning_rate": 1.3140604467805521e-05, "loss": 14.215625, "step": 100, "token_acc": 0.43851580932023954 }, { "epoch": 0.019718680163007755, "grad_norm": 10.1875, "learning_rate": 1.9710906701708278e-05, "loss": 6.50881591796875, "step": 150, "token_acc": 0.6614513467387406 }, { "epoch": 0.02629157355067701, "grad_norm": 32.25, "learning_rate": 2.6281208935611042e-05, "loss": 6.06468505859375, "step": 200, "token_acc": 0.6807562909100595 }, { "epoch": 0.03286446693834626, "grad_norm": 10.9375, "learning_rate": 3.2851511169513796e-05, "loss": 5.703701171875, "step": 250, "token_acc": 0.6826512527189237 }, { "epoch": 0.03943736032601551, "grad_norm": 14.3125, "learning_rate": 3.9421813403416556e-05, "loss": 5.412930908203125, "step": 300, "token_acc": 0.6959136632453732 }, { "epoch": 0.046010253713684765, "grad_norm": 7.03125, "learning_rate": 4.5992115637319317e-05, "loss": 5.1832080078125, "step": 350, "token_acc": 0.6958016739273748 }, { "epoch": 0.05258314710135402, "grad_norm": 6.125, "learning_rate": 5.2562417871222084e-05, "loss": 5.019949340820313, "step": 400, "token_acc": 0.7036372985827332 }, { "epoch": 0.059156040489023266, "grad_norm": 5.09375, "learning_rate": 5.913272010512484e-05, "loss": 4.91180908203125, "step": 450, "token_acc": 0.7090584023609108 }, { "epoch": 0.06572893387669251, "grad_norm": 6.90625, "learning_rate": 6.570302233902759e-05, "loss": 4.86035888671875, "step": 500, "token_acc": 0.7103483463054483 }, { "epoch": 0.07230182726436177, "grad_norm": 4.21875, "learning_rate": 7.227332457293036e-05, "loss": 4.782819213867188, "step": 550, "token_acc": 0.711807526156599 }, { "epoch": 0.07887472065203102, "grad_norm": 4.3125, "learning_rate": 7.884362680683311e-05, "loss": 4.707298889160156, "step": 600, "token_acc": 0.7184096746149201 }, { "epoch": 0.08544761403970028, "grad_norm": 5.25, "learning_rate": 8.541392904073588e-05, "loss": 4.594347534179687, "step": 650, "token_acc": 0.7216674015037796 }, { "epoch": 0.09202050742736953, "grad_norm": 3.609375, "learning_rate": 9.198423127463863e-05, "loss": 4.58609375, "step": 700, "token_acc": 0.714123727321825 }, { "epoch": 0.09859340081503878, "grad_norm": 3.421875, "learning_rate": 9.85545335085414e-05, "loss": 4.494258422851562, "step": 750, "token_acc": 0.7178661247995335 }, { "epoch": 0.10516629420270804, "grad_norm": 4.75, "learning_rate": 9.999820340427517e-05, "loss": 4.393697509765625, "step": 800, "token_acc": 0.719993257471585 }, { "epoch": 0.11173918759037728, "grad_norm": 3.28125, "learning_rate": 9.999064399990964e-05, "loss": 4.324550170898437, "step": 850, "token_acc": 0.7193472614965004 }, { "epoch": 0.11831208097804653, "grad_norm": 3.359375, "learning_rate": 9.997717975457807e-05, "loss": 4.1653680419921875, "step": 900, "token_acc": 0.7247381121504013 }, { "epoch": 0.12488497436571579, "grad_norm": 1.5546875, "learning_rate": 9.995781225866254e-05, "loss": 4.05832275390625, "step": 950, "token_acc": 0.7224337955208558 }, { "epoch": 0.13145786775338503, "grad_norm": 2.21875, "learning_rate": 9.993254379983084e-05, "loss": 3.977420654296875, "step": 1000, "token_acc": 0.7213713367669359 }, { "epoch": 0.13803076114105428, "grad_norm": 1.59375, "learning_rate": 9.990137736276604e-05, "loss": 3.8511199951171875, "step": 1050, "token_acc": 0.7259079629296982 }, { "epoch": 0.14460365452872354, "grad_norm": 2.109375, "learning_rate": 9.98643166288141e-05, "loss": 3.7238555908203126, "step": 1100, "token_acc": 0.729174537368435 }, { "epoch": 0.1511765479163928, "grad_norm": 3.484375, "learning_rate": 9.982136597554896e-05, "loss": 3.6605801391601562, "step": 1150, "token_acc": 0.7267878333802646 }, { "epoch": 0.15774944130406204, "grad_norm": 1.765625, "learning_rate": 9.977253047625546e-05, "loss": 3.586345520019531, "step": 1200, "token_acc": 0.7296032337886851 }, { "epoch": 0.1643223346917313, "grad_norm": 1.875, "learning_rate": 9.971781589933012e-05, "loss": 3.5275897216796874, "step": 1250, "token_acc": 0.7274431196530496 }, { "epoch": 0.17089522807940055, "grad_norm": 5.34375, "learning_rate": 9.965722870759977e-05, "loss": 3.4567681884765626, "step": 1300, "token_acc": 0.7260510593991347 }, { "epoch": 0.1774681214670698, "grad_norm": 2.984375, "learning_rate": 9.959077605755818e-05, "loss": 3.35341064453125, "step": 1350, "token_acc": 0.7315002945127964 }, { "epoch": 0.18404101485473906, "grad_norm": 2.640625, "learning_rate": 9.951846579852069e-05, "loss": 3.2548678588867186, "step": 1400, "token_acc": 0.7334485568361279 }, { "epoch": 0.19061390824240831, "grad_norm": 2.328125, "learning_rate": 9.944030647169715e-05, "loss": 3.1699752807617188, "step": 1450, "token_acc": 0.7346018069265517 }, { "epoch": 0.19718680163007757, "grad_norm": 3.75, "learning_rate": 9.935630730918297e-05, "loss": 3.123944091796875, "step": 1500, "token_acc": 0.7325574233567774 }, { "epoch": 0.20375969501774682, "grad_norm": 2.328125, "learning_rate": 9.926647823286865e-05, "loss": 3.031203308105469, "step": 1550, "token_acc": 0.7343244664345068 }, { "epoch": 0.21033258840541608, "grad_norm": 2.09375, "learning_rate": 9.917082985326782e-05, "loss": 2.9396633911132812, "step": 1600, "token_acc": 0.736209056167852 }, { "epoch": 0.2169054817930853, "grad_norm": 1.921875, "learning_rate": 9.906937346826395e-05, "loss": 2.8921356201171875, "step": 1650, "token_acc": 0.7373535529118604 }, { "epoch": 0.22347837518075456, "grad_norm": 2.015625, "learning_rate": 9.896212106177583e-05, "loss": 2.8311395263671875, "step": 1700, "token_acc": 0.7392403929710977 }, { "epoch": 0.2300512685684238, "grad_norm": 2.015625, "learning_rate": 9.884908530234208e-05, "loss": 2.7363882446289063, "step": 1750, "token_acc": 0.7410795625843831 }, { "epoch": 0.23662416195609307, "grad_norm": 1.5546875, "learning_rate": 9.873027954162471e-05, "loss": 2.6730242919921876, "step": 1800, "token_acc": 0.7443422077792354 }, { "epoch": 0.24319705534376232, "grad_norm": 1.84375, "learning_rate": 9.860571781283208e-05, "loss": 2.6252935791015624, "step": 1850, "token_acc": 0.7444647858608681 }, { "epoch": 0.24976994873143157, "grad_norm": 1.6640625, "learning_rate": 9.847541482906129e-05, "loss": 2.5712957763671875, "step": 1900, "token_acc": 0.7508503287266872 }, { "epoch": 0.25634284211910086, "grad_norm": 1.265625, "learning_rate": 9.833938598156025e-05, "loss": 2.5640655517578126, "step": 1950, "token_acc": 0.7425811658922213 }, { "epoch": 0.26291573550677005, "grad_norm": 1.1953125, "learning_rate": 9.819764733790979e-05, "loss": 2.5158842468261717, "step": 2000, "token_acc": 0.7452882362784471 }, { "epoch": 0.26291573550677005, "eval_loss": 2.518305778503418, "eval_runtime": 236.0729, "eval_samples_per_second": 186.692, "eval_steps_per_second": 2.919, "eval_token_acc": 0.749429720552148, "step": 2000 }, { "epoch": 0.2694886288944393, "grad_norm": 1.4765625, "learning_rate": 9.805021564012564e-05, "loss": 2.4857614135742185, "step": 2050, "token_acc": 0.7441553323650091 }, { "epoch": 0.27606152228210856, "grad_norm": 2.421875, "learning_rate": 9.789710830268099e-05, "loss": 2.450667724609375, "step": 2100, "token_acc": 0.7500234051090009 }, { "epoch": 0.2826344156697778, "grad_norm": 1.046875, "learning_rate": 9.773834341044944e-05, "loss": 2.4290037536621094, "step": 2150, "token_acc": 0.7513961437248966 }, { "epoch": 0.28920730905744707, "grad_norm": 1.8203125, "learning_rate": 9.757393971656888e-05, "loss": 2.413728942871094, "step": 2200, "token_acc": 0.7452505502003465 }, { "epoch": 0.2957802024451163, "grad_norm": 1.7578125, "learning_rate": 9.740391664022633e-05, "loss": 2.3729684448242185, "step": 2250, "token_acc": 0.752083845606853 }, { "epoch": 0.3023530958327856, "grad_norm": 1.2421875, "learning_rate": 9.722829426436427e-05, "loss": 2.3652894592285154, "step": 2300, "token_acc": 0.7517237172802866 }, { "epoch": 0.30892598922045483, "grad_norm": 0.8125, "learning_rate": 9.704709333330836e-05, "loss": 2.356060791015625, "step": 2350, "token_acc": 0.7475131194646989 }, { "epoch": 0.3154988826081241, "grad_norm": 0.90625, "learning_rate": 9.686033525031719e-05, "loss": 2.3459547424316405, "step": 2400, "token_acc": 0.7465522261190833 }, { "epoch": 0.32207177599579334, "grad_norm": 0.86328125, "learning_rate": 9.666804207505414e-05, "loss": 2.34242919921875, "step": 2450, "token_acc": 0.7503000046993743 }, { "epoch": 0.3286446693834626, "grad_norm": 1.203125, "learning_rate": 9.647023652098174e-05, "loss": 2.30553955078125, "step": 2500, "token_acc": 0.7505765418279411 }, { "epoch": 0.33521756277113185, "grad_norm": 1.296875, "learning_rate": 9.626694195267876e-05, "loss": 2.2867636108398437, "step": 2550, "token_acc": 0.7495737322589445 }, { "epoch": 0.3417904561588011, "grad_norm": 1.3046875, "learning_rate": 9.605818238308038e-05, "loss": 2.2838902282714844, "step": 2600, "token_acc": 0.7510741453019647 }, { "epoch": 0.34836334954647036, "grad_norm": 1.2578125, "learning_rate": 9.584398247064188e-05, "loss": 2.2479782104492188, "step": 2650, "token_acc": 0.7525476660092044 }, { "epoch": 0.3549362429341396, "grad_norm": 0.98046875, "learning_rate": 9.562436751642593e-05, "loss": 2.2379521179199218, "step": 2700, "token_acc": 0.7535541690112872 }, { "epoch": 0.36150913632180887, "grad_norm": 1.2109375, "learning_rate": 9.539936346111416e-05, "loss": 2.25480712890625, "step": 2750, "token_acc": 0.7502481934500133 }, { "epoch": 0.3680820297094781, "grad_norm": 0.73046875, "learning_rate": 9.516899688194294e-05, "loss": 2.1890530395507812, "step": 2800, "token_acc": 0.7559071920628226 }, { "epoch": 0.3746549230971474, "grad_norm": 0.99609375, "learning_rate": 9.493329498956421e-05, "loss": 2.2252967834472654, "step": 2850, "token_acc": 0.7540685282249956 }, { "epoch": 0.38122781648481663, "grad_norm": 0.875, "learning_rate": 9.469228562483132e-05, "loss": 2.211038818359375, "step": 2900, "token_acc": 0.7534921970366274 }, { "epoch": 0.3878007098724859, "grad_norm": 1.0625, "learning_rate": 9.444599725551061e-05, "loss": 2.1635357666015627, "step": 2950, "token_acc": 0.7530642715579734 }, { "epoch": 0.39437360326015514, "grad_norm": 1.265625, "learning_rate": 9.419445897291867e-05, "loss": 2.1792333984375, "step": 3000, "token_acc": 0.7530076526518494 }, { "epoch": 0.4009464966478244, "grad_norm": 1.0390625, "learning_rate": 9.393770048848622e-05, "loss": 2.168623352050781, "step": 3050, "token_acc": 0.7545220973858319 }, { "epoch": 0.40751939003549364, "grad_norm": 0.703125, "learning_rate": 9.367575213024861e-05, "loss": 2.1656561279296875, "step": 3100, "token_acc": 0.7529528081537318 }, { "epoch": 0.4140922834231629, "grad_norm": 0.69921875, "learning_rate": 9.340864483926343e-05, "loss": 2.147900390625, "step": 3150, "token_acc": 0.7535216548028473 }, { "epoch": 0.42066517681083215, "grad_norm": 0.703125, "learning_rate": 9.313641016595588e-05, "loss": 2.1436308288574217, "step": 3200, "token_acc": 0.754756994891503 }, { "epoch": 0.4272380701985014, "grad_norm": 0.91796875, "learning_rate": 9.285908026639207e-05, "loss": 2.1488153076171876, "step": 3250, "token_acc": 0.7516223648809727 }, { "epoch": 0.4338109635861706, "grad_norm": 0.80859375, "learning_rate": 9.257668789848067e-05, "loss": 2.1125421142578125, "step": 3300, "token_acc": 0.7542638775798605 }, { "epoch": 0.44038385697383986, "grad_norm": 0.94921875, "learning_rate": 9.228926641810367e-05, "loss": 2.127976379394531, "step": 3350, "token_acc": 0.7534171662400647 }, { "epoch": 0.4469567503615091, "grad_norm": 0.90234375, "learning_rate": 9.199684977517645e-05, "loss": 2.117357025146484, "step": 3400, "token_acc": 0.754085423576444 }, { "epoch": 0.45352964374917837, "grad_norm": 0.82421875, "learning_rate": 9.169947250963753e-05, "loss": 2.1096246337890623, "step": 3450, "token_acc": 0.7573267065803522 }, { "epoch": 0.4601025371368476, "grad_norm": 0.68359375, "learning_rate": 9.139716974736889e-05, "loss": 2.08451904296875, "step": 3500, "token_acc": 0.7560729910200322 }, { "epoch": 0.4666754305245169, "grad_norm": 0.94140625, "learning_rate": 9.108997719604687e-05, "loss": 2.0704086303710936, "step": 3550, "token_acc": 0.758234912558547 }, { "epoch": 0.47324832391218613, "grad_norm": 0.94921875, "learning_rate": 9.077793114092435e-05, "loss": 2.0744793701171873, "step": 3600, "token_acc": 0.7553985593878892 }, { "epoch": 0.4798212172998554, "grad_norm": 0.87109375, "learning_rate": 9.046106844054491e-05, "loss": 2.055031433105469, "step": 3650, "token_acc": 0.7576514412150057 }, { "epoch": 0.48639411068752464, "grad_norm": 0.6875, "learning_rate": 9.013942652238908e-05, "loss": 2.0411907958984377, "step": 3700, "token_acc": 0.7550962098257407 }, { "epoch": 0.4929670040751939, "grad_norm": 0.75390625, "learning_rate": 8.981304337845337e-05, "loss": 2.0571356201171875, "step": 3750, "token_acc": 0.7560834470136957 }, { "epoch": 0.49953989746286315, "grad_norm": 0.67578125, "learning_rate": 8.948195756076285e-05, "loss": 2.074111022949219, "step": 3800, "token_acc": 0.7541681281518444 }, { "epoch": 0.5061127908505324, "grad_norm": 0.69140625, "learning_rate": 8.914620817681729e-05, "loss": 2.0392041015625, "step": 3850, "token_acc": 0.7577290307595985 }, { "epoch": 0.5126856842382017, "grad_norm": 1.0703125, "learning_rate": 8.880583488497192e-05, "loss": 2.0631610107421876, "step": 3900, "token_acc": 0.7556020800972513 }, { "epoch": 0.5192585776258709, "grad_norm": 0.69921875, "learning_rate": 8.846087788975292e-05, "loss": 2.0386505126953125, "step": 3950, "token_acc": 0.7588939592425994 }, { "epoch": 0.5258314710135401, "grad_norm": 0.9375, "learning_rate": 8.811137793710863e-05, "loss": 2.0313320922851563, "step": 4000, "token_acc": 0.7567113841020556 }, { "epoch": 0.5258314710135401, "eval_loss": 2.076582908630371, "eval_runtime": 235.7989, "eval_samples_per_second": 186.909, "eval_steps_per_second": 2.922, "eval_token_acc": 0.7596436572643577, "step": 4000 }, { "epoch": 0.5324043644012094, "grad_norm": 0.82421875, "learning_rate": 8.775737630959662e-05, "loss": 2.0022723388671877, "step": 4050, "token_acc": 0.7579540762299285 }, { "epoch": 0.5389772577888786, "grad_norm": 0.93359375, "learning_rate": 8.739891482150741e-05, "loss": 2.0245912170410154, "step": 4100, "token_acc": 0.7564656900076155 }, { "epoch": 0.5455501511765479, "grad_norm": 0.76171875, "learning_rate": 8.703603581392546e-05, "loss": 2.0100286865234374, "step": 4150, "token_acc": 0.7567689057436112 }, { "epoch": 0.5521230445642171, "grad_norm": 0.8359375, "learning_rate": 8.666878214972783e-05, "loss": 2.021333923339844, "step": 4200, "token_acc": 0.7570281681148902 }, { "epoch": 0.5586959379518864, "grad_norm": 0.8046875, "learning_rate": 8.629719720852138e-05, "loss": 2.0115155029296874, "step": 4250, "token_acc": 0.7570202860215485 }, { "epoch": 0.5652688313395556, "grad_norm": 0.80859375, "learning_rate": 8.59213248815187e-05, "loss": 2.014189910888672, "step": 4300, "token_acc": 0.7566674687681726 }, { "epoch": 0.5718417247272249, "grad_norm": 0.69140625, "learning_rate": 8.554120956635375e-05, "loss": 1.9902659606933595, "step": 4350, "token_acc": 0.7575061325565429 }, { "epoch": 0.5784146181148941, "grad_norm": 0.7421875, "learning_rate": 8.515689616183769e-05, "loss": 1.9776287841796876, "step": 4400, "token_acc": 0.7585736758424619 }, { "epoch": 0.5849875115025635, "grad_norm": 0.609375, "learning_rate": 8.476843006265545e-05, "loss": 1.97283447265625, "step": 4450, "token_acc": 0.754149127932067 }, { "epoch": 0.5915604048902326, "grad_norm": 0.79296875, "learning_rate": 8.437585715400384e-05, "loss": 1.9807916259765626, "step": 4500, "token_acc": 0.7596238589600954 }, { "epoch": 0.598133298277902, "grad_norm": 0.84765625, "learning_rate": 8.39792238061715e-05, "loss": 1.9481539916992188, "step": 4550, "token_acc": 0.7577328978855814 }, { "epoch": 0.6047061916655712, "grad_norm": 0.875, "learning_rate": 8.357857686906182e-05, "loss": 1.9619242858886718, "step": 4600, "token_acc": 0.7588518194659313 }, { "epoch": 0.6112790850532405, "grad_norm": 0.66796875, "learning_rate": 8.317396366665899e-05, "loss": 1.9576710510253905, "step": 4650, "token_acc": 0.7596985079347987 }, { "epoch": 0.6178519784409097, "grad_norm": 0.6953125, "learning_rate": 8.27654319914382e-05, "loss": 1.9588572692871093, "step": 4700, "token_acc": 0.757741116751269 }, { "epoch": 0.624424871828579, "grad_norm": 0.6328125, "learning_rate": 8.235303009872043e-05, "loss": 1.954942626953125, "step": 4750, "token_acc": 0.761324026042421 }, { "epoch": 0.6309977652162482, "grad_norm": 0.94140625, "learning_rate": 8.193680670097257e-05, "loss": 1.9273374938964845, "step": 4800, "token_acc": 0.7592756976720991 }, { "epoch": 0.6375706586039175, "grad_norm": 0.73828125, "learning_rate": 8.151681096205356e-05, "loss": 1.94022216796875, "step": 4850, "token_acc": 0.7566143414515606 }, { "epoch": 0.6441435519915867, "grad_norm": 0.90234375, "learning_rate": 8.109309249140721e-05, "loss": 1.9436038208007813, "step": 4900, "token_acc": 0.7617455006768156 }, { "epoch": 0.650716445379256, "grad_norm": 0.83203125, "learning_rate": 8.06657013382024e-05, "loss": 1.937064208984375, "step": 4950, "token_acc": 0.7591963945791783 }, { "epoch": 0.6572893387669252, "grad_norm": 0.9140625, "learning_rate": 8.023468798542127e-05, "loss": 1.9416938781738282, "step": 5000, "token_acc": 0.7584784429628924 }, { "epoch": 0.6638622321545945, "grad_norm": 0.87890625, "learning_rate": 7.980010334389636e-05, "loss": 1.9161361694335937, "step": 5050, "token_acc": 0.7541581670687739 }, { "epoch": 0.6704351255422637, "grad_norm": 0.67578125, "learning_rate": 7.936199874629689e-05, "loss": 1.9216696166992187, "step": 5100, "token_acc": 0.7606859291730552 }, { "epoch": 0.677008018929933, "grad_norm": 0.69921875, "learning_rate": 7.892042594106555e-05, "loss": 1.9201712036132812, "step": 5150, "token_acc": 0.7614792261222095 }, { "epoch": 0.6835809123176022, "grad_norm": 0.6640625, "learning_rate": 7.847543708630593e-05, "loss": 1.8924771118164063, "step": 5200, "token_acc": 0.7622566250217598 }, { "epoch": 0.6901538057052714, "grad_norm": 0.69921875, "learning_rate": 7.80270847436218e-05, "loss": 1.8791021728515624, "step": 5250, "token_acc": 0.7611149879556877 }, { "epoch": 0.6967266990929407, "grad_norm": 0.63671875, "learning_rate": 7.757542187190838e-05, "loss": 1.8818046569824218, "step": 5300, "token_acc": 0.7627985225662821 }, { "epoch": 0.7032995924806099, "grad_norm": 0.78515625, "learning_rate": 7.712050182109711e-05, "loss": 1.9103680419921876, "step": 5350, "token_acc": 0.7583346132272023 }, { "epoch": 0.7098724858682792, "grad_norm": 0.61328125, "learning_rate": 7.666237832585382e-05, "loss": 1.8824064636230469, "step": 5400, "token_acc": 0.7605793230321414 }, { "epoch": 0.7164453792559484, "grad_norm": 0.78515625, "learning_rate": 7.620110549923181e-05, "loss": 1.877305450439453, "step": 5450, "token_acc": 0.7612452387234705 }, { "epoch": 0.7230182726436177, "grad_norm": 0.7578125, "learning_rate": 7.573673782628e-05, "loss": 1.904554443359375, "step": 5500, "token_acc": 0.7584822432750704 }, { "epoch": 0.7295911660312869, "grad_norm": 0.86328125, "learning_rate": 7.526933015760717e-05, "loss": 1.8621942138671874, "step": 5550, "token_acc": 0.7622160103275084 }, { "epoch": 0.7361640594189562, "grad_norm": 0.76171875, "learning_rate": 7.479893770290321e-05, "loss": 1.8591368103027344, "step": 5600, "token_acc": 0.7647176822254823 }, { "epoch": 0.7427369528066254, "grad_norm": 0.76171875, "learning_rate": 7.43256160244176e-05, "loss": 1.860885009765625, "step": 5650, "token_acc": 0.7656942497192635 }, { "epoch": 0.7493098461942947, "grad_norm": 0.62890625, "learning_rate": 7.38494210303967e-05, "loss": 1.8532620239257813, "step": 5700, "token_acc": 0.7606205964388161 }, { "epoch": 0.755882739581964, "grad_norm": 0.765625, "learning_rate": 7.337040896847967e-05, "loss": 1.8677340698242189, "step": 5750, "token_acc": 0.7610945995293353 }, { "epoch": 0.7624556329696333, "grad_norm": 0.63671875, "learning_rate": 7.288863641905481e-05, "loss": 1.8541110229492188, "step": 5800, "token_acc": 0.7597710414081623 }, { "epoch": 0.7690285263573025, "grad_norm": 0.7265625, "learning_rate": 7.240416028857617e-05, "loss": 1.8557376098632812, "step": 5850, "token_acc": 0.7620176547719397 }, { "epoch": 0.7756014197449718, "grad_norm": 0.75390625, "learning_rate": 7.191703780284187e-05, "loss": 1.8637747192382812, "step": 5900, "token_acc": 0.7612350143995713 }, { "epoch": 0.782174313132641, "grad_norm": 0.7421875, "learning_rate": 7.14273265002347e-05, "loss": 1.8630572509765626, "step": 5950, "token_acc": 0.762536667545336 }, { "epoch": 0.7887472065203103, "grad_norm": 0.76171875, "learning_rate": 7.093508422492568e-05, "loss": 1.8501144409179688, "step": 6000, "token_acc": 0.7612453038222803 }, { "epoch": 0.7887472065203103, "eval_loss": 1.9278579950332642, "eval_runtime": 236.8225, "eval_samples_per_second": 186.101, "eval_steps_per_second": 2.909, "eval_token_acc": 0.7643631778364091, "step": 6000 }, { "epoch": 0.7953200999079795, "grad_norm": 0.9609375, "learning_rate": 7.044036912004159e-05, "loss": 1.8581178283691406, "step": 6050, "token_acc": 0.7597096150613548 }, { "epoch": 0.8018929932956488, "grad_norm": 0.66015625, "learning_rate": 6.99432396207972e-05, "loss": 1.85968994140625, "step": 6100, "token_acc": 0.760983003636121 }, { "epoch": 0.808465886683318, "grad_norm": 0.8984375, "learning_rate": 6.94437544475929e-05, "loss": 1.8418545532226562, "step": 6150, "token_acc": 0.7633178669389553 }, { "epoch": 0.8150387800709873, "grad_norm": 0.640625, "learning_rate": 6.894197259907879e-05, "loss": 1.8265931701660156, "step": 6200, "token_acc": 0.7654604394264699 }, { "epoch": 0.8216116734586565, "grad_norm": 0.6953125, "learning_rate": 6.843795334518576e-05, "loss": 1.8271298217773437, "step": 6250, "token_acc": 0.7637430879196561 }, { "epoch": 0.8281845668463258, "grad_norm": 0.8203125, "learning_rate": 6.79317562201246e-05, "loss": 1.8288789367675782, "step": 6300, "token_acc": 0.7608325763635624 }, { "epoch": 0.834757460233995, "grad_norm": 0.6953125, "learning_rate": 6.742344101535394e-05, "loss": 1.8086236572265626, "step": 6350, "token_acc": 0.7633120515197936 }, { "epoch": 0.8413303536216643, "grad_norm": 0.55859375, "learning_rate": 6.691306777251762e-05, "loss": 1.7932760620117187, "step": 6400, "token_acc": 0.7665305845357507 }, { "epoch": 0.8479032470093335, "grad_norm": 0.6484375, "learning_rate": 6.640069677635282e-05, "loss": 1.8109786987304688, "step": 6450, "token_acc": 0.7631320021044253 }, { "epoch": 0.8544761403970028, "grad_norm": 0.5625, "learning_rate": 6.58863885475691e-05, "loss": 1.7983740234375, "step": 6500, "token_acc": 0.7650729466919018 }, { "epoch": 0.861049033784672, "grad_norm": 0.625, "learning_rate": 6.537020383569988e-05, "loss": 1.8120062255859375, "step": 6550, "token_acc": 0.7633286718136009 }, { "epoch": 0.8676219271723412, "grad_norm": 0.68359375, "learning_rate": 6.485220361192677e-05, "loss": 1.826031951904297, "step": 6600, "token_acc": 0.7621315968131627 }, { "epoch": 0.8741948205600105, "grad_norm": 0.7421875, "learning_rate": 6.433244906187763e-05, "loss": 1.8025027465820314, "step": 6650, "token_acc": 0.763683785326105 }, { "epoch": 0.8807677139476797, "grad_norm": 0.61328125, "learning_rate": 6.381100157839948e-05, "loss": 1.8083682250976563, "step": 6700, "token_acc": 0.7638322653360178 }, { "epoch": 0.887340607335349, "grad_norm": 0.55078125, "learning_rate": 6.328792275430682e-05, "loss": 1.8106515502929688, "step": 6750, "token_acc": 0.7645292486420634 }, { "epoch": 0.8939135007230182, "grad_norm": 0.6171875, "learning_rate": 6.276327437510636e-05, "loss": 1.7926376342773438, "step": 6800, "token_acc": 0.7669320628731736 }, { "epoch": 0.9004863941106875, "grad_norm": 0.7578125, "learning_rate": 6.22371184116989e-05, "loss": 1.7977276611328126, "step": 6850, "token_acc": 0.7638069323509712 }, { "epoch": 0.9070592874983567, "grad_norm": 0.609375, "learning_rate": 6.170951701305951e-05, "loss": 1.8151174926757812, "step": 6900, "token_acc": 0.7588274415858517 }, { "epoch": 0.913632180886026, "grad_norm": 0.64453125, "learning_rate": 6.118053249889652e-05, "loss": 1.7749380493164062, "step": 6950, "token_acc": 0.7635055545232533 }, { "epoch": 0.9202050742736952, "grad_norm": 0.71875, "learning_rate": 6.0650227352290345e-05, "loss": 1.7828396606445311, "step": 7000, "token_acc": 0.7631919048643416 }, { "epoch": 0.9267779676613646, "grad_norm": 0.55859375, "learning_rate": 6.011866421231309e-05, "loss": 1.7750047302246095, "step": 7050, "token_acc": 0.7658884744785802 }, { "epoch": 0.9333508610490338, "grad_norm": 0.703125, "learning_rate": 5.9585905866629687e-05, "loss": 1.7743110656738281, "step": 7100, "token_acc": 0.764956263144499 }, { "epoch": 0.9399237544367031, "grad_norm": 0.70703125, "learning_rate": 5.905201524408148e-05, "loss": 1.7661270141601562, "step": 7150, "token_acc": 0.7625542988555213 }, { "epoch": 0.9464966478243723, "grad_norm": 0.62890625, "learning_rate": 5.8517055407253115e-05, "loss": 1.7674331665039062, "step": 7200, "token_acc": 0.7658641448139589 }, { "epoch": 0.9530695412120416, "grad_norm": 0.7265625, "learning_rate": 5.798108954502368e-05, "loss": 1.76580810546875, "step": 7250, "token_acc": 0.764363801032948 }, { "epoch": 0.9596424345997108, "grad_norm": 0.6953125, "learning_rate": 5.7444180965102936e-05, "loss": 1.764315185546875, "step": 7300, "token_acc": 0.765748932533409 }, { "epoch": 0.9662153279873801, "grad_norm": 0.5390625, "learning_rate": 5.69063930865534e-05, "loss": 1.7479220581054689, "step": 7350, "token_acc": 0.7640023759364084 }, { "epoch": 0.9727882213750493, "grad_norm": 0.6796875, "learning_rate": 5.63677894322994e-05, "loss": 1.783513641357422, "step": 7400, "token_acc": 0.7652048454713193 }, { "epoch": 0.9793611147627186, "grad_norm": 0.6484375, "learning_rate": 5.5828433621623845e-05, "loss": 1.7546864318847657, "step": 7450, "token_acc": 0.7662415623916118 }, { "epoch": 0.9859340081503878, "grad_norm": 0.60546875, "learning_rate": 5.5288389362653484e-05, "loss": 1.7443992614746093, "step": 7500, "token_acc": 0.7674998687827633 }, { "epoch": 0.9925069015380571, "grad_norm": 0.625, "learning_rate": 5.474772044483391e-05, "loss": 1.7637782287597656, "step": 7550, "token_acc": 0.7650653153454292 }, { "epoch": 0.9990797949257263, "grad_norm": 0.57421875, "learning_rate": 5.420649073139469e-05, "loss": 1.744835205078125, "step": 7600, "token_acc": 0.7651500535549154 }, { "epoch": 1.0056526883133956, "grad_norm": 0.57421875, "learning_rate": 5.366476415180599e-05, "loss": 1.7677224731445313, "step": 7650, "token_acc": 0.7611829185169189 }, { "epoch": 1.0122255817010648, "grad_norm": 0.546875, "learning_rate": 5.3122604694227265e-05, "loss": 1.731588134765625, "step": 7700, "token_acc": 0.7665744247751762 }, { "epoch": 1.018798475088734, "grad_norm": 0.515625, "learning_rate": 5.258007639794907e-05, "loss": 1.7428884887695313, "step": 7750, "token_acc": 0.76526092110062 }, { "epoch": 1.0253713684764034, "grad_norm": 0.5390625, "learning_rate": 5.203724334582875e-05, "loss": 1.724066162109375, "step": 7800, "token_acc": 0.7678043302715252 }, { "epoch": 1.0319442618640726, "grad_norm": 0.578125, "learning_rate": 5.1494169656721104e-05, "loss": 1.7163406372070313, "step": 7850, "token_acc": 0.7682152833871836 }, { "epoch": 1.0385171552517418, "grad_norm": 0.734375, "learning_rate": 5.095091947790472e-05, "loss": 1.7165689086914062, "step": 7900, "token_acc": 0.7668357289737444 }, { "epoch": 1.045090048639411, "grad_norm": 0.58984375, "learning_rate": 5.040755697750496e-05, "loss": 1.7132667541503905, "step": 7950, "token_acc": 0.767668712380105 }, { "epoch": 1.0516629420270802, "grad_norm": 0.53515625, "learning_rate": 4.9864146336914465e-05, "loss": 1.7193359375, "step": 8000, "token_acc": 0.7657205294292607 }, { "epoch": 1.0516629420270802, "eval_loss": 1.837268590927124, "eval_runtime": 235.8095, "eval_samples_per_second": 186.901, "eval_steps_per_second": 2.922, "eval_token_acc": 0.7689604263199924, "step": 8000 }, { "epoch": 1.0582358354147496, "grad_norm": 0.64453125, "learning_rate": 4.9320751743212176e-05, "loss": 1.7165196228027344, "step": 8050, "token_acc": 0.7653232432176571 }, { "epoch": 1.0648087288024188, "grad_norm": 0.59765625, "learning_rate": 4.877743738158155e-05, "loss": 1.7286593627929687, "step": 8100, "token_acc": 0.7641165060152924 }, { "epoch": 1.071381622190088, "grad_norm": 0.51171875, "learning_rate": 4.823426742772917e-05, "loss": 1.695826873779297, "step": 8150, "token_acc": 0.7685074226887527 }, { "epoch": 1.0779545155777572, "grad_norm": 0.5703125, "learning_rate": 4.7691306040304306e-05, "loss": 1.7172344970703124, "step": 8200, "token_acc": 0.7652489671997466 }, { "epoch": 1.0845274089654267, "grad_norm": 0.7109375, "learning_rate": 4.714861735332058e-05, "loss": 1.6970980834960938, "step": 8250, "token_acc": 0.7678146398472478 }, { "epoch": 1.0911003023530959, "grad_norm": 0.55859375, "learning_rate": 4.6606265468580516e-05, "loss": 1.6961888122558593, "step": 8300, "token_acc": 0.7714541070556682 }, { "epoch": 1.097673195740765, "grad_norm": 0.546875, "learning_rate": 4.6064314448103974e-05, "loss": 1.6937094116210938, "step": 8350, "token_acc": 0.7699465368393305 }, { "epoch": 1.1042460891284342, "grad_norm": 0.56640625, "learning_rate": 4.5522828306561085e-05, "loss": 1.6934506225585937, "step": 8400, "token_acc": 0.7680296355759598 }, { "epoch": 1.1108189825161037, "grad_norm": 0.734375, "learning_rate": 4.498187100371105e-05, "loss": 1.703126220703125, "step": 8450, "token_acc": 0.7660449721996203 }, { "epoch": 1.1173918759037729, "grad_norm": 0.57421875, "learning_rate": 4.4441506436847194e-05, "loss": 1.6976077270507812, "step": 8500, "token_acc": 0.7660062252481052 }, { "epoch": 1.123964769291442, "grad_norm": 0.58203125, "learning_rate": 4.390179843324947e-05, "loss": 1.6787896728515626, "step": 8550, "token_acc": 0.7675875080012128 }, { "epoch": 1.1305376626791113, "grad_norm": 0.5546875, "learning_rate": 4.3362810742645344e-05, "loss": 1.671527099609375, "step": 8600, "token_acc": 0.7726572467785786 }, { "epoch": 1.1371105560667807, "grad_norm": 0.58984375, "learning_rate": 4.282460702967962e-05, "loss": 1.6855081176757813, "step": 8650, "token_acc": 0.7697742729365689 }, { "epoch": 1.1436834494544499, "grad_norm": 0.52734375, "learning_rate": 4.228725086639458e-05, "loss": 1.6703143310546875, "step": 8700, "token_acc": 0.7704307602426076 }, { "epoch": 1.150256342842119, "grad_norm": 0.56640625, "learning_rate": 4.175080572472082e-05, "loss": 1.6878749084472657, "step": 8750, "token_acc": 0.7682984359233098 }, { "epoch": 1.1568292362297883, "grad_norm": 0.59375, "learning_rate": 4.121533496898002e-05, "loss": 1.69472412109375, "step": 8800, "token_acc": 0.7653907536202668 }, { "epoch": 1.1634021296174577, "grad_norm": 0.5703125, "learning_rate": 4.068090184840047e-05, "loss": 1.6784718322753907, "step": 8850, "token_acc": 0.7674758636654547 }, { "epoch": 1.169975023005127, "grad_norm": 0.53515625, "learning_rate": 4.0147569489646135e-05, "loss": 1.6566871643066405, "step": 8900, "token_acc": 0.7731254763401035 }, { "epoch": 1.176547916392796, "grad_norm": 0.55078125, "learning_rate": 3.9615400889360146e-05, "loss": 1.6721833801269532, "step": 8950, "token_acc": 0.7681256368871351 }, { "epoch": 1.1831208097804653, "grad_norm": 0.609375, "learning_rate": 3.908445890672373e-05, "loss": 1.6834414672851563, "step": 9000, "token_acc": 0.7689145513676909 }, { "epoch": 1.1896937031681345, "grad_norm": 0.5, "learning_rate": 3.855480625603142e-05, "loss": 1.6795899963378906, "step": 9050, "token_acc": 0.7689981482717203 }, { "epoch": 1.196266596555804, "grad_norm": 0.515625, "learning_rate": 3.8026505499283184e-05, "loss": 1.6775094604492187, "step": 9100, "token_acc": 0.7678651418602145 }, { "epoch": 1.2028394899434731, "grad_norm": 0.57421875, "learning_rate": 3.749961903879477e-05, "loss": 1.6525213623046875, "step": 9150, "token_acc": 0.7686804131998874 }, { "epoch": 1.2094123833311423, "grad_norm": 0.53515625, "learning_rate": 3.6974209109826726e-05, "loss": 1.6840940856933593, "step": 9200, "token_acc": 0.7642316750821783 }, { "epoch": 1.2159852767188117, "grad_norm": 0.494140625, "learning_rate": 3.645033777323339e-05, "loss": 1.6511599731445312, "step": 9250, "token_acc": 0.7737184378932712 }, { "epoch": 1.222558170106481, "grad_norm": 0.5234375, "learning_rate": 3.5928066908132144e-05, "loss": 1.6393515014648437, "step": 9300, "token_acc": 0.7703170812446798 }, { "epoch": 1.2291310634941501, "grad_norm": 0.5234375, "learning_rate": 3.5407458204594426e-05, "loss": 1.6625300598144532, "step": 9350, "token_acc": 0.7698629299985743 }, { "epoch": 1.2357039568818193, "grad_norm": 0.5859375, "learning_rate": 3.488857315635893e-05, "loss": 1.6773255920410157, "step": 9400, "token_acc": 0.7693895226882114 }, { "epoch": 1.2422768502694885, "grad_norm": 0.54296875, "learning_rate": 3.437147305356807e-05, "loss": 1.6579641723632812, "step": 9450, "token_acc": 0.7706150398406374 }, { "epoch": 1.248849743657158, "grad_norm": 0.5078125, "learning_rate": 3.3856218975528434e-05, "loss": 1.6528695678710938, "step": 9500, "token_acc": 0.7691562115211821 }, { "epoch": 1.2554226370448271, "grad_norm": 0.5234375, "learning_rate": 3.334287178349611e-05, "loss": 1.6566635131835938, "step": 9550, "token_acc": 0.7694686428681645 }, { "epoch": 1.2619955304324963, "grad_norm": 0.53515625, "learning_rate": 3.2831492113487904e-05, "loss": 1.6695437622070313, "step": 9600, "token_acc": 0.7692787363152542 }, { "epoch": 1.2685684238201658, "grad_norm": 0.50390625, "learning_rate": 3.2322140369119045e-05, "loss": 1.670698699951172, "step": 9650, "token_acc": 0.7669365404642725 }, { "epoch": 1.275141317207835, "grad_norm": 0.52734375, "learning_rate": 3.181487671446836e-05, "loss": 1.646166534423828, "step": 9700, "token_acc": 0.7694847008702814 }, { "epoch": 1.2817142105955042, "grad_norm": 0.53515625, "learning_rate": 3.130976106697174e-05, "loss": 1.6512442016601563, "step": 9750, "token_acc": 0.7689283254541929 }, { "epoch": 1.2882871039831734, "grad_norm": 0.484375, "learning_rate": 3.080685309034487e-05, "loss": 1.6418820190429688, "step": 9800, "token_acc": 0.7712326161812989 }, { "epoch": 1.2948599973708426, "grad_norm": 0.55078125, "learning_rate": 3.0306212187535653e-05, "loss": 1.6504058837890625, "step": 9850, "token_acc": 0.7678108326514603 }, { "epoch": 1.301432890758512, "grad_norm": 0.52734375, "learning_rate": 2.9807897493707703e-05, "loss": 1.6451980590820312, "step": 9900, "token_acc": 0.7685139265616777 }, { "epoch": 1.3080057841461812, "grad_norm": 0.48046875, "learning_rate": 2.9311967869255324e-05, "loss": 1.6350534057617188, "step": 9950, "token_acc": 0.7734443379837148 }, { "epoch": 1.3145786775338504, "grad_norm": 0.5390625, "learning_rate": 2.881848189285105e-05, "loss": 1.6401956176757813, "step": 10000, "token_acc": 0.7687987154069689 }, { "epoch": 1.3145786775338504, "eval_loss": 1.7793248891830444, "eval_runtime": 236.4249, "eval_samples_per_second": 186.414, "eval_steps_per_second": 2.914, "eval_token_acc": 0.7717742013781854, "step": 10000 }, { "epoch": 1.3211515709215196, "grad_norm": 0.51171875, "learning_rate": 2.8327497854526276e-05, "loss": 1.6347137451171876, "step": 10050, "token_acc": 0.7699386839582517 }, { "epoch": 1.3277244643091888, "grad_norm": 0.546875, "learning_rate": 2.783907374878623e-05, "loss": 1.6400608825683594, "step": 10100, "token_acc": 0.7713103096257532 }, { "epoch": 1.3342973576968582, "grad_norm": 0.54296875, "learning_rate": 2.7353267267759587e-05, "loss": 1.6206582641601563, "step": 10150, "token_acc": 0.7717408144832179 }, { "epoch": 1.3408702510845274, "grad_norm": 0.5234375, "learning_rate": 2.6870135794384084e-05, "loss": 1.6268215942382813, "step": 10200, "token_acc": 0.7686959485978778 }, { "epoch": 1.3474431444721966, "grad_norm": 0.48828125, "learning_rate": 2.63897363956284e-05, "loss": 1.626171875, "step": 10250, "token_acc": 0.7716723626001355 }, { "epoch": 1.354016037859866, "grad_norm": 0.5234375, "learning_rate": 2.591212581575153e-05, "loss": 1.6198342895507813, "step": 10300, "token_acc": 0.7731112873028669 }, { "epoch": 1.3605889312475352, "grad_norm": 0.498046875, "learning_rate": 2.543736046960019e-05, "loss": 1.6164779663085938, "step": 10350, "token_acc": 0.769608566595914 }, { "epoch": 1.3671618246352044, "grad_norm": 0.490234375, "learning_rate": 2.4965496435945106e-05, "loss": 1.641104736328125, "step": 10400, "token_acc": 0.7738014461806595 }, { "epoch": 1.3737347180228736, "grad_norm": 0.486328125, "learning_rate": 2.449658945085718e-05, "loss": 1.6213189697265624, "step": 10450, "token_acc": 0.768932852837636 }, { "epoch": 1.3803076114105428, "grad_norm": 0.494140625, "learning_rate": 2.4030694901123825e-05, "loss": 1.6324661254882813, "step": 10500, "token_acc": 0.7704852495864681 }, { "epoch": 1.3868805047982122, "grad_norm": 0.453125, "learning_rate": 2.3567867817706974e-05, "loss": 1.6237179565429687, "step": 10550, "token_acc": 0.770884503460658 }, { "epoch": 1.3934533981858814, "grad_norm": 0.60546875, "learning_rate": 2.310816286924261e-05, "loss": 1.6309237670898438, "step": 10600, "token_acc": 0.7696517291192732 }, { "epoch": 1.4000262915735506, "grad_norm": 0.51953125, "learning_rate": 2.2651634355583606e-05, "loss": 1.6198001098632813, "step": 10650, "token_acc": 0.7706210696067582 }, { "epoch": 1.40659918496122, "grad_norm": 0.462890625, "learning_rate": 2.2198336201385674e-05, "loss": 1.637750244140625, "step": 10700, "token_acc": 0.768926374214257 }, { "epoch": 1.4131720783488892, "grad_norm": 0.52734375, "learning_rate": 2.1748321949738088e-05, "loss": 1.6378421020507812, "step": 10750, "token_acc": 0.767715041979971 }, { "epoch": 1.4197449717365584, "grad_norm": 0.482421875, "learning_rate": 2.130164475583896e-05, "loss": 1.6255686950683594, "step": 10800, "token_acc": 0.7689490174738093 }, { "epoch": 1.4263178651242276, "grad_norm": 0.47265625, "learning_rate": 2.0858357380716826e-05, "loss": 1.6103607177734376, "step": 10850, "token_acc": 0.7713349007488887 }, { "epoch": 1.4328907585118968, "grad_norm": 0.51171875, "learning_rate": 2.041851218499844e-05, "loss": 1.6049491882324218, "step": 10900, "token_acc": 0.7700552131398218 }, { "epoch": 1.4394636518995663, "grad_norm": 0.50390625, "learning_rate": 1.998216112272407e-05, "loss": 1.6167376708984376, "step": 10950, "token_acc": 0.770873658059624 }, { "epoch": 1.4460365452872355, "grad_norm": 0.56640625, "learning_rate": 1.9549355735210663e-05, "loss": 1.607739715576172, "step": 11000, "token_acc": 0.7714620438930024 }, { "epoch": 1.4526094386749047, "grad_norm": 0.5390625, "learning_rate": 1.9120147144963918e-05, "loss": 1.6082345581054687, "step": 11050, "token_acc": 0.7727137897326997 }, { "epoch": 1.459182332062574, "grad_norm": 0.51171875, "learning_rate": 1.869458604963973e-05, "loss": 1.6116729736328126, "step": 11100, "token_acc": 0.7693582542656137 }, { "epoch": 1.465755225450243, "grad_norm": 0.482421875, "learning_rate": 1.827272271605581e-05, "loss": 1.6167997741699218, "step": 11150, "token_acc": 0.7716410540840345 }, { "epoch": 1.4723281188379125, "grad_norm": 0.4609375, "learning_rate": 1.785460697425422e-05, "loss": 1.6175106811523436, "step": 11200, "token_acc": 0.7678629010587413 }, { "epoch": 1.4789010122255817, "grad_norm": 0.50390625, "learning_rate": 1.7440288211615553e-05, "loss": 1.6349208068847656, "step": 11250, "token_acc": 0.767691050989249 }, { "epoch": 1.4854739056132509, "grad_norm": 0.470703125, "learning_rate": 1.7029815367025304e-05, "loss": 1.5947479248046874, "step": 11300, "token_acc": 0.7744744172137812 }, { "epoch": 1.4920467990009203, "grad_norm": 0.46484375, "learning_rate": 1.6623236925093293e-05, "loss": 1.6072760009765625, "step": 11350, "token_acc": 0.7699475558805804 }, { "epoch": 1.4986196923885895, "grad_norm": 0.48046875, "learning_rate": 1.622060091042666e-05, "loss": 1.6194985961914063, "step": 11400, "token_acc": 0.771240942645837 }, { "epoch": 1.5051925857762587, "grad_norm": 0.48828125, "learning_rate": 1.582195488195731e-05, "loss": 1.5986326599121095, "step": 11450, "token_acc": 0.7726320047065744 }, { "epoch": 1.5117654791639281, "grad_norm": 0.458984375, "learning_rate": 1.5427345927324305e-05, "loss": 1.6070111083984375, "step": 11500, "token_acc": 0.773540255743937 }, { "epoch": 1.518338372551597, "grad_norm": 0.4921875, "learning_rate": 1.5036820657311839e-05, "loss": 1.6082412719726562, "step": 11550, "token_acc": 0.769586028833374 }, { "epoch": 1.5249112659392665, "grad_norm": 0.4609375, "learning_rate": 1.4650425200343732e-05, "loss": 1.6070376586914064, "step": 11600, "token_acc": 0.7711919617970434 }, { "epoch": 1.5314841593269357, "grad_norm": 0.455078125, "learning_rate": 1.4268205197034717e-05, "loss": 1.6123899841308593, "step": 11650, "token_acc": 0.7703268714219823 }, { "epoch": 1.538057052714605, "grad_norm": 0.455078125, "learning_rate": 1.3890205794799476e-05, "loss": 1.615906982421875, "step": 11700, "token_acc": 0.7700723973792927 }, { "epoch": 1.5446299461022743, "grad_norm": 0.4453125, "learning_rate": 1.3516471642519784e-05, "loss": 1.5967388916015626, "step": 11750, "token_acc": 0.772738502438927 }, { "epoch": 1.5512028394899433, "grad_norm": 0.462890625, "learning_rate": 1.3147046885270736e-05, "loss": 1.5981399536132812, "step": 11800, "token_acc": 0.7728519929058771 }, { "epoch": 1.5577757328776127, "grad_norm": 0.45703125, "learning_rate": 1.2781975159106319e-05, "loss": 1.6243299865722656, "step": 11850, "token_acc": 0.7696561945272465 }, { "epoch": 1.564348626265282, "grad_norm": 0.474609375, "learning_rate": 1.24212995859052e-05, "loss": 1.6280474853515625, "step": 11900, "token_acc": 0.7673742244402482 }, { "epoch": 1.5709215196529511, "grad_norm": 0.50390625, "learning_rate": 1.2065062768277135e-05, "loss": 1.5971218872070312, "step": 11950, "token_acc": 0.7735671474840562 }, { "epoch": 1.5774944130406205, "grad_norm": 0.427734375, "learning_rate": 1.171330678453097e-05, "loss": 1.591236572265625, "step": 12000, "token_acc": 0.7735620511595184 }, { "epoch": 1.5774944130406205, "eval_loss": 1.754050612449646, "eval_runtime": 236.7394, "eval_samples_per_second": 186.167, "eval_steps_per_second": 2.91, "eval_token_acc": 0.7731684672723433, "step": 12000 } ], "logging_steps": 50, "max_steps": 15214, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.831322460026044e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }