| { |
| "best_global_step": 12000, |
| "best_metric": 1.75405061, |
| "best_model_checkpoint": "/scratch/prj0000000267/yuefan/UnifyTrajLLM/output_rope_instruct_gate_llmlow5_5e-4/v3-20251108-210106/checkpoint-12000", |
| "epoch": 1.5774944130406205, |
| "eval_steps": 2000, |
| "global_step": 12000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.00013145786775338504, |
| "grad_norm": 268.0, |
| "learning_rate": 1.314060446780552e-07, |
| "loss": 30.039020538330078, |
| "step": 1, |
| "token_acc": 0.0 |
| }, |
| { |
| "epoch": 0.006572893387669252, |
| "grad_norm": 47.5, |
| "learning_rate": 6.5703022339027605e-06, |
| "loss": 24.82280123963648, |
| "step": 50, |
| "token_acc": 0.036200167198766994 |
| }, |
| { |
| "epoch": 0.013145786775338505, |
| "grad_norm": 59.5, |
| "learning_rate": 1.3140604467805521e-05, |
| "loss": 14.215625, |
| "step": 100, |
| "token_acc": 0.43851580932023954 |
| }, |
| { |
| "epoch": 0.019718680163007755, |
| "grad_norm": 10.1875, |
| "learning_rate": 1.9710906701708278e-05, |
| "loss": 6.50881591796875, |
| "step": 150, |
| "token_acc": 0.6614513467387406 |
| }, |
| { |
| "epoch": 0.02629157355067701, |
| "grad_norm": 32.25, |
| "learning_rate": 2.6281208935611042e-05, |
| "loss": 6.06468505859375, |
| "step": 200, |
| "token_acc": 0.6807562909100595 |
| }, |
| { |
| "epoch": 0.03286446693834626, |
| "grad_norm": 10.9375, |
| "learning_rate": 3.2851511169513796e-05, |
| "loss": 5.703701171875, |
| "step": 250, |
| "token_acc": 0.6826512527189237 |
| }, |
| { |
| "epoch": 0.03943736032601551, |
| "grad_norm": 14.3125, |
| "learning_rate": 3.9421813403416556e-05, |
| "loss": 5.412930908203125, |
| "step": 300, |
| "token_acc": 0.6959136632453732 |
| }, |
| { |
| "epoch": 0.046010253713684765, |
| "grad_norm": 7.03125, |
| "learning_rate": 4.5992115637319317e-05, |
| "loss": 5.1832080078125, |
| "step": 350, |
| "token_acc": 0.6958016739273748 |
| }, |
| { |
| "epoch": 0.05258314710135402, |
| "grad_norm": 6.125, |
| "learning_rate": 5.2562417871222084e-05, |
| "loss": 5.019949340820313, |
| "step": 400, |
| "token_acc": 0.7036372985827332 |
| }, |
| { |
| "epoch": 0.059156040489023266, |
| "grad_norm": 5.09375, |
| "learning_rate": 5.913272010512484e-05, |
| "loss": 4.91180908203125, |
| "step": 450, |
| "token_acc": 0.7090584023609108 |
| }, |
| { |
| "epoch": 0.06572893387669251, |
| "grad_norm": 6.90625, |
| "learning_rate": 6.570302233902759e-05, |
| "loss": 4.86035888671875, |
| "step": 500, |
| "token_acc": 0.7103483463054483 |
| }, |
| { |
| "epoch": 0.07230182726436177, |
| "grad_norm": 4.21875, |
| "learning_rate": 7.227332457293036e-05, |
| "loss": 4.782819213867188, |
| "step": 550, |
| "token_acc": 0.711807526156599 |
| }, |
| { |
| "epoch": 0.07887472065203102, |
| "grad_norm": 4.3125, |
| "learning_rate": 7.884362680683311e-05, |
| "loss": 4.707298889160156, |
| "step": 600, |
| "token_acc": 0.7184096746149201 |
| }, |
| { |
| "epoch": 0.08544761403970028, |
| "grad_norm": 5.25, |
| "learning_rate": 8.541392904073588e-05, |
| "loss": 4.594347534179687, |
| "step": 650, |
| "token_acc": 0.7216674015037796 |
| }, |
| { |
| "epoch": 0.09202050742736953, |
| "grad_norm": 3.609375, |
| "learning_rate": 9.198423127463863e-05, |
| "loss": 4.58609375, |
| "step": 700, |
| "token_acc": 0.714123727321825 |
| }, |
| { |
| "epoch": 0.09859340081503878, |
| "grad_norm": 3.421875, |
| "learning_rate": 9.85545335085414e-05, |
| "loss": 4.494258422851562, |
| "step": 750, |
| "token_acc": 0.7178661247995335 |
| }, |
| { |
| "epoch": 0.10516629420270804, |
| "grad_norm": 4.75, |
| "learning_rate": 9.999820340427517e-05, |
| "loss": 4.393697509765625, |
| "step": 800, |
| "token_acc": 0.719993257471585 |
| }, |
| { |
| "epoch": 0.11173918759037728, |
| "grad_norm": 3.28125, |
| "learning_rate": 9.999064399990964e-05, |
| "loss": 4.324550170898437, |
| "step": 850, |
| "token_acc": 0.7193472614965004 |
| }, |
| { |
| "epoch": 0.11831208097804653, |
| "grad_norm": 3.359375, |
| "learning_rate": 9.997717975457807e-05, |
| "loss": 4.1653680419921875, |
| "step": 900, |
| "token_acc": 0.7247381121504013 |
| }, |
| { |
| "epoch": 0.12488497436571579, |
| "grad_norm": 1.5546875, |
| "learning_rate": 9.995781225866254e-05, |
| "loss": 4.05832275390625, |
| "step": 950, |
| "token_acc": 0.7224337955208558 |
| }, |
| { |
| "epoch": 0.13145786775338503, |
| "grad_norm": 2.21875, |
| "learning_rate": 9.993254379983084e-05, |
| "loss": 3.977420654296875, |
| "step": 1000, |
| "token_acc": 0.7213713367669359 |
| }, |
| { |
| "epoch": 0.13803076114105428, |
| "grad_norm": 1.59375, |
| "learning_rate": 9.990137736276604e-05, |
| "loss": 3.8511199951171875, |
| "step": 1050, |
| "token_acc": 0.7259079629296982 |
| }, |
| { |
| "epoch": 0.14460365452872354, |
| "grad_norm": 2.109375, |
| "learning_rate": 9.98643166288141e-05, |
| "loss": 3.7238555908203126, |
| "step": 1100, |
| "token_acc": 0.729174537368435 |
| }, |
| { |
| "epoch": 0.1511765479163928, |
| "grad_norm": 3.484375, |
| "learning_rate": 9.982136597554896e-05, |
| "loss": 3.6605801391601562, |
| "step": 1150, |
| "token_acc": 0.7267878333802646 |
| }, |
| { |
| "epoch": 0.15774944130406204, |
| "grad_norm": 1.765625, |
| "learning_rate": 9.977253047625546e-05, |
| "loss": 3.586345520019531, |
| "step": 1200, |
| "token_acc": 0.7296032337886851 |
| }, |
| { |
| "epoch": 0.1643223346917313, |
| "grad_norm": 1.875, |
| "learning_rate": 9.971781589933012e-05, |
| "loss": 3.5275897216796874, |
| "step": 1250, |
| "token_acc": 0.7274431196530496 |
| }, |
| { |
| "epoch": 0.17089522807940055, |
| "grad_norm": 5.34375, |
| "learning_rate": 9.965722870759977e-05, |
| "loss": 3.4567681884765626, |
| "step": 1300, |
| "token_acc": 0.7260510593991347 |
| }, |
| { |
| "epoch": 0.1774681214670698, |
| "grad_norm": 2.984375, |
| "learning_rate": 9.959077605755818e-05, |
| "loss": 3.35341064453125, |
| "step": 1350, |
| "token_acc": 0.7315002945127964 |
| }, |
| { |
| "epoch": 0.18404101485473906, |
| "grad_norm": 2.640625, |
| "learning_rate": 9.951846579852069e-05, |
| "loss": 3.2548678588867186, |
| "step": 1400, |
| "token_acc": 0.7334485568361279 |
| }, |
| { |
| "epoch": 0.19061390824240831, |
| "grad_norm": 2.328125, |
| "learning_rate": 9.944030647169715e-05, |
| "loss": 3.1699752807617188, |
| "step": 1450, |
| "token_acc": 0.7346018069265517 |
| }, |
| { |
| "epoch": 0.19718680163007757, |
| "grad_norm": 3.75, |
| "learning_rate": 9.935630730918297e-05, |
| "loss": 3.123944091796875, |
| "step": 1500, |
| "token_acc": 0.7325574233567774 |
| }, |
| { |
| "epoch": 0.20375969501774682, |
| "grad_norm": 2.328125, |
| "learning_rate": 9.926647823286865e-05, |
| "loss": 3.031203308105469, |
| "step": 1550, |
| "token_acc": 0.7343244664345068 |
| }, |
| { |
| "epoch": 0.21033258840541608, |
| "grad_norm": 2.09375, |
| "learning_rate": 9.917082985326782e-05, |
| "loss": 2.9396633911132812, |
| "step": 1600, |
| "token_acc": 0.736209056167852 |
| }, |
| { |
| "epoch": 0.2169054817930853, |
| "grad_norm": 1.921875, |
| "learning_rate": 9.906937346826395e-05, |
| "loss": 2.8921356201171875, |
| "step": 1650, |
| "token_acc": 0.7373535529118604 |
| }, |
| { |
| "epoch": 0.22347837518075456, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.896212106177583e-05, |
| "loss": 2.8311395263671875, |
| "step": 1700, |
| "token_acc": 0.7392403929710977 |
| }, |
| { |
| "epoch": 0.2300512685684238, |
| "grad_norm": 2.015625, |
| "learning_rate": 9.884908530234208e-05, |
| "loss": 2.7363882446289063, |
| "step": 1750, |
| "token_acc": 0.7410795625843831 |
| }, |
| { |
| "epoch": 0.23662416195609307, |
| "grad_norm": 1.5546875, |
| "learning_rate": 9.873027954162471e-05, |
| "loss": 2.6730242919921876, |
| "step": 1800, |
| "token_acc": 0.7443422077792354 |
| }, |
| { |
| "epoch": 0.24319705534376232, |
| "grad_norm": 1.84375, |
| "learning_rate": 9.860571781283208e-05, |
| "loss": 2.6252935791015624, |
| "step": 1850, |
| "token_acc": 0.7444647858608681 |
| }, |
| { |
| "epoch": 0.24976994873143157, |
| "grad_norm": 1.6640625, |
| "learning_rate": 9.847541482906129e-05, |
| "loss": 2.5712957763671875, |
| "step": 1900, |
| "token_acc": 0.7508503287266872 |
| }, |
| { |
| "epoch": 0.25634284211910086, |
| "grad_norm": 1.265625, |
| "learning_rate": 9.833938598156025e-05, |
| "loss": 2.5640655517578126, |
| "step": 1950, |
| "token_acc": 0.7425811658922213 |
| }, |
| { |
| "epoch": 0.26291573550677005, |
| "grad_norm": 1.1953125, |
| "learning_rate": 9.819764733790979e-05, |
| "loss": 2.5158842468261717, |
| "step": 2000, |
| "token_acc": 0.7452882362784471 |
| }, |
| { |
| "epoch": 0.26291573550677005, |
| "eval_loss": 2.518305778503418, |
| "eval_runtime": 236.0729, |
| "eval_samples_per_second": 186.692, |
| "eval_steps_per_second": 2.919, |
| "eval_token_acc": 0.749429720552148, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.2694886288944393, |
| "grad_norm": 1.4765625, |
| "learning_rate": 9.805021564012564e-05, |
| "loss": 2.4857614135742185, |
| "step": 2050, |
| "token_acc": 0.7441553323650091 |
| }, |
| { |
| "epoch": 0.27606152228210856, |
| "grad_norm": 2.421875, |
| "learning_rate": 9.789710830268099e-05, |
| "loss": 2.450667724609375, |
| "step": 2100, |
| "token_acc": 0.7500234051090009 |
| }, |
| { |
| "epoch": 0.2826344156697778, |
| "grad_norm": 1.046875, |
| "learning_rate": 9.773834341044944e-05, |
| "loss": 2.4290037536621094, |
| "step": 2150, |
| "token_acc": 0.7513961437248966 |
| }, |
| { |
| "epoch": 0.28920730905744707, |
| "grad_norm": 1.8203125, |
| "learning_rate": 9.757393971656888e-05, |
| "loss": 2.413728942871094, |
| "step": 2200, |
| "token_acc": 0.7452505502003465 |
| }, |
| { |
| "epoch": 0.2957802024451163, |
| "grad_norm": 1.7578125, |
| "learning_rate": 9.740391664022633e-05, |
| "loss": 2.3729684448242185, |
| "step": 2250, |
| "token_acc": 0.752083845606853 |
| }, |
| { |
| "epoch": 0.3023530958327856, |
| "grad_norm": 1.2421875, |
| "learning_rate": 9.722829426436427e-05, |
| "loss": 2.3652894592285154, |
| "step": 2300, |
| "token_acc": 0.7517237172802866 |
| }, |
| { |
| "epoch": 0.30892598922045483, |
| "grad_norm": 0.8125, |
| "learning_rate": 9.704709333330836e-05, |
| "loss": 2.356060791015625, |
| "step": 2350, |
| "token_acc": 0.7475131194646989 |
| }, |
| { |
| "epoch": 0.3154988826081241, |
| "grad_norm": 0.90625, |
| "learning_rate": 9.686033525031719e-05, |
| "loss": 2.3459547424316405, |
| "step": 2400, |
| "token_acc": 0.7465522261190833 |
| }, |
| { |
| "epoch": 0.32207177599579334, |
| "grad_norm": 0.86328125, |
| "learning_rate": 9.666804207505414e-05, |
| "loss": 2.34242919921875, |
| "step": 2450, |
| "token_acc": 0.7503000046993743 |
| }, |
| { |
| "epoch": 0.3286446693834626, |
| "grad_norm": 1.203125, |
| "learning_rate": 9.647023652098174e-05, |
| "loss": 2.30553955078125, |
| "step": 2500, |
| "token_acc": 0.7505765418279411 |
| }, |
| { |
| "epoch": 0.33521756277113185, |
| "grad_norm": 1.296875, |
| "learning_rate": 9.626694195267876e-05, |
| "loss": 2.2867636108398437, |
| "step": 2550, |
| "token_acc": 0.7495737322589445 |
| }, |
| { |
| "epoch": 0.3417904561588011, |
| "grad_norm": 1.3046875, |
| "learning_rate": 9.605818238308038e-05, |
| "loss": 2.2838902282714844, |
| "step": 2600, |
| "token_acc": 0.7510741453019647 |
| }, |
| { |
| "epoch": 0.34836334954647036, |
| "grad_norm": 1.2578125, |
| "learning_rate": 9.584398247064188e-05, |
| "loss": 2.2479782104492188, |
| "step": 2650, |
| "token_acc": 0.7525476660092044 |
| }, |
| { |
| "epoch": 0.3549362429341396, |
| "grad_norm": 0.98046875, |
| "learning_rate": 9.562436751642593e-05, |
| "loss": 2.2379521179199218, |
| "step": 2700, |
| "token_acc": 0.7535541690112872 |
| }, |
| { |
| "epoch": 0.36150913632180887, |
| "grad_norm": 1.2109375, |
| "learning_rate": 9.539936346111416e-05, |
| "loss": 2.25480712890625, |
| "step": 2750, |
| "token_acc": 0.7502481934500133 |
| }, |
| { |
| "epoch": 0.3680820297094781, |
| "grad_norm": 0.73046875, |
| "learning_rate": 9.516899688194294e-05, |
| "loss": 2.1890530395507812, |
| "step": 2800, |
| "token_acc": 0.7559071920628226 |
| }, |
| { |
| "epoch": 0.3746549230971474, |
| "grad_norm": 0.99609375, |
| "learning_rate": 9.493329498956421e-05, |
| "loss": 2.2252967834472654, |
| "step": 2850, |
| "token_acc": 0.7540685282249956 |
| }, |
| { |
| "epoch": 0.38122781648481663, |
| "grad_norm": 0.875, |
| "learning_rate": 9.469228562483132e-05, |
| "loss": 2.211038818359375, |
| "step": 2900, |
| "token_acc": 0.7534921970366274 |
| }, |
| { |
| "epoch": 0.3878007098724859, |
| "grad_norm": 1.0625, |
| "learning_rate": 9.444599725551061e-05, |
| "loss": 2.1635357666015627, |
| "step": 2950, |
| "token_acc": 0.7530642715579734 |
| }, |
| { |
| "epoch": 0.39437360326015514, |
| "grad_norm": 1.265625, |
| "learning_rate": 9.419445897291867e-05, |
| "loss": 2.1792333984375, |
| "step": 3000, |
| "token_acc": 0.7530076526518494 |
| }, |
| { |
| "epoch": 0.4009464966478244, |
| "grad_norm": 1.0390625, |
| "learning_rate": 9.393770048848622e-05, |
| "loss": 2.168623352050781, |
| "step": 3050, |
| "token_acc": 0.7545220973858319 |
| }, |
| { |
| "epoch": 0.40751939003549364, |
| "grad_norm": 0.703125, |
| "learning_rate": 9.367575213024861e-05, |
| "loss": 2.1656561279296875, |
| "step": 3100, |
| "token_acc": 0.7529528081537318 |
| }, |
| { |
| "epoch": 0.4140922834231629, |
| "grad_norm": 0.69921875, |
| "learning_rate": 9.340864483926343e-05, |
| "loss": 2.147900390625, |
| "step": 3150, |
| "token_acc": 0.7535216548028473 |
| }, |
| { |
| "epoch": 0.42066517681083215, |
| "grad_norm": 0.703125, |
| "learning_rate": 9.313641016595588e-05, |
| "loss": 2.1436308288574217, |
| "step": 3200, |
| "token_acc": 0.754756994891503 |
| }, |
| { |
| "epoch": 0.4272380701985014, |
| "grad_norm": 0.91796875, |
| "learning_rate": 9.285908026639207e-05, |
| "loss": 2.1488153076171876, |
| "step": 3250, |
| "token_acc": 0.7516223648809727 |
| }, |
| { |
| "epoch": 0.4338109635861706, |
| "grad_norm": 0.80859375, |
| "learning_rate": 9.257668789848067e-05, |
| "loss": 2.1125421142578125, |
| "step": 3300, |
| "token_acc": 0.7542638775798605 |
| }, |
| { |
| "epoch": 0.44038385697383986, |
| "grad_norm": 0.94921875, |
| "learning_rate": 9.228926641810367e-05, |
| "loss": 2.127976379394531, |
| "step": 3350, |
| "token_acc": 0.7534171662400647 |
| }, |
| { |
| "epoch": 0.4469567503615091, |
| "grad_norm": 0.90234375, |
| "learning_rate": 9.199684977517645e-05, |
| "loss": 2.117357025146484, |
| "step": 3400, |
| "token_acc": 0.754085423576444 |
| }, |
| { |
| "epoch": 0.45352964374917837, |
| "grad_norm": 0.82421875, |
| "learning_rate": 9.169947250963753e-05, |
| "loss": 2.1096246337890623, |
| "step": 3450, |
| "token_acc": 0.7573267065803522 |
| }, |
| { |
| "epoch": 0.4601025371368476, |
| "grad_norm": 0.68359375, |
| "learning_rate": 9.139716974736889e-05, |
| "loss": 2.08451904296875, |
| "step": 3500, |
| "token_acc": 0.7560729910200322 |
| }, |
| { |
| "epoch": 0.4666754305245169, |
| "grad_norm": 0.94140625, |
| "learning_rate": 9.108997719604687e-05, |
| "loss": 2.0704086303710936, |
| "step": 3550, |
| "token_acc": 0.758234912558547 |
| }, |
| { |
| "epoch": 0.47324832391218613, |
| "grad_norm": 0.94921875, |
| "learning_rate": 9.077793114092435e-05, |
| "loss": 2.0744793701171873, |
| "step": 3600, |
| "token_acc": 0.7553985593878892 |
| }, |
| { |
| "epoch": 0.4798212172998554, |
| "grad_norm": 0.87109375, |
| "learning_rate": 9.046106844054491e-05, |
| "loss": 2.055031433105469, |
| "step": 3650, |
| "token_acc": 0.7576514412150057 |
| }, |
| { |
| "epoch": 0.48639411068752464, |
| "grad_norm": 0.6875, |
| "learning_rate": 9.013942652238908e-05, |
| "loss": 2.0411907958984377, |
| "step": 3700, |
| "token_acc": 0.7550962098257407 |
| }, |
| { |
| "epoch": 0.4929670040751939, |
| "grad_norm": 0.75390625, |
| "learning_rate": 8.981304337845337e-05, |
| "loss": 2.0571356201171875, |
| "step": 3750, |
| "token_acc": 0.7560834470136957 |
| }, |
| { |
| "epoch": 0.49953989746286315, |
| "grad_norm": 0.67578125, |
| "learning_rate": 8.948195756076285e-05, |
| "loss": 2.074111022949219, |
| "step": 3800, |
| "token_acc": 0.7541681281518444 |
| }, |
| { |
| "epoch": 0.5061127908505324, |
| "grad_norm": 0.69140625, |
| "learning_rate": 8.914620817681729e-05, |
| "loss": 2.0392041015625, |
| "step": 3850, |
| "token_acc": 0.7577290307595985 |
| }, |
| { |
| "epoch": 0.5126856842382017, |
| "grad_norm": 1.0703125, |
| "learning_rate": 8.880583488497192e-05, |
| "loss": 2.0631610107421876, |
| "step": 3900, |
| "token_acc": 0.7556020800972513 |
| }, |
| { |
| "epoch": 0.5192585776258709, |
| "grad_norm": 0.69921875, |
| "learning_rate": 8.846087788975292e-05, |
| "loss": 2.0386505126953125, |
| "step": 3950, |
| "token_acc": 0.7588939592425994 |
| }, |
| { |
| "epoch": 0.5258314710135401, |
| "grad_norm": 0.9375, |
| "learning_rate": 8.811137793710863e-05, |
| "loss": 2.0313320922851563, |
| "step": 4000, |
| "token_acc": 0.7567113841020556 |
| }, |
| { |
| "epoch": 0.5258314710135401, |
| "eval_loss": 2.076582908630371, |
| "eval_runtime": 235.7989, |
| "eval_samples_per_second": 186.909, |
| "eval_steps_per_second": 2.922, |
| "eval_token_acc": 0.7596436572643577, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.5324043644012094, |
| "grad_norm": 0.82421875, |
| "learning_rate": 8.775737630959662e-05, |
| "loss": 2.0022723388671877, |
| "step": 4050, |
| "token_acc": 0.7579540762299285 |
| }, |
| { |
| "epoch": 0.5389772577888786, |
| "grad_norm": 0.93359375, |
| "learning_rate": 8.739891482150741e-05, |
| "loss": 2.0245912170410154, |
| "step": 4100, |
| "token_acc": 0.7564656900076155 |
| }, |
| { |
| "epoch": 0.5455501511765479, |
| "grad_norm": 0.76171875, |
| "learning_rate": 8.703603581392546e-05, |
| "loss": 2.0100286865234374, |
| "step": 4150, |
| "token_acc": 0.7567689057436112 |
| }, |
| { |
| "epoch": 0.5521230445642171, |
| "grad_norm": 0.8359375, |
| "learning_rate": 8.666878214972783e-05, |
| "loss": 2.021333923339844, |
| "step": 4200, |
| "token_acc": 0.7570281681148902 |
| }, |
| { |
| "epoch": 0.5586959379518864, |
| "grad_norm": 0.8046875, |
| "learning_rate": 8.629719720852138e-05, |
| "loss": 2.0115155029296874, |
| "step": 4250, |
| "token_acc": 0.7570202860215485 |
| }, |
| { |
| "epoch": 0.5652688313395556, |
| "grad_norm": 0.80859375, |
| "learning_rate": 8.59213248815187e-05, |
| "loss": 2.014189910888672, |
| "step": 4300, |
| "token_acc": 0.7566674687681726 |
| }, |
| { |
| "epoch": 0.5718417247272249, |
| "grad_norm": 0.69140625, |
| "learning_rate": 8.554120956635375e-05, |
| "loss": 1.9902659606933595, |
| "step": 4350, |
| "token_acc": 0.7575061325565429 |
| }, |
| { |
| "epoch": 0.5784146181148941, |
| "grad_norm": 0.7421875, |
| "learning_rate": 8.515689616183769e-05, |
| "loss": 1.9776287841796876, |
| "step": 4400, |
| "token_acc": 0.7585736758424619 |
| }, |
| { |
| "epoch": 0.5849875115025635, |
| "grad_norm": 0.609375, |
| "learning_rate": 8.476843006265545e-05, |
| "loss": 1.97283447265625, |
| "step": 4450, |
| "token_acc": 0.754149127932067 |
| }, |
| { |
| "epoch": 0.5915604048902326, |
| "grad_norm": 0.79296875, |
| "learning_rate": 8.437585715400384e-05, |
| "loss": 1.9807916259765626, |
| "step": 4500, |
| "token_acc": 0.7596238589600954 |
| }, |
| { |
| "epoch": 0.598133298277902, |
| "grad_norm": 0.84765625, |
| "learning_rate": 8.39792238061715e-05, |
| "loss": 1.9481539916992188, |
| "step": 4550, |
| "token_acc": 0.7577328978855814 |
| }, |
| { |
| "epoch": 0.6047061916655712, |
| "grad_norm": 0.875, |
| "learning_rate": 8.357857686906182e-05, |
| "loss": 1.9619242858886718, |
| "step": 4600, |
| "token_acc": 0.7588518194659313 |
| }, |
| { |
| "epoch": 0.6112790850532405, |
| "grad_norm": 0.66796875, |
| "learning_rate": 8.317396366665899e-05, |
| "loss": 1.9576710510253905, |
| "step": 4650, |
| "token_acc": 0.7596985079347987 |
| }, |
| { |
| "epoch": 0.6178519784409097, |
| "grad_norm": 0.6953125, |
| "learning_rate": 8.27654319914382e-05, |
| "loss": 1.9588572692871093, |
| "step": 4700, |
| "token_acc": 0.757741116751269 |
| }, |
| { |
| "epoch": 0.624424871828579, |
| "grad_norm": 0.6328125, |
| "learning_rate": 8.235303009872043e-05, |
| "loss": 1.954942626953125, |
| "step": 4750, |
| "token_acc": 0.761324026042421 |
| }, |
| { |
| "epoch": 0.6309977652162482, |
| "grad_norm": 0.94140625, |
| "learning_rate": 8.193680670097257e-05, |
| "loss": 1.9273374938964845, |
| "step": 4800, |
| "token_acc": 0.7592756976720991 |
| }, |
| { |
| "epoch": 0.6375706586039175, |
| "grad_norm": 0.73828125, |
| "learning_rate": 8.151681096205356e-05, |
| "loss": 1.94022216796875, |
| "step": 4850, |
| "token_acc": 0.7566143414515606 |
| }, |
| { |
| "epoch": 0.6441435519915867, |
| "grad_norm": 0.90234375, |
| "learning_rate": 8.109309249140721e-05, |
| "loss": 1.9436038208007813, |
| "step": 4900, |
| "token_acc": 0.7617455006768156 |
| }, |
| { |
| "epoch": 0.650716445379256, |
| "grad_norm": 0.83203125, |
| "learning_rate": 8.06657013382024e-05, |
| "loss": 1.937064208984375, |
| "step": 4950, |
| "token_acc": 0.7591963945791783 |
| }, |
| { |
| "epoch": 0.6572893387669252, |
| "grad_norm": 0.9140625, |
| "learning_rate": 8.023468798542127e-05, |
| "loss": 1.9416938781738282, |
| "step": 5000, |
| "token_acc": 0.7584784429628924 |
| }, |
| { |
| "epoch": 0.6638622321545945, |
| "grad_norm": 0.87890625, |
| "learning_rate": 7.980010334389636e-05, |
| "loss": 1.9161361694335937, |
| "step": 5050, |
| "token_acc": 0.7541581670687739 |
| }, |
| { |
| "epoch": 0.6704351255422637, |
| "grad_norm": 0.67578125, |
| "learning_rate": 7.936199874629689e-05, |
| "loss": 1.9216696166992187, |
| "step": 5100, |
| "token_acc": 0.7606859291730552 |
| }, |
| { |
| "epoch": 0.677008018929933, |
| "grad_norm": 0.69921875, |
| "learning_rate": 7.892042594106555e-05, |
| "loss": 1.9201712036132812, |
| "step": 5150, |
| "token_acc": 0.7614792261222095 |
| }, |
| { |
| "epoch": 0.6835809123176022, |
| "grad_norm": 0.6640625, |
| "learning_rate": 7.847543708630593e-05, |
| "loss": 1.8924771118164063, |
| "step": 5200, |
| "token_acc": 0.7622566250217598 |
| }, |
| { |
| "epoch": 0.6901538057052714, |
| "grad_norm": 0.69921875, |
| "learning_rate": 7.80270847436218e-05, |
| "loss": 1.8791021728515624, |
| "step": 5250, |
| "token_acc": 0.7611149879556877 |
| }, |
| { |
| "epoch": 0.6967266990929407, |
| "grad_norm": 0.63671875, |
| "learning_rate": 7.757542187190838e-05, |
| "loss": 1.8818046569824218, |
| "step": 5300, |
| "token_acc": 0.7627985225662821 |
| }, |
| { |
| "epoch": 0.7032995924806099, |
| "grad_norm": 0.78515625, |
| "learning_rate": 7.712050182109711e-05, |
| "loss": 1.9103680419921876, |
| "step": 5350, |
| "token_acc": 0.7583346132272023 |
| }, |
| { |
| "epoch": 0.7098724858682792, |
| "grad_norm": 0.61328125, |
| "learning_rate": 7.666237832585382e-05, |
| "loss": 1.8824064636230469, |
| "step": 5400, |
| "token_acc": 0.7605793230321414 |
| }, |
| { |
| "epoch": 0.7164453792559484, |
| "grad_norm": 0.78515625, |
| "learning_rate": 7.620110549923181e-05, |
| "loss": 1.877305450439453, |
| "step": 5450, |
| "token_acc": 0.7612452387234705 |
| }, |
| { |
| "epoch": 0.7230182726436177, |
| "grad_norm": 0.7578125, |
| "learning_rate": 7.573673782628e-05, |
| "loss": 1.904554443359375, |
| "step": 5500, |
| "token_acc": 0.7584822432750704 |
| }, |
| { |
| "epoch": 0.7295911660312869, |
| "grad_norm": 0.86328125, |
| "learning_rate": 7.526933015760717e-05, |
| "loss": 1.8621942138671874, |
| "step": 5550, |
| "token_acc": 0.7622160103275084 |
| }, |
| { |
| "epoch": 0.7361640594189562, |
| "grad_norm": 0.76171875, |
| "learning_rate": 7.479893770290321e-05, |
| "loss": 1.8591368103027344, |
| "step": 5600, |
| "token_acc": 0.7647176822254823 |
| }, |
| { |
| "epoch": 0.7427369528066254, |
| "grad_norm": 0.76171875, |
| "learning_rate": 7.43256160244176e-05, |
| "loss": 1.860885009765625, |
| "step": 5650, |
| "token_acc": 0.7656942497192635 |
| }, |
| { |
| "epoch": 0.7493098461942947, |
| "grad_norm": 0.62890625, |
| "learning_rate": 7.38494210303967e-05, |
| "loss": 1.8532620239257813, |
| "step": 5700, |
| "token_acc": 0.7606205964388161 |
| }, |
| { |
| "epoch": 0.755882739581964, |
| "grad_norm": 0.765625, |
| "learning_rate": 7.337040896847967e-05, |
| "loss": 1.8677340698242189, |
| "step": 5750, |
| "token_acc": 0.7610945995293353 |
| }, |
| { |
| "epoch": 0.7624556329696333, |
| "grad_norm": 0.63671875, |
| "learning_rate": 7.288863641905481e-05, |
| "loss": 1.8541110229492188, |
| "step": 5800, |
| "token_acc": 0.7597710414081623 |
| }, |
| { |
| "epoch": 0.7690285263573025, |
| "grad_norm": 0.7265625, |
| "learning_rate": 7.240416028857617e-05, |
| "loss": 1.8557376098632812, |
| "step": 5850, |
| "token_acc": 0.7620176547719397 |
| }, |
| { |
| "epoch": 0.7756014197449718, |
| "grad_norm": 0.75390625, |
| "learning_rate": 7.191703780284187e-05, |
| "loss": 1.8637747192382812, |
| "step": 5900, |
| "token_acc": 0.7612350143995713 |
| }, |
| { |
| "epoch": 0.782174313132641, |
| "grad_norm": 0.7421875, |
| "learning_rate": 7.14273265002347e-05, |
| "loss": 1.8630572509765626, |
| "step": 5950, |
| "token_acc": 0.762536667545336 |
| }, |
| { |
| "epoch": 0.7887472065203103, |
| "grad_norm": 0.76171875, |
| "learning_rate": 7.093508422492568e-05, |
| "loss": 1.8501144409179688, |
| "step": 6000, |
| "token_acc": 0.7612453038222803 |
| }, |
| { |
| "epoch": 0.7887472065203103, |
| "eval_loss": 1.9278579950332642, |
| "eval_runtime": 236.8225, |
| "eval_samples_per_second": 186.101, |
| "eval_steps_per_second": 2.909, |
| "eval_token_acc": 0.7643631778364091, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.7953200999079795, |
| "grad_norm": 0.9609375, |
| "learning_rate": 7.044036912004159e-05, |
| "loss": 1.8581178283691406, |
| "step": 6050, |
| "token_acc": 0.7597096150613548 |
| }, |
| { |
| "epoch": 0.8018929932956488, |
| "grad_norm": 0.66015625, |
| "learning_rate": 6.99432396207972e-05, |
| "loss": 1.85968994140625, |
| "step": 6100, |
| "token_acc": 0.760983003636121 |
| }, |
| { |
| "epoch": 0.808465886683318, |
| "grad_norm": 0.8984375, |
| "learning_rate": 6.94437544475929e-05, |
| "loss": 1.8418545532226562, |
| "step": 6150, |
| "token_acc": 0.7633178669389553 |
| }, |
| { |
| "epoch": 0.8150387800709873, |
| "grad_norm": 0.640625, |
| "learning_rate": 6.894197259907879e-05, |
| "loss": 1.8265931701660156, |
| "step": 6200, |
| "token_acc": 0.7654604394264699 |
| }, |
| { |
| "epoch": 0.8216116734586565, |
| "grad_norm": 0.6953125, |
| "learning_rate": 6.843795334518576e-05, |
| "loss": 1.8271298217773437, |
| "step": 6250, |
| "token_acc": 0.7637430879196561 |
| }, |
| { |
| "epoch": 0.8281845668463258, |
| "grad_norm": 0.8203125, |
| "learning_rate": 6.79317562201246e-05, |
| "loss": 1.8288789367675782, |
| "step": 6300, |
| "token_acc": 0.7608325763635624 |
| }, |
| { |
| "epoch": 0.834757460233995, |
| "grad_norm": 0.6953125, |
| "learning_rate": 6.742344101535394e-05, |
| "loss": 1.8086236572265626, |
| "step": 6350, |
| "token_acc": 0.7633120515197936 |
| }, |
| { |
| "epoch": 0.8413303536216643, |
| "grad_norm": 0.55859375, |
| "learning_rate": 6.691306777251762e-05, |
| "loss": 1.7932760620117187, |
| "step": 6400, |
| "token_acc": 0.7665305845357507 |
| }, |
| { |
| "epoch": 0.8479032470093335, |
| "grad_norm": 0.6484375, |
| "learning_rate": 6.640069677635282e-05, |
| "loss": 1.8109786987304688, |
| "step": 6450, |
| "token_acc": 0.7631320021044253 |
| }, |
| { |
| "epoch": 0.8544761403970028, |
| "grad_norm": 0.5625, |
| "learning_rate": 6.58863885475691e-05, |
| "loss": 1.7983740234375, |
| "step": 6500, |
| "token_acc": 0.7650729466919018 |
| }, |
| { |
| "epoch": 0.861049033784672, |
| "grad_norm": 0.625, |
| "learning_rate": 6.537020383569988e-05, |
| "loss": 1.8120062255859375, |
| "step": 6550, |
| "token_acc": 0.7633286718136009 |
| }, |
| { |
| "epoch": 0.8676219271723412, |
| "grad_norm": 0.68359375, |
| "learning_rate": 6.485220361192677e-05, |
| "loss": 1.826031951904297, |
| "step": 6600, |
| "token_acc": 0.7621315968131627 |
| }, |
| { |
| "epoch": 0.8741948205600105, |
| "grad_norm": 0.7421875, |
| "learning_rate": 6.433244906187763e-05, |
| "loss": 1.8025027465820314, |
| "step": 6650, |
| "token_acc": 0.763683785326105 |
| }, |
| { |
| "epoch": 0.8807677139476797, |
| "grad_norm": 0.61328125, |
| "learning_rate": 6.381100157839948e-05, |
| "loss": 1.8083682250976563, |
| "step": 6700, |
| "token_acc": 0.7638322653360178 |
| }, |
| { |
| "epoch": 0.887340607335349, |
| "grad_norm": 0.55078125, |
| "learning_rate": 6.328792275430682e-05, |
| "loss": 1.8106515502929688, |
| "step": 6750, |
| "token_acc": 0.7645292486420634 |
| }, |
| { |
| "epoch": 0.8939135007230182, |
| "grad_norm": 0.6171875, |
| "learning_rate": 6.276327437510636e-05, |
| "loss": 1.7926376342773438, |
| "step": 6800, |
| "token_acc": 0.7669320628731736 |
| }, |
| { |
| "epoch": 0.9004863941106875, |
| "grad_norm": 0.7578125, |
| "learning_rate": 6.22371184116989e-05, |
| "loss": 1.7977276611328126, |
| "step": 6850, |
| "token_acc": 0.7638069323509712 |
| }, |
| { |
| "epoch": 0.9070592874983567, |
| "grad_norm": 0.609375, |
| "learning_rate": 6.170951701305951e-05, |
| "loss": 1.8151174926757812, |
| "step": 6900, |
| "token_acc": 0.7588274415858517 |
| }, |
| { |
| "epoch": 0.913632180886026, |
| "grad_norm": 0.64453125, |
| "learning_rate": 6.118053249889652e-05, |
| "loss": 1.7749380493164062, |
| "step": 6950, |
| "token_acc": 0.7635055545232533 |
| }, |
| { |
| "epoch": 0.9202050742736952, |
| "grad_norm": 0.71875, |
| "learning_rate": 6.0650227352290345e-05, |
| "loss": 1.7828396606445311, |
| "step": 7000, |
| "token_acc": 0.7631919048643416 |
| }, |
| { |
| "epoch": 0.9267779676613646, |
| "grad_norm": 0.55859375, |
| "learning_rate": 6.011866421231309e-05, |
| "loss": 1.7750047302246095, |
| "step": 7050, |
| "token_acc": 0.7658884744785802 |
| }, |
| { |
| "epoch": 0.9333508610490338, |
| "grad_norm": 0.703125, |
| "learning_rate": 5.9585905866629687e-05, |
| "loss": 1.7743110656738281, |
| "step": 7100, |
| "token_acc": 0.764956263144499 |
| }, |
| { |
| "epoch": 0.9399237544367031, |
| "grad_norm": 0.70703125, |
| "learning_rate": 5.905201524408148e-05, |
| "loss": 1.7661270141601562, |
| "step": 7150, |
| "token_acc": 0.7625542988555213 |
| }, |
| { |
| "epoch": 0.9464966478243723, |
| "grad_norm": 0.62890625, |
| "learning_rate": 5.8517055407253115e-05, |
| "loss": 1.7674331665039062, |
| "step": 7200, |
| "token_acc": 0.7658641448139589 |
| }, |
| { |
| "epoch": 0.9530695412120416, |
| "grad_norm": 0.7265625, |
| "learning_rate": 5.798108954502368e-05, |
| "loss": 1.76580810546875, |
| "step": 7250, |
| "token_acc": 0.764363801032948 |
| }, |
| { |
| "epoch": 0.9596424345997108, |
| "grad_norm": 0.6953125, |
| "learning_rate": 5.7444180965102936e-05, |
| "loss": 1.764315185546875, |
| "step": 7300, |
| "token_acc": 0.765748932533409 |
| }, |
| { |
| "epoch": 0.9662153279873801, |
| "grad_norm": 0.5390625, |
| "learning_rate": 5.69063930865534e-05, |
| "loss": 1.7479220581054689, |
| "step": 7350, |
| "token_acc": 0.7640023759364084 |
| }, |
| { |
| "epoch": 0.9727882213750493, |
| "grad_norm": 0.6796875, |
| "learning_rate": 5.63677894322994e-05, |
| "loss": 1.783513641357422, |
| "step": 7400, |
| "token_acc": 0.7652048454713193 |
| }, |
| { |
| "epoch": 0.9793611147627186, |
| "grad_norm": 0.6484375, |
| "learning_rate": 5.5828433621623845e-05, |
| "loss": 1.7546864318847657, |
| "step": 7450, |
| "token_acc": 0.7662415623916118 |
| }, |
| { |
| "epoch": 0.9859340081503878, |
| "grad_norm": 0.60546875, |
| "learning_rate": 5.5288389362653484e-05, |
| "loss": 1.7443992614746093, |
| "step": 7500, |
| "token_acc": 0.7674998687827633 |
| }, |
| { |
| "epoch": 0.9925069015380571, |
| "grad_norm": 0.625, |
| "learning_rate": 5.474772044483391e-05, |
| "loss": 1.7637782287597656, |
| "step": 7550, |
| "token_acc": 0.7650653153454292 |
| }, |
| { |
| "epoch": 0.9990797949257263, |
| "grad_norm": 0.57421875, |
| "learning_rate": 5.420649073139469e-05, |
| "loss": 1.744835205078125, |
| "step": 7600, |
| "token_acc": 0.7651500535549154 |
| }, |
| { |
| "epoch": 1.0056526883133956, |
| "grad_norm": 0.57421875, |
| "learning_rate": 5.366476415180599e-05, |
| "loss": 1.7677224731445313, |
| "step": 7650, |
| "token_acc": 0.7611829185169189 |
| }, |
| { |
| "epoch": 1.0122255817010648, |
| "grad_norm": 0.546875, |
| "learning_rate": 5.3122604694227265e-05, |
| "loss": 1.731588134765625, |
| "step": 7700, |
| "token_acc": 0.7665744247751762 |
| }, |
| { |
| "epoch": 1.018798475088734, |
| "grad_norm": 0.515625, |
| "learning_rate": 5.258007639794907e-05, |
| "loss": 1.7428884887695313, |
| "step": 7750, |
| "token_acc": 0.76526092110062 |
| }, |
| { |
| "epoch": 1.0253713684764034, |
| "grad_norm": 0.5390625, |
| "learning_rate": 5.203724334582875e-05, |
| "loss": 1.724066162109375, |
| "step": 7800, |
| "token_acc": 0.7678043302715252 |
| }, |
| { |
| "epoch": 1.0319442618640726, |
| "grad_norm": 0.578125, |
| "learning_rate": 5.1494169656721104e-05, |
| "loss": 1.7163406372070313, |
| "step": 7850, |
| "token_acc": 0.7682152833871836 |
| }, |
| { |
| "epoch": 1.0385171552517418, |
| "grad_norm": 0.734375, |
| "learning_rate": 5.095091947790472e-05, |
| "loss": 1.7165689086914062, |
| "step": 7900, |
| "token_acc": 0.7668357289737444 |
| }, |
| { |
| "epoch": 1.045090048639411, |
| "grad_norm": 0.58984375, |
| "learning_rate": 5.040755697750496e-05, |
| "loss": 1.7132667541503905, |
| "step": 7950, |
| "token_acc": 0.767668712380105 |
| }, |
| { |
| "epoch": 1.0516629420270802, |
| "grad_norm": 0.53515625, |
| "learning_rate": 4.9864146336914465e-05, |
| "loss": 1.7193359375, |
| "step": 8000, |
| "token_acc": 0.7657205294292607 |
| }, |
| { |
| "epoch": 1.0516629420270802, |
| "eval_loss": 1.837268590927124, |
| "eval_runtime": 235.8095, |
| "eval_samples_per_second": 186.901, |
| "eval_steps_per_second": 2.922, |
| "eval_token_acc": 0.7689604263199924, |
| "step": 8000 |
| }, |
| { |
| "epoch": 1.0582358354147496, |
| "grad_norm": 0.64453125, |
| "learning_rate": 4.9320751743212176e-05, |
| "loss": 1.7165196228027344, |
| "step": 8050, |
| "token_acc": 0.7653232432176571 |
| }, |
| { |
| "epoch": 1.0648087288024188, |
| "grad_norm": 0.59765625, |
| "learning_rate": 4.877743738158155e-05, |
| "loss": 1.7286593627929687, |
| "step": 8100, |
| "token_acc": 0.7641165060152924 |
| }, |
| { |
| "epoch": 1.071381622190088, |
| "grad_norm": 0.51171875, |
| "learning_rate": 4.823426742772917e-05, |
| "loss": 1.695826873779297, |
| "step": 8150, |
| "token_acc": 0.7685074226887527 |
| }, |
| { |
| "epoch": 1.0779545155777572, |
| "grad_norm": 0.5703125, |
| "learning_rate": 4.7691306040304306e-05, |
| "loss": 1.7172344970703124, |
| "step": 8200, |
| "token_acc": 0.7652489671997466 |
| }, |
| { |
| "epoch": 1.0845274089654267, |
| "grad_norm": 0.7109375, |
| "learning_rate": 4.714861735332058e-05, |
| "loss": 1.6970980834960938, |
| "step": 8250, |
| "token_acc": 0.7678146398472478 |
| }, |
| { |
| "epoch": 1.0911003023530959, |
| "grad_norm": 0.55859375, |
| "learning_rate": 4.6606265468580516e-05, |
| "loss": 1.6961888122558593, |
| "step": 8300, |
| "token_acc": 0.7714541070556682 |
| }, |
| { |
| "epoch": 1.097673195740765, |
| "grad_norm": 0.546875, |
| "learning_rate": 4.6064314448103974e-05, |
| "loss": 1.6937094116210938, |
| "step": 8350, |
| "token_acc": 0.7699465368393305 |
| }, |
| { |
| "epoch": 1.1042460891284342, |
| "grad_norm": 0.56640625, |
| "learning_rate": 4.5522828306561085e-05, |
| "loss": 1.6934506225585937, |
| "step": 8400, |
| "token_acc": 0.7680296355759598 |
| }, |
| { |
| "epoch": 1.1108189825161037, |
| "grad_norm": 0.734375, |
| "learning_rate": 4.498187100371105e-05, |
| "loss": 1.703126220703125, |
| "step": 8450, |
| "token_acc": 0.7660449721996203 |
| }, |
| { |
| "epoch": 1.1173918759037729, |
| "grad_norm": 0.57421875, |
| "learning_rate": 4.4441506436847194e-05, |
| "loss": 1.6976077270507812, |
| "step": 8500, |
| "token_acc": 0.7660062252481052 |
| }, |
| { |
| "epoch": 1.123964769291442, |
| "grad_norm": 0.58203125, |
| "learning_rate": 4.390179843324947e-05, |
| "loss": 1.6787896728515626, |
| "step": 8550, |
| "token_acc": 0.7675875080012128 |
| }, |
| { |
| "epoch": 1.1305376626791113, |
| "grad_norm": 0.5546875, |
| "learning_rate": 4.3362810742645344e-05, |
| "loss": 1.671527099609375, |
| "step": 8600, |
| "token_acc": 0.7726572467785786 |
| }, |
| { |
| "epoch": 1.1371105560667807, |
| "grad_norm": 0.58984375, |
| "learning_rate": 4.282460702967962e-05, |
| "loss": 1.6855081176757813, |
| "step": 8650, |
| "token_acc": 0.7697742729365689 |
| }, |
| { |
| "epoch": 1.1436834494544499, |
| "grad_norm": 0.52734375, |
| "learning_rate": 4.228725086639458e-05, |
| "loss": 1.6703143310546875, |
| "step": 8700, |
| "token_acc": 0.7704307602426076 |
| }, |
| { |
| "epoch": 1.150256342842119, |
| "grad_norm": 0.56640625, |
| "learning_rate": 4.175080572472082e-05, |
| "loss": 1.6878749084472657, |
| "step": 8750, |
| "token_acc": 0.7682984359233098 |
| }, |
| { |
| "epoch": 1.1568292362297883, |
| "grad_norm": 0.59375, |
| "learning_rate": 4.121533496898002e-05, |
| "loss": 1.69472412109375, |
| "step": 8800, |
| "token_acc": 0.7653907536202668 |
| }, |
| { |
| "epoch": 1.1634021296174577, |
| "grad_norm": 0.5703125, |
| "learning_rate": 4.068090184840047e-05, |
| "loss": 1.6784718322753907, |
| "step": 8850, |
| "token_acc": 0.7674758636654547 |
| }, |
| { |
| "epoch": 1.169975023005127, |
| "grad_norm": 0.53515625, |
| "learning_rate": 4.0147569489646135e-05, |
| "loss": 1.6566871643066405, |
| "step": 8900, |
| "token_acc": 0.7731254763401035 |
| }, |
| { |
| "epoch": 1.176547916392796, |
| "grad_norm": 0.55078125, |
| "learning_rate": 3.9615400889360146e-05, |
| "loss": 1.6721833801269532, |
| "step": 8950, |
| "token_acc": 0.7681256368871351 |
| }, |
| { |
| "epoch": 1.1831208097804653, |
| "grad_norm": 0.609375, |
| "learning_rate": 3.908445890672373e-05, |
| "loss": 1.6834414672851563, |
| "step": 9000, |
| "token_acc": 0.7689145513676909 |
| }, |
| { |
| "epoch": 1.1896937031681345, |
| "grad_norm": 0.5, |
| "learning_rate": 3.855480625603142e-05, |
| "loss": 1.6795899963378906, |
| "step": 9050, |
| "token_acc": 0.7689981482717203 |
| }, |
| { |
| "epoch": 1.196266596555804, |
| "grad_norm": 0.515625, |
| "learning_rate": 3.8026505499283184e-05, |
| "loss": 1.6775094604492187, |
| "step": 9100, |
| "token_acc": 0.7678651418602145 |
| }, |
| { |
| "epoch": 1.2028394899434731, |
| "grad_norm": 0.57421875, |
| "learning_rate": 3.749961903879477e-05, |
| "loss": 1.6525213623046875, |
| "step": 9150, |
| "token_acc": 0.7686804131998874 |
| }, |
| { |
| "epoch": 1.2094123833311423, |
| "grad_norm": 0.53515625, |
| "learning_rate": 3.6974209109826726e-05, |
| "loss": 1.6840940856933593, |
| "step": 9200, |
| "token_acc": 0.7642316750821783 |
| }, |
| { |
| "epoch": 1.2159852767188117, |
| "grad_norm": 0.494140625, |
| "learning_rate": 3.645033777323339e-05, |
| "loss": 1.6511599731445312, |
| "step": 9250, |
| "token_acc": 0.7737184378932712 |
| }, |
| { |
| "epoch": 1.222558170106481, |
| "grad_norm": 0.5234375, |
| "learning_rate": 3.5928066908132144e-05, |
| "loss": 1.6393515014648437, |
| "step": 9300, |
| "token_acc": 0.7703170812446798 |
| }, |
| { |
| "epoch": 1.2291310634941501, |
| "grad_norm": 0.5234375, |
| "learning_rate": 3.5407458204594426e-05, |
| "loss": 1.6625300598144532, |
| "step": 9350, |
| "token_acc": 0.7698629299985743 |
| }, |
| { |
| "epoch": 1.2357039568818193, |
| "grad_norm": 0.5859375, |
| "learning_rate": 3.488857315635893e-05, |
| "loss": 1.6773255920410157, |
| "step": 9400, |
| "token_acc": 0.7693895226882114 |
| }, |
| { |
| "epoch": 1.2422768502694885, |
| "grad_norm": 0.54296875, |
| "learning_rate": 3.437147305356807e-05, |
| "loss": 1.6579641723632812, |
| "step": 9450, |
| "token_acc": 0.7706150398406374 |
| }, |
| { |
| "epoch": 1.248849743657158, |
| "grad_norm": 0.5078125, |
| "learning_rate": 3.3856218975528434e-05, |
| "loss": 1.6528695678710938, |
| "step": 9500, |
| "token_acc": 0.7691562115211821 |
| }, |
| { |
| "epoch": 1.2554226370448271, |
| "grad_norm": 0.5234375, |
| "learning_rate": 3.334287178349611e-05, |
| "loss": 1.6566635131835938, |
| "step": 9550, |
| "token_acc": 0.7694686428681645 |
| }, |
| { |
| "epoch": 1.2619955304324963, |
| "grad_norm": 0.53515625, |
| "learning_rate": 3.2831492113487904e-05, |
| "loss": 1.6695437622070313, |
| "step": 9600, |
| "token_acc": 0.7692787363152542 |
| }, |
| { |
| "epoch": 1.2685684238201658, |
| "grad_norm": 0.50390625, |
| "learning_rate": 3.2322140369119045e-05, |
| "loss": 1.670698699951172, |
| "step": 9650, |
| "token_acc": 0.7669365404642725 |
| }, |
| { |
| "epoch": 1.275141317207835, |
| "grad_norm": 0.52734375, |
| "learning_rate": 3.181487671446836e-05, |
| "loss": 1.646166534423828, |
| "step": 9700, |
| "token_acc": 0.7694847008702814 |
| }, |
| { |
| "epoch": 1.2817142105955042, |
| "grad_norm": 0.53515625, |
| "learning_rate": 3.130976106697174e-05, |
| "loss": 1.6512442016601563, |
| "step": 9750, |
| "token_acc": 0.7689283254541929 |
| }, |
| { |
| "epoch": 1.2882871039831734, |
| "grad_norm": 0.484375, |
| "learning_rate": 3.080685309034487e-05, |
| "loss": 1.6418820190429688, |
| "step": 9800, |
| "token_acc": 0.7712326161812989 |
| }, |
| { |
| "epoch": 1.2948599973708426, |
| "grad_norm": 0.55078125, |
| "learning_rate": 3.0306212187535653e-05, |
| "loss": 1.6504058837890625, |
| "step": 9850, |
| "token_acc": 0.7678108326514603 |
| }, |
| { |
| "epoch": 1.301432890758512, |
| "grad_norm": 0.52734375, |
| "learning_rate": 2.9807897493707703e-05, |
| "loss": 1.6451980590820312, |
| "step": 9900, |
| "token_acc": 0.7685139265616777 |
| }, |
| { |
| "epoch": 1.3080057841461812, |
| "grad_norm": 0.48046875, |
| "learning_rate": 2.9311967869255324e-05, |
| "loss": 1.6350534057617188, |
| "step": 9950, |
| "token_acc": 0.7734443379837148 |
| }, |
| { |
| "epoch": 1.3145786775338504, |
| "grad_norm": 0.5390625, |
| "learning_rate": 2.881848189285105e-05, |
| "loss": 1.6401956176757813, |
| "step": 10000, |
| "token_acc": 0.7687987154069689 |
| }, |
| { |
| "epoch": 1.3145786775338504, |
| "eval_loss": 1.7793248891830444, |
| "eval_runtime": 236.4249, |
| "eval_samples_per_second": 186.414, |
| "eval_steps_per_second": 2.914, |
| "eval_token_acc": 0.7717742013781854, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.3211515709215196, |
| "grad_norm": 0.51171875, |
| "learning_rate": 2.8327497854526276e-05, |
| "loss": 1.6347137451171876, |
| "step": 10050, |
| "token_acc": 0.7699386839582517 |
| }, |
| { |
| "epoch": 1.3277244643091888, |
| "grad_norm": 0.546875, |
| "learning_rate": 2.783907374878623e-05, |
| "loss": 1.6400608825683594, |
| "step": 10100, |
| "token_acc": 0.7713103096257532 |
| }, |
| { |
| "epoch": 1.3342973576968582, |
| "grad_norm": 0.54296875, |
| "learning_rate": 2.7353267267759587e-05, |
| "loss": 1.6206582641601563, |
| "step": 10150, |
| "token_acc": 0.7717408144832179 |
| }, |
| { |
| "epoch": 1.3408702510845274, |
| "grad_norm": 0.5234375, |
| "learning_rate": 2.6870135794384084e-05, |
| "loss": 1.6268215942382813, |
| "step": 10200, |
| "token_acc": 0.7686959485978778 |
| }, |
| { |
| "epoch": 1.3474431444721966, |
| "grad_norm": 0.48828125, |
| "learning_rate": 2.63897363956284e-05, |
| "loss": 1.626171875, |
| "step": 10250, |
| "token_acc": 0.7716723626001355 |
| }, |
| { |
| "epoch": 1.354016037859866, |
| "grad_norm": 0.5234375, |
| "learning_rate": 2.591212581575153e-05, |
| "loss": 1.6198342895507813, |
| "step": 10300, |
| "token_acc": 0.7731112873028669 |
| }, |
| { |
| "epoch": 1.3605889312475352, |
| "grad_norm": 0.498046875, |
| "learning_rate": 2.543736046960019e-05, |
| "loss": 1.6164779663085938, |
| "step": 10350, |
| "token_acc": 0.769608566595914 |
| }, |
| { |
| "epoch": 1.3671618246352044, |
| "grad_norm": 0.490234375, |
| "learning_rate": 2.4965496435945106e-05, |
| "loss": 1.641104736328125, |
| "step": 10400, |
| "token_acc": 0.7738014461806595 |
| }, |
| { |
| "epoch": 1.3737347180228736, |
| "grad_norm": 0.486328125, |
| "learning_rate": 2.449658945085718e-05, |
| "loss": 1.6213189697265624, |
| "step": 10450, |
| "token_acc": 0.768932852837636 |
| }, |
| { |
| "epoch": 1.3803076114105428, |
| "grad_norm": 0.494140625, |
| "learning_rate": 2.4030694901123825e-05, |
| "loss": 1.6324661254882813, |
| "step": 10500, |
| "token_acc": 0.7704852495864681 |
| }, |
| { |
| "epoch": 1.3868805047982122, |
| "grad_norm": 0.453125, |
| "learning_rate": 2.3567867817706974e-05, |
| "loss": 1.6237179565429687, |
| "step": 10550, |
| "token_acc": 0.770884503460658 |
| }, |
| { |
| "epoch": 1.3934533981858814, |
| "grad_norm": 0.60546875, |
| "learning_rate": 2.310816286924261e-05, |
| "loss": 1.6309237670898438, |
| "step": 10600, |
| "token_acc": 0.7696517291192732 |
| }, |
| { |
| "epoch": 1.4000262915735506, |
| "grad_norm": 0.51953125, |
| "learning_rate": 2.2651634355583606e-05, |
| "loss": 1.6198001098632813, |
| "step": 10650, |
| "token_acc": 0.7706210696067582 |
| }, |
| { |
| "epoch": 1.40659918496122, |
| "grad_norm": 0.462890625, |
| "learning_rate": 2.2198336201385674e-05, |
| "loss": 1.637750244140625, |
| "step": 10700, |
| "token_acc": 0.768926374214257 |
| }, |
| { |
| "epoch": 1.4131720783488892, |
| "grad_norm": 0.52734375, |
| "learning_rate": 2.1748321949738088e-05, |
| "loss": 1.6378421020507812, |
| "step": 10750, |
| "token_acc": 0.767715041979971 |
| }, |
| { |
| "epoch": 1.4197449717365584, |
| "grad_norm": 0.482421875, |
| "learning_rate": 2.130164475583896e-05, |
| "loss": 1.6255686950683594, |
| "step": 10800, |
| "token_acc": 0.7689490174738093 |
| }, |
| { |
| "epoch": 1.4263178651242276, |
| "grad_norm": 0.47265625, |
| "learning_rate": 2.0858357380716826e-05, |
| "loss": 1.6103607177734376, |
| "step": 10850, |
| "token_acc": 0.7713349007488887 |
| }, |
| { |
| "epoch": 1.4328907585118968, |
| "grad_norm": 0.51171875, |
| "learning_rate": 2.041851218499844e-05, |
| "loss": 1.6049491882324218, |
| "step": 10900, |
| "token_acc": 0.7700552131398218 |
| }, |
| { |
| "epoch": 1.4394636518995663, |
| "grad_norm": 0.50390625, |
| "learning_rate": 1.998216112272407e-05, |
| "loss": 1.6167376708984376, |
| "step": 10950, |
| "token_acc": 0.770873658059624 |
| }, |
| { |
| "epoch": 1.4460365452872355, |
| "grad_norm": 0.56640625, |
| "learning_rate": 1.9549355735210663e-05, |
| "loss": 1.607739715576172, |
| "step": 11000, |
| "token_acc": 0.7714620438930024 |
| }, |
| { |
| "epoch": 1.4526094386749047, |
| "grad_norm": 0.5390625, |
| "learning_rate": 1.9120147144963918e-05, |
| "loss": 1.6082345581054687, |
| "step": 11050, |
| "token_acc": 0.7727137897326997 |
| }, |
| { |
| "epoch": 1.459182332062574, |
| "grad_norm": 0.51171875, |
| "learning_rate": 1.869458604963973e-05, |
| "loss": 1.6116729736328126, |
| "step": 11100, |
| "token_acc": 0.7693582542656137 |
| }, |
| { |
| "epoch": 1.465755225450243, |
| "grad_norm": 0.482421875, |
| "learning_rate": 1.827272271605581e-05, |
| "loss": 1.6167997741699218, |
| "step": 11150, |
| "token_acc": 0.7716410540840345 |
| }, |
| { |
| "epoch": 1.4723281188379125, |
| "grad_norm": 0.4609375, |
| "learning_rate": 1.785460697425422e-05, |
| "loss": 1.6175106811523436, |
| "step": 11200, |
| "token_acc": 0.7678629010587413 |
| }, |
| { |
| "epoch": 1.4789010122255817, |
| "grad_norm": 0.50390625, |
| "learning_rate": 1.7440288211615553e-05, |
| "loss": 1.6349208068847656, |
| "step": 11250, |
| "token_acc": 0.767691050989249 |
| }, |
| { |
| "epoch": 1.4854739056132509, |
| "grad_norm": 0.470703125, |
| "learning_rate": 1.7029815367025304e-05, |
| "loss": 1.5947479248046874, |
| "step": 11300, |
| "token_acc": 0.7744744172137812 |
| }, |
| { |
| "epoch": 1.4920467990009203, |
| "grad_norm": 0.46484375, |
| "learning_rate": 1.6623236925093293e-05, |
| "loss": 1.6072760009765625, |
| "step": 11350, |
| "token_acc": 0.7699475558805804 |
| }, |
| { |
| "epoch": 1.4986196923885895, |
| "grad_norm": 0.48046875, |
| "learning_rate": 1.622060091042666e-05, |
| "loss": 1.6194985961914063, |
| "step": 11400, |
| "token_acc": 0.771240942645837 |
| }, |
| { |
| "epoch": 1.5051925857762587, |
| "grad_norm": 0.48828125, |
| "learning_rate": 1.582195488195731e-05, |
| "loss": 1.5986326599121095, |
| "step": 11450, |
| "token_acc": 0.7726320047065744 |
| }, |
| { |
| "epoch": 1.5117654791639281, |
| "grad_norm": 0.458984375, |
| "learning_rate": 1.5427345927324305e-05, |
| "loss": 1.6070111083984375, |
| "step": 11500, |
| "token_acc": 0.773540255743937 |
| }, |
| { |
| "epoch": 1.518338372551597, |
| "grad_norm": 0.4921875, |
| "learning_rate": 1.5036820657311839e-05, |
| "loss": 1.6082412719726562, |
| "step": 11550, |
| "token_acc": 0.769586028833374 |
| }, |
| { |
| "epoch": 1.5249112659392665, |
| "grad_norm": 0.4609375, |
| "learning_rate": 1.4650425200343732e-05, |
| "loss": 1.6070376586914064, |
| "step": 11600, |
| "token_acc": 0.7711919617970434 |
| }, |
| { |
| "epoch": 1.5314841593269357, |
| "grad_norm": 0.455078125, |
| "learning_rate": 1.4268205197034717e-05, |
| "loss": 1.6123899841308593, |
| "step": 11650, |
| "token_acc": 0.7703268714219823 |
| }, |
| { |
| "epoch": 1.538057052714605, |
| "grad_norm": 0.455078125, |
| "learning_rate": 1.3890205794799476e-05, |
| "loss": 1.615906982421875, |
| "step": 11700, |
| "token_acc": 0.7700723973792927 |
| }, |
| { |
| "epoch": 1.5446299461022743, |
| "grad_norm": 0.4453125, |
| "learning_rate": 1.3516471642519784e-05, |
| "loss": 1.5967388916015626, |
| "step": 11750, |
| "token_acc": 0.772738502438927 |
| }, |
| { |
| "epoch": 1.5512028394899433, |
| "grad_norm": 0.462890625, |
| "learning_rate": 1.3147046885270736e-05, |
| "loss": 1.5981399536132812, |
| "step": 11800, |
| "token_acc": 0.7728519929058771 |
| }, |
| { |
| "epoch": 1.5577757328776127, |
| "grad_norm": 0.45703125, |
| "learning_rate": 1.2781975159106319e-05, |
| "loss": 1.6243299865722656, |
| "step": 11850, |
| "token_acc": 0.7696561945272465 |
| }, |
| { |
| "epoch": 1.564348626265282, |
| "grad_norm": 0.474609375, |
| "learning_rate": 1.24212995859052e-05, |
| "loss": 1.6280474853515625, |
| "step": 11900, |
| "token_acc": 0.7673742244402482 |
| }, |
| { |
| "epoch": 1.5709215196529511, |
| "grad_norm": 0.50390625, |
| "learning_rate": 1.2065062768277135e-05, |
| "loss": 1.5971218872070312, |
| "step": 11950, |
| "token_acc": 0.7735671474840562 |
| }, |
| { |
| "epoch": 1.5774944130406205, |
| "grad_norm": 0.427734375, |
| "learning_rate": 1.171330678453097e-05, |
| "loss": 1.591236572265625, |
| "step": 12000, |
| "token_acc": 0.7735620511595184 |
| }, |
| { |
| "epoch": 1.5774944130406205, |
| "eval_loss": 1.754050612449646, |
| "eval_runtime": 236.7394, |
| "eval_samples_per_second": 186.167, |
| "eval_steps_per_second": 2.91, |
| "eval_token_acc": 0.7731684672723433, |
| "step": 12000 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 15214, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 2000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 6.831322460026044e+18, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|