diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,68319 @@ +{ + "best_global_step": 8530, + "best_metric": 3.07828903, + "best_model_checkpoint": "/inspire/hdd/project/deepanalysis/guitao-25013/Muse/workspace/Finals/ckpt/Muse_1.7b_main_3e-4/v0-20251228-133339/checkpoint-8530", + "epoch": 5.0, + "eval_steps": 500, + "global_step": 8530, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005863383172090296, + "grad_norm": 314.3364817194584, + "learning_rate": 1.7584994138335285e-07, + "loss": 21.1610107421875, + "step": 1, + "token_acc": 0.0073373194124074295 + }, + { + "epoch": 0.0011726766344180592, + "grad_norm": 312.62903799335515, + "learning_rate": 3.516998827667057e-07, + "loss": 21.174728393554688, + "step": 2, + "token_acc": 0.007801702527263515 + }, + { + "epoch": 0.001759014951627089, + "grad_norm": 314.0437118110648, + "learning_rate": 5.275498241500586e-07, + "loss": 21.19637107849121, + "step": 3, + "token_acc": 0.007983309100539968 + }, + { + "epoch": 0.0023453532688361184, + "grad_norm": 316.275601610318, + "learning_rate": 7.033997655334114e-07, + "loss": 21.180936813354492, + "step": 4, + "token_acc": 0.008063862867296412 + }, + { + "epoch": 0.002931691586045148, + "grad_norm": 314.08093038743345, + "learning_rate": 8.792497069167643e-07, + "loss": 21.104143142700195, + "step": 5, + "token_acc": 0.007759480832185766 + }, + { + "epoch": 0.003518029903254178, + "grad_norm": 312.4682647769296, + "learning_rate": 1.0550996483001172e-06, + "loss": 21.098918914794922, + "step": 6, + "token_acc": 0.007825808905820153 + }, + { + "epoch": 0.004104368220463207, + "grad_norm": 309.28568466174437, + "learning_rate": 1.2309495896834702e-06, + "loss": 20.95001220703125, + "step": 7, + "token_acc": 0.007920341602319884 + }, + { + "epoch": 0.004690706537672237, + "grad_norm": 308.1929230830652, + "learning_rate": 1.4067995310668228e-06, + "loss": 20.90537452697754, + "step": 8, + "token_acc": 0.007910641394804291 + }, + { + "epoch": 0.005277044854881266, + "grad_norm": 284.315801687854, + "learning_rate": 1.5826494724501758e-06, + "loss": 20.338947296142578, + "step": 9, + "token_acc": 0.008087204886554487 + }, + { + "epoch": 0.005863383172090296, + "grad_norm": 275.5431419082845, + "learning_rate": 1.7584994138335286e-06, + "loss": 20.183818817138672, + "step": 10, + "token_acc": 0.007973904505471146 + }, + { + "epoch": 0.006449721489299325, + "grad_norm": 139.3389914655089, + "learning_rate": 1.9343493552168814e-06, + "loss": 18.946557998657227, + "step": 11, + "token_acc": 0.008094707025011912 + }, + { + "epoch": 0.007036059806508356, + "grad_norm": 132.4378913844216, + "learning_rate": 2.1101992966002344e-06, + "loss": 18.853530883789062, + "step": 12, + "token_acc": 0.008527825313519165 + }, + { + "epoch": 0.007622398123717385, + "grad_norm": 121.25647774247827, + "learning_rate": 2.286049237983587e-06, + "loss": 18.666290283203125, + "step": 13, + "token_acc": 0.007512774624691704 + }, + { + "epoch": 0.008208736440926415, + "grad_norm": 116.29502434897812, + "learning_rate": 2.4618991793669404e-06, + "loss": 18.575685501098633, + "step": 14, + "token_acc": 0.007071640011564332 + }, + { + "epoch": 0.008795074758135445, + "grad_norm": 105.87930744716077, + "learning_rate": 2.637749120750293e-06, + "loss": 17.826993942260742, + "step": 15, + "token_acc": 0.0073926671608984616 + }, + { + "epoch": 0.009381413075344474, + "grad_norm": 107.22465537666224, + "learning_rate": 2.8135990621336455e-06, + "loss": 17.77083969116211, + "step": 16, + "token_acc": 0.007343478704933102 + }, + { + "epoch": 0.009967751392553504, + "grad_norm": 104.42431797549932, + "learning_rate": 2.9894490035169985e-06, + "loss": 17.671676635742188, + "step": 17, + "token_acc": 0.007584964061656072 + }, + { + "epoch": 0.010554089709762533, + "grad_norm": 97.1517823913138, + "learning_rate": 3.1652989449003515e-06, + "loss": 17.443300247192383, + "step": 18, + "token_acc": 0.008030187308725584 + }, + { + "epoch": 0.011140428026971563, + "grad_norm": 92.92248978237245, + "learning_rate": 3.3411488862837045e-06, + "loss": 17.25950813293457, + "step": 19, + "token_acc": 0.007852151242614505 + }, + { + "epoch": 0.011726766344180592, + "grad_norm": 90.75908176544272, + "learning_rate": 3.516998827667057e-06, + "loss": 17.083738327026367, + "step": 20, + "token_acc": 0.008399767699699065 + }, + { + "epoch": 0.012313104661389622, + "grad_norm": 89.40671696025731, + "learning_rate": 3.6928487690504097e-06, + "loss": 16.089990615844727, + "step": 21, + "token_acc": 0.008510286705775428 + }, + { + "epoch": 0.01289944297859865, + "grad_norm": 86.40882487547461, + "learning_rate": 3.868698710433763e-06, + "loss": 15.813337326049805, + "step": 22, + "token_acc": 0.007770580466637318 + }, + { + "epoch": 0.013485781295807681, + "grad_norm": 80.45076228251688, + "learning_rate": 4.044548651817115e-06, + "loss": 15.548316955566406, + "step": 23, + "token_acc": 0.007433676784055582 + }, + { + "epoch": 0.014072119613016711, + "grad_norm": 74.73025400426289, + "learning_rate": 4.220398593200469e-06, + "loss": 15.295755386352539, + "step": 24, + "token_acc": 0.00691768826619965 + }, + { + "epoch": 0.01465845793022574, + "grad_norm": 67.36641983888083, + "learning_rate": 4.396248534583821e-06, + "loss": 14.949100494384766, + "step": 25, + "token_acc": 0.007036510698205677 + }, + { + "epoch": 0.01524479624743477, + "grad_norm": 62.37981132861096, + "learning_rate": 4.572098475967174e-06, + "loss": 14.670554161071777, + "step": 26, + "token_acc": 0.006551926712826025 + }, + { + "epoch": 0.0158311345646438, + "grad_norm": 58.27214205890374, + "learning_rate": 4.7479484173505265e-06, + "loss": 14.401126861572266, + "step": 27, + "token_acc": 0.006652412715889741 + }, + { + "epoch": 0.01641747288185283, + "grad_norm": 50.65676588449631, + "learning_rate": 4.923798358733881e-06, + "loss": 13.930343627929688, + "step": 28, + "token_acc": 0.0062067179789200054 + }, + { + "epoch": 0.017003811199061858, + "grad_norm": 44.302864497754314, + "learning_rate": 5.099648300117233e-06, + "loss": 13.473762512207031, + "step": 29, + "token_acc": 0.006410205259560116 + }, + { + "epoch": 0.01759014951627089, + "grad_norm": 37.42459128972729, + "learning_rate": 5.275498241500586e-06, + "loss": 13.176072120666504, + "step": 30, + "token_acc": 0.005870950610727562 + }, + { + "epoch": 0.01817648783347992, + "grad_norm": 30.528500032723407, + "learning_rate": 5.4513481828839385e-06, + "loss": 12.937468528747559, + "step": 31, + "token_acc": 0.005615265927283757 + }, + { + "epoch": 0.018762826150688947, + "grad_norm": 28.116763303453823, + "learning_rate": 5.627198124267291e-06, + "loss": 12.739139556884766, + "step": 32, + "token_acc": 0.006037305845732461 + }, + { + "epoch": 0.019349164467897976, + "grad_norm": 21.40897064271037, + "learning_rate": 5.8030480656506445e-06, + "loss": 12.558815956115723, + "step": 33, + "token_acc": 0.005980296496805262 + }, + { + "epoch": 0.019935502785107008, + "grad_norm": 18.852855583335938, + "learning_rate": 5.978898007033997e-06, + "loss": 12.391048431396484, + "step": 34, + "token_acc": 0.006346509549639421 + }, + { + "epoch": 0.020521841102316037, + "grad_norm": 14.823310534361715, + "learning_rate": 6.15474794841735e-06, + "loss": 12.246835708618164, + "step": 35, + "token_acc": 0.007395827547076982 + }, + { + "epoch": 0.021108179419525065, + "grad_norm": 22.27816060587619, + "learning_rate": 6.330597889800703e-06, + "loss": 12.179847717285156, + "step": 36, + "token_acc": 0.007873791190272485 + }, + { + "epoch": 0.021694517736734097, + "grad_norm": 12.621183835541824, + "learning_rate": 6.506447831184056e-06, + "loss": 12.111846923828125, + "step": 37, + "token_acc": 0.008229866944553877 + }, + { + "epoch": 0.022280856053943126, + "grad_norm": 12.78972863393983, + "learning_rate": 6.682297772567409e-06, + "loss": 12.076262474060059, + "step": 38, + "token_acc": 0.008645978076269878 + }, + { + "epoch": 0.022867194371152155, + "grad_norm": 4.961409677636954, + "learning_rate": 6.858147713950762e-06, + "loss": 12.016976356506348, + "step": 39, + "token_acc": 0.009117272958534725 + }, + { + "epoch": 0.023453532688361183, + "grad_norm": 6.941489122321791, + "learning_rate": 7.033997655334114e-06, + "loss": 11.989545822143555, + "step": 40, + "token_acc": 0.009394958210124087 + }, + { + "epoch": 0.024039871005570215, + "grad_norm": 4.170593409239079, + "learning_rate": 7.209847596717467e-06, + "loss": 11.965530395507812, + "step": 41, + "token_acc": 0.00956037138569709 + }, + { + "epoch": 0.024626209322779244, + "grad_norm": 2.3209278853204203, + "learning_rate": 7.385697538100819e-06, + "loss": 11.943653106689453, + "step": 42, + "token_acc": 0.009718547282948487 + }, + { + "epoch": 0.025212547639988273, + "grad_norm": 1.5273919402872675, + "learning_rate": 7.561547479484174e-06, + "loss": 11.94092845916748, + "step": 43, + "token_acc": 0.008713464673327469 + }, + { + "epoch": 0.0257988859571973, + "grad_norm": 1.105332600457078, + "learning_rate": 7.737397420867525e-06, + "loss": 11.921232223510742, + "step": 44, + "token_acc": 0.00915239400365783 + }, + { + "epoch": 0.026385224274406333, + "grad_norm": 0.9484775593306607, + "learning_rate": 7.913247362250878e-06, + "loss": 11.907386779785156, + "step": 45, + "token_acc": 0.00945462625920596 + }, + { + "epoch": 0.026971562591615362, + "grad_norm": 0.7503746648125605, + "learning_rate": 8.08909730363423e-06, + "loss": 11.89285659790039, + "step": 46, + "token_acc": 0.009954296567402753 + }, + { + "epoch": 0.02755790090882439, + "grad_norm": 0.9308641772913042, + "learning_rate": 8.264947245017583e-06, + "loss": 11.886062622070312, + "step": 47, + "token_acc": 0.00974159406980461 + }, + { + "epoch": 0.028144239226033423, + "grad_norm": 0.7601436610741698, + "learning_rate": 8.440797186400937e-06, + "loss": 11.872444152832031, + "step": 48, + "token_acc": 0.010114053269194533 + }, + { + "epoch": 0.02873057754324245, + "grad_norm": 0.6319880135257516, + "learning_rate": 8.61664712778429e-06, + "loss": 11.867204666137695, + "step": 49, + "token_acc": 0.00965653692444547 + }, + { + "epoch": 0.02931691586045148, + "grad_norm": 0.628498674085733, + "learning_rate": 8.792497069167643e-06, + "loss": 11.849407196044922, + "step": 50, + "token_acc": 0.010177322843888137 + }, + { + "epoch": 0.02990325417766051, + "grad_norm": 0.7146005612349628, + "learning_rate": 8.968347010550995e-06, + "loss": 11.840921401977539, + "step": 51, + "token_acc": 0.00994983299427297 + }, + { + "epoch": 0.03048959249486954, + "grad_norm": 0.6985955833355848, + "learning_rate": 9.144196951934348e-06, + "loss": 11.820819854736328, + "step": 52, + "token_acc": 0.010665230272017236 + }, + { + "epoch": 0.03107593081207857, + "grad_norm": 0.7025370418351381, + "learning_rate": 9.320046893317702e-06, + "loss": 11.81194019317627, + "step": 53, + "token_acc": 0.010381088131673077 + }, + { + "epoch": 0.0316622691292876, + "grad_norm": 0.6873922833533456, + "learning_rate": 9.495896834701053e-06, + "loss": 11.805252075195312, + "step": 54, + "token_acc": 0.009833599549249058 + }, + { + "epoch": 0.03224860744649663, + "grad_norm": 0.6698310516097201, + "learning_rate": 9.671746776084405e-06, + "loss": 11.791781425476074, + "step": 55, + "token_acc": 0.009808991012105984 + }, + { + "epoch": 0.03283494576370566, + "grad_norm": 0.9373125049565595, + "learning_rate": 9.847596717467761e-06, + "loss": 11.773946762084961, + "step": 56, + "token_acc": 0.010166980877996229 + }, + { + "epoch": 0.03342128408091469, + "grad_norm": 0.8621283626238764, + "learning_rate": 1.0023446658851114e-05, + "loss": 11.760116577148438, + "step": 57, + "token_acc": 0.009926121865523092 + }, + { + "epoch": 0.034007622398123716, + "grad_norm": 0.8257835226977261, + "learning_rate": 1.0199296600234467e-05, + "loss": 11.747591972351074, + "step": 58, + "token_acc": 0.009541249531284511 + }, + { + "epoch": 0.034593960715332744, + "grad_norm": 1.0365097764198536, + "learning_rate": 1.037514654161782e-05, + "loss": 11.720434188842773, + "step": 59, + "token_acc": 0.010312098545578402 + }, + { + "epoch": 0.03518029903254178, + "grad_norm": 0.8725835366263507, + "learning_rate": 1.0550996483001172e-05, + "loss": 11.705840110778809, + "step": 60, + "token_acc": 0.009987452006606464 + }, + { + "epoch": 0.03576663734975081, + "grad_norm": 1.074544665508291, + "learning_rate": 1.0726846424384524e-05, + "loss": 11.683185577392578, + "step": 61, + "token_acc": 0.010081490959432815 + }, + { + "epoch": 0.03635297566695984, + "grad_norm": 0.9846162435087352, + "learning_rate": 1.0902696365767877e-05, + "loss": 11.665786743164062, + "step": 62, + "token_acc": 0.009682069716251167 + }, + { + "epoch": 0.036939313984168866, + "grad_norm": 1.3421678425032828, + "learning_rate": 1.107854630715123e-05, + "loss": 11.642001152038574, + "step": 63, + "token_acc": 0.009598904815762525 + }, + { + "epoch": 0.037525652301377894, + "grad_norm": 1.1774469227022621, + "learning_rate": 1.1254396248534582e-05, + "loss": 11.619728088378906, + "step": 64, + "token_acc": 0.009494669682006818 + }, + { + "epoch": 0.03811199061858692, + "grad_norm": 1.2998491921754616, + "learning_rate": 1.1430246189917935e-05, + "loss": 11.584670066833496, + "step": 65, + "token_acc": 0.009765726590864169 + }, + { + "epoch": 0.03869832893579595, + "grad_norm": 1.3331250444441887, + "learning_rate": 1.1606096131301289e-05, + "loss": 11.553152084350586, + "step": 66, + "token_acc": 0.009664113140836771 + }, + { + "epoch": 0.03928466725300499, + "grad_norm": 1.6226622150984584, + "learning_rate": 1.1781946072684642e-05, + "loss": 11.519734382629395, + "step": 67, + "token_acc": 0.009575511602198475 + }, + { + "epoch": 0.039871005570214016, + "grad_norm": 1.6983746926329137, + "learning_rate": 1.1957796014067994e-05, + "loss": 11.476907730102539, + "step": 68, + "token_acc": 0.009746186611111684 + }, + { + "epoch": 0.040457343887423045, + "grad_norm": 1.9643348599757893, + "learning_rate": 1.2133645955451347e-05, + "loss": 11.434260368347168, + "step": 69, + "token_acc": 0.01034937890285831 + }, + { + "epoch": 0.04104368220463207, + "grad_norm": 1.7242948284798472, + "learning_rate": 1.23094958968347e-05, + "loss": 11.389060020446777, + "step": 70, + "token_acc": 0.009599121765713144 + }, + { + "epoch": 0.0416300205218411, + "grad_norm": 2.5625702568153867, + "learning_rate": 1.2485345838218052e-05, + "loss": 11.341590881347656, + "step": 71, + "token_acc": 0.009460101329955669 + }, + { + "epoch": 0.04221635883905013, + "grad_norm": 3.350991324498559, + "learning_rate": 1.2661195779601406e-05, + "loss": 11.279542922973633, + "step": 72, + "token_acc": 0.010042107779976887 + }, + { + "epoch": 0.04280269715625916, + "grad_norm": 4.160970757348774, + "learning_rate": 1.2837045720984759e-05, + "loss": 11.229930877685547, + "step": 73, + "token_acc": 0.00965858873464549 + }, + { + "epoch": 0.043389035473468195, + "grad_norm": 2.9397270316351465, + "learning_rate": 1.3012895662368111e-05, + "loss": 11.172969818115234, + "step": 74, + "token_acc": 0.00988893140261154 + }, + { + "epoch": 0.04397537379067722, + "grad_norm": 4.680081569445933, + "learning_rate": 1.3188745603751464e-05, + "loss": 11.112567901611328, + "step": 75, + "token_acc": 0.010311249857864977 + }, + { + "epoch": 0.04456171210788625, + "grad_norm": 4.699299452058756, + "learning_rate": 1.3364595545134818e-05, + "loss": 11.063923835754395, + "step": 76, + "token_acc": 0.009845417740154582 + }, + { + "epoch": 0.04514805042509528, + "grad_norm": 5.538504792405696, + "learning_rate": 1.354044548651817e-05, + "loss": 11.009982109069824, + "step": 77, + "token_acc": 0.010230417527201116 + }, + { + "epoch": 0.04573438874230431, + "grad_norm": 6.241477072762899, + "learning_rate": 1.3716295427901523e-05, + "loss": 10.952494621276855, + "step": 78, + "token_acc": 0.009848835496833646 + }, + { + "epoch": 0.04632072705951334, + "grad_norm": 3.968667046539815, + "learning_rate": 1.3892145369284876e-05, + "loss": 10.884750366210938, + "step": 79, + "token_acc": 0.010374260292944979 + }, + { + "epoch": 0.046907065376722366, + "grad_norm": 4.08469338419477, + "learning_rate": 1.4067995310668228e-05, + "loss": 10.834638595581055, + "step": 80, + "token_acc": 0.01003153216963363 + }, + { + "epoch": 0.047493403693931395, + "grad_norm": 6.6259636972686575, + "learning_rate": 1.4243845252051581e-05, + "loss": 10.7899169921875, + "step": 81, + "token_acc": 0.010143984210553942 + }, + { + "epoch": 0.04807974201114043, + "grad_norm": 8.644032921976954, + "learning_rate": 1.4419695193434934e-05, + "loss": 10.74351692199707, + "step": 82, + "token_acc": 0.009235846796731522 + }, + { + "epoch": 0.04866608032834946, + "grad_norm": 4.081780657697027, + "learning_rate": 1.4595545134818286e-05, + "loss": 10.667047500610352, + "step": 83, + "token_acc": 0.009777380678504742 + }, + { + "epoch": 0.04925241864555849, + "grad_norm": 11.90175869423382, + "learning_rate": 1.4771395076201639e-05, + "loss": 10.624881744384766, + "step": 84, + "token_acc": 0.00937833267980549 + }, + { + "epoch": 0.049838756962767516, + "grad_norm": 7.994709891197614, + "learning_rate": 1.4947245017584991e-05, + "loss": 10.56396484375, + "step": 85, + "token_acc": 0.00981186471434669 + }, + { + "epoch": 0.050425095279976545, + "grad_norm": 8.543642313368, + "learning_rate": 1.5123094958968347e-05, + "loss": 10.482460021972656, + "step": 86, + "token_acc": 0.008725180999740156 + }, + { + "epoch": 0.051011433597185574, + "grad_norm": 7.198160934436331, + "learning_rate": 1.52989449003517e-05, + "loss": 10.4072265625, + "step": 87, + "token_acc": 0.009421692097833329 + }, + { + "epoch": 0.0515977719143946, + "grad_norm": 6.920251688037826, + "learning_rate": 1.547479484173505e-05, + "loss": 10.33627700805664, + "step": 88, + "token_acc": 0.00978225640119037 + }, + { + "epoch": 0.05218411023160364, + "grad_norm": 11.391122689653942, + "learning_rate": 1.5650644783118405e-05, + "loss": 10.275107383728027, + "step": 89, + "token_acc": 0.009228338721805463 + }, + { + "epoch": 0.052770448548812667, + "grad_norm": 6.991426182765631, + "learning_rate": 1.5826494724501756e-05, + "loss": 10.193984985351562, + "step": 90, + "token_acc": 0.009725028527649565 + }, + { + "epoch": 0.053356786866021695, + "grad_norm": 4.3113444626016895, + "learning_rate": 1.600234466588511e-05, + "loss": 10.132678985595703, + "step": 91, + "token_acc": 0.010158740210668524 + }, + { + "epoch": 0.053943125183230724, + "grad_norm": 11.940257207318611, + "learning_rate": 1.617819460726846e-05, + "loss": 10.072431564331055, + "step": 92, + "token_acc": 0.009815266901281929 + }, + { + "epoch": 0.05452946350043975, + "grad_norm": 9.681323523382128, + "learning_rate": 1.6354044548651815e-05, + "loss": 10.024391174316406, + "step": 93, + "token_acc": 0.009961478678110428 + }, + { + "epoch": 0.05511580181764878, + "grad_norm": 4.861946877215656, + "learning_rate": 1.6529894490035166e-05, + "loss": 9.97166919708252, + "step": 94, + "token_acc": 0.009475612488303746 + }, + { + "epoch": 0.05570214013485781, + "grad_norm": 4.3772589334491965, + "learning_rate": 1.670574443141852e-05, + "loss": 9.933237075805664, + "step": 95, + "token_acc": 0.01003043354640326 + }, + { + "epoch": 0.056288478452066845, + "grad_norm": 14.964595161107606, + "learning_rate": 1.6881594372801875e-05, + "loss": 9.862251281738281, + "step": 96, + "token_acc": 0.01016556857484778 + }, + { + "epoch": 0.056874816769275874, + "grad_norm": 4.65280285736792, + "learning_rate": 1.705744431418523e-05, + "loss": 9.84132194519043, + "step": 97, + "token_acc": 0.009567075299070265 + }, + { + "epoch": 0.0574611550864849, + "grad_norm": 14.368870071890083, + "learning_rate": 1.723329425556858e-05, + "loss": 9.78306770324707, + "step": 98, + "token_acc": 0.009784511204843476 + }, + { + "epoch": 0.05804749340369393, + "grad_norm": 12.283347945856411, + "learning_rate": 1.7409144196951934e-05, + "loss": 9.761861801147461, + "step": 99, + "token_acc": 0.009183320907057518 + }, + { + "epoch": 0.05863383172090296, + "grad_norm": 6.289929174215331, + "learning_rate": 1.7584994138335285e-05, + "loss": 9.696324348449707, + "step": 100, + "token_acc": 0.009611648446278735 + }, + { + "epoch": 0.05922017003811199, + "grad_norm": 35.78537698520067, + "learning_rate": 1.776084407971864e-05, + "loss": 9.738545417785645, + "step": 101, + "token_acc": 0.009845722446886983 + }, + { + "epoch": 0.05980650835532102, + "grad_norm": 23.291932077851612, + "learning_rate": 1.793669402110199e-05, + "loss": 9.63571548461914, + "step": 102, + "token_acc": 0.009211965975727201 + }, + { + "epoch": 0.06039284667253005, + "grad_norm": 15.106667317942211, + "learning_rate": 1.8112543962485345e-05, + "loss": 9.648144721984863, + "step": 103, + "token_acc": 0.009499501927771785 + }, + { + "epoch": 0.06097918498973908, + "grad_norm": 14.292865410024833, + "learning_rate": 1.8288393903868696e-05, + "loss": 9.568429946899414, + "step": 104, + "token_acc": 0.009660574412532636 + }, + { + "epoch": 0.06156552330694811, + "grad_norm": 8.461322653026425, + "learning_rate": 1.846424384525205e-05, + "loss": 9.560866355895996, + "step": 105, + "token_acc": 0.00936985872091505 + }, + { + "epoch": 0.06215186162415714, + "grad_norm": 4.292496271157349, + "learning_rate": 1.8640093786635404e-05, + "loss": 9.511852264404297, + "step": 106, + "token_acc": 0.010042017917074073 + }, + { + "epoch": 0.06273819994136617, + "grad_norm": 14.131710084857392, + "learning_rate": 1.8815943728018755e-05, + "loss": 9.496458053588867, + "step": 107, + "token_acc": 0.009481680473185036 + }, + { + "epoch": 0.0633245382585752, + "grad_norm": 12.579733144874412, + "learning_rate": 1.8991793669402106e-05, + "loss": 9.469629287719727, + "step": 108, + "token_acc": 0.009342529673495926 + }, + { + "epoch": 0.06391087657578423, + "grad_norm": 6.298168771686222, + "learning_rate": 1.916764361078546e-05, + "loss": 9.43764877319336, + "step": 109, + "token_acc": 0.009872353553661641 + }, + { + "epoch": 0.06449721489299326, + "grad_norm": 6.871301089559754, + "learning_rate": 1.934349355216881e-05, + "loss": 9.39526081085205, + "step": 110, + "token_acc": 0.010972125821086198 + }, + { + "epoch": 0.06508355321020229, + "grad_norm": 8.538197061328741, + "learning_rate": 1.951934349355217e-05, + "loss": 9.375219345092773, + "step": 111, + "token_acc": 0.011830798349507945 + }, + { + "epoch": 0.06566989152741132, + "grad_norm": 6.714641229555084, + "learning_rate": 1.9695193434935523e-05, + "loss": 9.35539436340332, + "step": 112, + "token_acc": 0.011527377521613832 + }, + { + "epoch": 0.06625622984462035, + "grad_norm": 11.079604572182392, + "learning_rate": 1.9871043376318874e-05, + "loss": 9.405292510986328, + "step": 113, + "token_acc": 0.01187268776480163 + }, + { + "epoch": 0.06684256816182937, + "grad_norm": 10.866949986340893, + "learning_rate": 2.0046893317702228e-05, + "loss": 9.310361862182617, + "step": 114, + "token_acc": 0.011660611129989283 + }, + { + "epoch": 0.0674289064790384, + "grad_norm": 5.631543407563885, + "learning_rate": 2.022274325908558e-05, + "loss": 9.294713020324707, + "step": 115, + "token_acc": 0.011267989449274766 + }, + { + "epoch": 0.06801524479624743, + "grad_norm": 4.298235727891895, + "learning_rate": 2.0398593200468933e-05, + "loss": 9.283777236938477, + "step": 116, + "token_acc": 0.011911465212838175 + }, + { + "epoch": 0.06860158311345646, + "grad_norm": 7.189953099216086, + "learning_rate": 2.0574443141852284e-05, + "loss": 9.287104606628418, + "step": 117, + "token_acc": 0.012719671956544894 + }, + { + "epoch": 0.06918792143066549, + "grad_norm": 4.938561701190064, + "learning_rate": 2.075029308323564e-05, + "loss": 9.19372272491455, + "step": 118, + "token_acc": 0.012120262190280664 + }, + { + "epoch": 0.06977425974787452, + "grad_norm": 5.582734578739099, + "learning_rate": 2.092614302461899e-05, + "loss": 9.248631477355957, + "step": 119, + "token_acc": 0.01245375783353619 + }, + { + "epoch": 0.07036059806508356, + "grad_norm": 5.747290575446226, + "learning_rate": 2.1101992966002344e-05, + "loss": 9.195615768432617, + "step": 120, + "token_acc": 0.011600977541594821 + }, + { + "epoch": 0.07094693638229259, + "grad_norm": 2.153343444667456, + "learning_rate": 2.1277842907385698e-05, + "loss": 9.178709030151367, + "step": 121, + "token_acc": 0.011623717839846424 + }, + { + "epoch": 0.07153327469950162, + "grad_norm": 2.6292161448695706, + "learning_rate": 2.145369284876905e-05, + "loss": 9.207082748413086, + "step": 122, + "token_acc": 0.012685543089992982 + }, + { + "epoch": 0.07211961301671065, + "grad_norm": 3.1307465945296826, + "learning_rate": 2.1629542790152403e-05, + "loss": 9.16843318939209, + "step": 123, + "token_acc": 0.01121300853208481 + }, + { + "epoch": 0.07270595133391967, + "grad_norm": 1.743800307119476, + "learning_rate": 2.1805392731535754e-05, + "loss": 9.134980201721191, + "step": 124, + "token_acc": 0.011759137608839346 + }, + { + "epoch": 0.0732922896511287, + "grad_norm": 6.051628470549426, + "learning_rate": 2.1981242672919108e-05, + "loss": 9.122943878173828, + "step": 125, + "token_acc": 0.011672197221885172 + }, + { + "epoch": 0.07387862796833773, + "grad_norm": 3.288730122021578, + "learning_rate": 2.215709261430246e-05, + "loss": 9.163206100463867, + "step": 126, + "token_acc": 0.01227240259302686 + }, + { + "epoch": 0.07446496628554676, + "grad_norm": 2.5453981083810584, + "learning_rate": 2.2332942555685813e-05, + "loss": 9.145343780517578, + "step": 127, + "token_acc": 0.011243629485630222 + }, + { + "epoch": 0.07505130460275579, + "grad_norm": 2.0002052789726297, + "learning_rate": 2.2508792497069164e-05, + "loss": 9.193897247314453, + "step": 128, + "token_acc": 0.01193651676083668 + }, + { + "epoch": 0.07563764291996482, + "grad_norm": 1.1792973472423087, + "learning_rate": 2.268464243845252e-05, + "loss": 9.138301849365234, + "step": 129, + "token_acc": 0.011933714892797067 + }, + { + "epoch": 0.07622398123717385, + "grad_norm": 1.615342470156613, + "learning_rate": 2.286049237983587e-05, + "loss": 9.069543838500977, + "step": 130, + "token_acc": 0.01162119458725517 + }, + { + "epoch": 0.07681031955438287, + "grad_norm": 1.2425419124387802, + "learning_rate": 2.3036342321219224e-05, + "loss": 9.160469055175781, + "step": 131, + "token_acc": 0.011514993959598123 + }, + { + "epoch": 0.0773966578715919, + "grad_norm": 1.1494220190885984, + "learning_rate": 2.3212192262602578e-05, + "loss": 9.09598159790039, + "step": 132, + "token_acc": 0.011932376180533798 + }, + { + "epoch": 0.07798299618880093, + "grad_norm": 1.0804073201496034, + "learning_rate": 2.338804220398593e-05, + "loss": 9.150649070739746, + "step": 133, + "token_acc": 0.011491540561662048 + }, + { + "epoch": 0.07856933450600997, + "grad_norm": 1.0158259606356494, + "learning_rate": 2.3563892145369283e-05, + "loss": 9.1259126663208, + "step": 134, + "token_acc": 0.011596265228679849 + }, + { + "epoch": 0.079155672823219, + "grad_norm": 1.1407371077285182, + "learning_rate": 2.3739742086752634e-05, + "loss": 9.168603897094727, + "step": 135, + "token_acc": 0.011914977458501088 + }, + { + "epoch": 0.07974201114042803, + "grad_norm": 1.3923991247388563, + "learning_rate": 2.3915592028135988e-05, + "loss": 9.018424987792969, + "step": 136, + "token_acc": 0.012036606417946918 + }, + { + "epoch": 0.08032834945763706, + "grad_norm": 0.866237784620666, + "learning_rate": 2.409144196951934e-05, + "loss": 9.09442138671875, + "step": 137, + "token_acc": 0.011247779705740424 + }, + { + "epoch": 0.08091468777484609, + "grad_norm": 1.133934093662852, + "learning_rate": 2.4267291910902693e-05, + "loss": 9.096515655517578, + "step": 138, + "token_acc": 0.011347487130256101 + }, + { + "epoch": 0.08150102609205512, + "grad_norm": 0.7484998749546738, + "learning_rate": 2.4443141852286044e-05, + "loss": 9.116138458251953, + "step": 139, + "token_acc": 0.011388677224704985 + }, + { + "epoch": 0.08208736440926415, + "grad_norm": 1.2316770385528586, + "learning_rate": 2.46189917936694e-05, + "loss": 9.02778148651123, + "step": 140, + "token_acc": 0.012339055793991416 + }, + { + "epoch": 0.08267370272647317, + "grad_norm": 0.8202402062747113, + "learning_rate": 2.4794841735052756e-05, + "loss": 9.107624053955078, + "step": 141, + "token_acc": 0.01242040491313653 + }, + { + "epoch": 0.0832600410436822, + "grad_norm": 0.763879798575539, + "learning_rate": 2.4970691676436104e-05, + "loss": 9.10135269165039, + "step": 142, + "token_acc": 0.010823608152419579 + }, + { + "epoch": 0.08384637936089123, + "grad_norm": 0.8725460797459627, + "learning_rate": 2.514654161781946e-05, + "loss": 9.088061332702637, + "step": 143, + "token_acc": 0.011498018818663035 + }, + { + "epoch": 0.08443271767810026, + "grad_norm": 0.8102709264413125, + "learning_rate": 2.5322391559202812e-05, + "loss": 9.01123046875, + "step": 144, + "token_acc": 0.011969924970549557 + }, + { + "epoch": 0.08501905599530929, + "grad_norm": 0.9890874533700922, + "learning_rate": 2.5498241500586167e-05, + "loss": 9.156105995178223, + "step": 145, + "token_acc": 0.01190416711588879 + }, + { + "epoch": 0.08560539431251832, + "grad_norm": 0.822954879412597, + "learning_rate": 2.5674091441969517e-05, + "loss": 9.027963638305664, + "step": 146, + "token_acc": 0.011730904214639666 + }, + { + "epoch": 0.08619173262972735, + "grad_norm": 0.7282515853418581, + "learning_rate": 2.5849941383352872e-05, + "loss": 9.03160285949707, + "step": 147, + "token_acc": 0.01175353477542632 + }, + { + "epoch": 0.08677807094693639, + "grad_norm": 0.8368857688504768, + "learning_rate": 2.6025791324736223e-05, + "loss": 9.071564674377441, + "step": 148, + "token_acc": 0.010754451413656923 + }, + { + "epoch": 0.08736440926414542, + "grad_norm": 0.6278830429440386, + "learning_rate": 2.6201641266119577e-05, + "loss": 9.053972244262695, + "step": 149, + "token_acc": 0.011633656390825269 + }, + { + "epoch": 0.08795074758135445, + "grad_norm": 0.6397821727954456, + "learning_rate": 2.6377491207502928e-05, + "loss": 9.06360149383545, + "step": 150, + "token_acc": 0.011848974862455198 + }, + { + "epoch": 0.08853708589856348, + "grad_norm": 0.6004148577981634, + "learning_rate": 2.6553341148886282e-05, + "loss": 9.099259376525879, + "step": 151, + "token_acc": 0.013133470907262038 + }, + { + "epoch": 0.0891234242157725, + "grad_norm": 0.6865229704240047, + "learning_rate": 2.6729191090269636e-05, + "loss": 9.064840316772461, + "step": 152, + "token_acc": 0.013405342600970635 + }, + { + "epoch": 0.08970976253298153, + "grad_norm": 0.7632232621520254, + "learning_rate": 2.6905041031652987e-05, + "loss": 9.049582481384277, + "step": 153, + "token_acc": 0.013849242159155785 + }, + { + "epoch": 0.09029610085019056, + "grad_norm": 1.2343339843541559, + "learning_rate": 2.708089097303634e-05, + "loss": 9.073572158813477, + "step": 154, + "token_acc": 0.014364270067302789 + }, + { + "epoch": 0.09088243916739959, + "grad_norm": 2.4046900523131223, + "learning_rate": 2.7256740914419692e-05, + "loss": 9.104389190673828, + "step": 155, + "token_acc": 0.013243885722438934 + }, + { + "epoch": 0.09146877748460862, + "grad_norm": 0.7626259540919172, + "learning_rate": 2.7432590855803047e-05, + "loss": 9.041627883911133, + "step": 156, + "token_acc": 0.01367964734120157 + }, + { + "epoch": 0.09205511580181765, + "grad_norm": 1.0603069789698447, + "learning_rate": 2.7608440797186398e-05, + "loss": 9.078733444213867, + "step": 157, + "token_acc": 0.014265893766521792 + }, + { + "epoch": 0.09264145411902668, + "grad_norm": 2.7869681246712026, + "learning_rate": 2.7784290738569752e-05, + "loss": 9.044357299804688, + "step": 158, + "token_acc": 0.013175206298416975 + }, + { + "epoch": 0.0932277924362357, + "grad_norm": 1.0510916482917174, + "learning_rate": 2.7960140679953103e-05, + "loss": 9.011998176574707, + "step": 159, + "token_acc": 0.013896452113121632 + }, + { + "epoch": 0.09381413075344473, + "grad_norm": 1.7118843563336834, + "learning_rate": 2.8135990621336457e-05, + "loss": 9.12320327758789, + "step": 160, + "token_acc": 0.013295854970914525 + }, + { + "epoch": 0.09440046907065376, + "grad_norm": 1.5244266222430018, + "learning_rate": 2.831184056271981e-05, + "loss": 9.09963321685791, + "step": 161, + "token_acc": 0.0138716387466804 + }, + { + "epoch": 0.09498680738786279, + "grad_norm": 1.290101654649091, + "learning_rate": 2.8487690504103162e-05, + "loss": 9.068549156188965, + "step": 162, + "token_acc": 0.01399306481728461 + }, + { + "epoch": 0.09557314570507183, + "grad_norm": 0.9054428849915294, + "learning_rate": 2.8663540445486516e-05, + "loss": 9.050338745117188, + "step": 163, + "token_acc": 0.012928927807758884 + }, + { + "epoch": 0.09615948402228086, + "grad_norm": 0.7724962200914788, + "learning_rate": 2.8839390386869867e-05, + "loss": 9.063183784484863, + "step": 164, + "token_acc": 0.012395917789531936 + }, + { + "epoch": 0.09674582233948989, + "grad_norm": 1.0176998059967046, + "learning_rate": 2.901524032825322e-05, + "loss": 9.074644088745117, + "step": 165, + "token_acc": 0.01415834091881162 + }, + { + "epoch": 0.09733216065669892, + "grad_norm": 7.90819084770369, + "learning_rate": 2.9191090269636572e-05, + "loss": 9.12739372253418, + "step": 166, + "token_acc": 0.013206636782467937 + }, + { + "epoch": 0.09791849897390795, + "grad_norm": 3.423448470393187, + "learning_rate": 2.9366940211019927e-05, + "loss": 9.009647369384766, + "step": 167, + "token_acc": 0.01391107680709375 + }, + { + "epoch": 0.09850483729111698, + "grad_norm": 1.6457841087557514, + "learning_rate": 2.9542790152403278e-05, + "loss": 9.003883361816406, + "step": 168, + "token_acc": 0.0134522029454538 + }, + { + "epoch": 0.099091175608326, + "grad_norm": 2.198473395726332, + "learning_rate": 2.9718640093786632e-05, + "loss": 8.992986679077148, + "step": 169, + "token_acc": 0.014436789322999822 + }, + { + "epoch": 0.09967751392553503, + "grad_norm": 1.1970813445148931, + "learning_rate": 2.9894490035169983e-05, + "loss": 9.024124145507812, + "step": 170, + "token_acc": 0.012878132201587086 + }, + { + "epoch": 0.10026385224274406, + "grad_norm": 1.3429372910828548, + "learning_rate": 3.0070339976553337e-05, + "loss": 9.00933837890625, + "step": 171, + "token_acc": 0.01384946771697629 + }, + { + "epoch": 0.10085019055995309, + "grad_norm": 1.495172842814261, + "learning_rate": 3.0246189917936695e-05, + "loss": 9.008771896362305, + "step": 172, + "token_acc": 0.01305237598834097 + }, + { + "epoch": 0.10143652887716212, + "grad_norm": 3.111441907901365, + "learning_rate": 3.0422039859320042e-05, + "loss": 9.036519050598145, + "step": 173, + "token_acc": 0.014005433773596254 + }, + { + "epoch": 0.10202286719437115, + "grad_norm": 4.205589247307397, + "learning_rate": 3.05978898007034e-05, + "loss": 8.98863410949707, + "step": 174, + "token_acc": 0.01340088550485345 + }, + { + "epoch": 0.10260920551158018, + "grad_norm": 0.9788653267005818, + "learning_rate": 3.077373974208675e-05, + "loss": 9.032859802246094, + "step": 175, + "token_acc": 0.013573408634849708 + }, + { + "epoch": 0.1031955438287892, + "grad_norm": 1.9078470133423255, + "learning_rate": 3.09495896834701e-05, + "loss": 9.032963752746582, + "step": 176, + "token_acc": 0.014906284454244762 + }, + { + "epoch": 0.10378188214599825, + "grad_norm": 4.835289045264976, + "learning_rate": 3.112543962485345e-05, + "loss": 9.001845359802246, + "step": 177, + "token_acc": 0.01326270955625822 + }, + { + "epoch": 0.10436822046320728, + "grad_norm": 2.640550938937187, + "learning_rate": 3.130128956623681e-05, + "loss": 8.940900802612305, + "step": 178, + "token_acc": 0.014488685382291665 + }, + { + "epoch": 0.1049545587804163, + "grad_norm": 14.014989868496693, + "learning_rate": 3.147713950762016e-05, + "loss": 9.075488090515137, + "step": 179, + "token_acc": 0.014488467594862496 + }, + { + "epoch": 0.10554089709762533, + "grad_norm": 12.789330347693321, + "learning_rate": 3.165298944900351e-05, + "loss": 9.076740264892578, + "step": 180, + "token_acc": 0.014073287307488051 + }, + { + "epoch": 0.10612723541483436, + "grad_norm": 1.2837946677339216, + "learning_rate": 3.182883939038687e-05, + "loss": 9.014093399047852, + "step": 181, + "token_acc": 0.013479452603939608 + }, + { + "epoch": 0.10671357373204339, + "grad_norm": 3.4257055015124567, + "learning_rate": 3.200468933177022e-05, + "loss": 8.930099487304688, + "step": 182, + "token_acc": 0.0146207031420874 + }, + { + "epoch": 0.10729991204925242, + "grad_norm": 3.1713835244548565, + "learning_rate": 3.218053927315357e-05, + "loss": 9.024892807006836, + "step": 183, + "token_acc": 0.014562824791921922 + }, + { + "epoch": 0.10788625036646145, + "grad_norm": 2.588075444088043, + "learning_rate": 3.235638921453692e-05, + "loss": 8.943979263305664, + "step": 184, + "token_acc": 0.015625941420402006 + }, + { + "epoch": 0.10847258868367048, + "grad_norm": 2.0213650702088932, + "learning_rate": 3.253223915592028e-05, + "loss": 8.963343620300293, + "step": 185, + "token_acc": 0.014867770427393616 + }, + { + "epoch": 0.1090589270008795, + "grad_norm": 3.1662236379348765, + "learning_rate": 3.270808909730363e-05, + "loss": 8.966333389282227, + "step": 186, + "token_acc": 0.01482275064500185 + }, + { + "epoch": 0.10964526531808853, + "grad_norm": 0.8511160102277255, + "learning_rate": 3.288393903868698e-05, + "loss": 8.944194793701172, + "step": 187, + "token_acc": 0.014568748124071655 + }, + { + "epoch": 0.11023160363529756, + "grad_norm": 1.8901798550851767, + "learning_rate": 3.305978898007033e-05, + "loss": 8.878313064575195, + "step": 188, + "token_acc": 0.016518586737219787 + }, + { + "epoch": 0.11081794195250659, + "grad_norm": 0.6449265028125587, + "learning_rate": 3.323563892145369e-05, + "loss": 8.910615921020508, + "step": 189, + "token_acc": 0.016138486579772928 + }, + { + "epoch": 0.11140428026971562, + "grad_norm": 1.2665135816480648, + "learning_rate": 3.341148886283704e-05, + "loss": 8.943984985351562, + "step": 190, + "token_acc": 0.015834434958471803 + }, + { + "epoch": 0.11199061858692466, + "grad_norm": 3.125132540955922, + "learning_rate": 3.35873388042204e-05, + "loss": 8.913708686828613, + "step": 191, + "token_acc": 0.01523594838495425 + }, + { + "epoch": 0.11257695690413369, + "grad_norm": 2.8226529772726283, + "learning_rate": 3.376318874560375e-05, + "loss": 8.998905181884766, + "step": 192, + "token_acc": 0.0156307628373289 + }, + { + "epoch": 0.11316329522134272, + "grad_norm": 6.25845402660554, + "learning_rate": 3.39390386869871e-05, + "loss": 8.950105667114258, + "step": 193, + "token_acc": 0.015614232481913702 + }, + { + "epoch": 0.11374963353855175, + "grad_norm": 6.9238518423791975, + "learning_rate": 3.411488862837046e-05, + "loss": 8.931921005249023, + "step": 194, + "token_acc": 0.01613472933932224 + }, + { + "epoch": 0.11433597185576078, + "grad_norm": 0.964011993194305, + "learning_rate": 3.429073856975381e-05, + "loss": 8.915523529052734, + "step": 195, + "token_acc": 0.01629271261858786 + }, + { + "epoch": 0.1149223101729698, + "grad_norm": 2.1872053944233167, + "learning_rate": 3.446658851113716e-05, + "loss": 8.878673553466797, + "step": 196, + "token_acc": 0.016522522475200457 + }, + { + "epoch": 0.11550864849017883, + "grad_norm": 0.9929085286069286, + "learning_rate": 3.464243845252051e-05, + "loss": 8.950199127197266, + "step": 197, + "token_acc": 0.016348160576175472 + }, + { + "epoch": 0.11609498680738786, + "grad_norm": 2.0389524983194645, + "learning_rate": 3.481828839390387e-05, + "loss": 8.845256805419922, + "step": 198, + "token_acc": 0.017225800810283017 + }, + { + "epoch": 0.11668132512459689, + "grad_norm": 2.5318152346551335, + "learning_rate": 3.499413833528722e-05, + "loss": 8.826522827148438, + "step": 199, + "token_acc": 0.016965894507602764 + }, + { + "epoch": 0.11726766344180592, + "grad_norm": 1.3147557263545948, + "learning_rate": 3.516998827667057e-05, + "loss": 8.790542602539062, + "step": 200, + "token_acc": 0.015927815029847085 + }, + { + "epoch": 0.11785400175901495, + "grad_norm": 1.0529821309507168, + "learning_rate": 3.534583821805393e-05, + "loss": 8.906332015991211, + "step": 201, + "token_acc": 0.016561514195583597 + }, + { + "epoch": 0.11844034007622398, + "grad_norm": 2.198041382203583, + "learning_rate": 3.552168815943728e-05, + "loss": 8.760419845581055, + "step": 202, + "token_acc": 0.01693612737038842 + }, + { + "epoch": 0.119026678393433, + "grad_norm": 1.8636648000552203, + "learning_rate": 3.569753810082063e-05, + "loss": 8.882649421691895, + "step": 203, + "token_acc": 0.016757771881621718 + }, + { + "epoch": 0.11961301671064203, + "grad_norm": 2.334628036106764, + "learning_rate": 3.587338804220398e-05, + "loss": 8.732261657714844, + "step": 204, + "token_acc": 0.01689587109335083 + }, + { + "epoch": 0.12019935502785108, + "grad_norm": 3.6417434376433997, + "learning_rate": 3.604923798358734e-05, + "loss": 8.859725952148438, + "step": 205, + "token_acc": 0.016732605176935728 + }, + { + "epoch": 0.1207856933450601, + "grad_norm": 1.1461520465739654, + "learning_rate": 3.622508792497069e-05, + "loss": 8.789977073669434, + "step": 206, + "token_acc": 0.016978721576879307 + }, + { + "epoch": 0.12137203166226913, + "grad_norm": 5.478124128309508, + "learning_rate": 3.640093786635404e-05, + "loss": 8.8179292678833, + "step": 207, + "token_acc": 0.01666530404487685 + }, + { + "epoch": 0.12195836997947816, + "grad_norm": 1.9413287312440262, + "learning_rate": 3.657678780773739e-05, + "loss": 8.845304489135742, + "step": 208, + "token_acc": 0.01686858289122918 + }, + { + "epoch": 0.12254470829668719, + "grad_norm": 4.362775383731732, + "learning_rate": 3.675263774912075e-05, + "loss": 8.80113410949707, + "step": 209, + "token_acc": 0.016194827554091417 + }, + { + "epoch": 0.12313104661389622, + "grad_norm": 5.170439158455645, + "learning_rate": 3.69284876905041e-05, + "loss": 8.72726058959961, + "step": 210, + "token_acc": 0.016813022914523166 + }, + { + "epoch": 0.12371738493110525, + "grad_norm": 1.2544001304062333, + "learning_rate": 3.710433763188745e-05, + "loss": 8.8226957321167, + "step": 211, + "token_acc": 0.017576808370558257 + }, + { + "epoch": 0.12430372324831428, + "grad_norm": 4.064034908986902, + "learning_rate": 3.728018757327081e-05, + "loss": 8.691914558410645, + "step": 212, + "token_acc": 0.016029177630140615 + }, + { + "epoch": 0.1248900615655233, + "grad_norm": 5.66673725924405, + "learning_rate": 3.745603751465416e-05, + "loss": 8.654747009277344, + "step": 213, + "token_acc": 0.01710718629585339 + }, + { + "epoch": 0.12547639988273235, + "grad_norm": 1.8032447842916424, + "learning_rate": 3.763188745603751e-05, + "loss": 8.759397506713867, + "step": 214, + "token_acc": 0.016445591198048948 + }, + { + "epoch": 0.12606273819994138, + "grad_norm": 3.881359930935999, + "learning_rate": 3.780773739742087e-05, + "loss": 8.737679481506348, + "step": 215, + "token_acc": 0.015620007719264166 + }, + { + "epoch": 0.1266490765171504, + "grad_norm": 5.353254463820007, + "learning_rate": 3.798358733880421e-05, + "loss": 8.705495834350586, + "step": 216, + "token_acc": 0.017007621773868435 + }, + { + "epoch": 0.12723541483435943, + "grad_norm": 3.084177121198912, + "learning_rate": 3.815943728018757e-05, + "loss": 8.637994766235352, + "step": 217, + "token_acc": 0.01672063190892592 + }, + { + "epoch": 0.12782175315156846, + "grad_norm": 4.906960521179897, + "learning_rate": 3.833528722157092e-05, + "loss": 8.752494812011719, + "step": 218, + "token_acc": 0.016543936245969365 + }, + { + "epoch": 0.1284080914687775, + "grad_norm": 4.68498485188667, + "learning_rate": 3.851113716295428e-05, + "loss": 8.630697250366211, + "step": 219, + "token_acc": 0.01716079216489793 + }, + { + "epoch": 0.12899442978598652, + "grad_norm": 4.421641931399822, + "learning_rate": 3.868698710433762e-05, + "loss": 8.625170707702637, + "step": 220, + "token_acc": 0.01652241526557979 + }, + { + "epoch": 0.12958076810319555, + "grad_norm": 3.6243561487515494, + "learning_rate": 3.886283704572098e-05, + "loss": 8.68350601196289, + "step": 221, + "token_acc": 0.017743194418739387 + }, + { + "epoch": 0.13016710642040458, + "grad_norm": 6.731615648535092, + "learning_rate": 3.903868698710434e-05, + "loss": 8.61314868927002, + "step": 222, + "token_acc": 0.016826560453083856 + }, + { + "epoch": 0.1307534447376136, + "grad_norm": 3.8951372260537527, + "learning_rate": 3.921453692848769e-05, + "loss": 8.674982070922852, + "step": 223, + "token_acc": 0.01645953334735309 + }, + { + "epoch": 0.13133978305482263, + "grad_norm": 5.0161197585727555, + "learning_rate": 3.9390386869871046e-05, + "loss": 8.68167495727539, + "step": 224, + "token_acc": 0.016467335622679977 + }, + { + "epoch": 0.13192612137203166, + "grad_norm": 7.150299923498034, + "learning_rate": 3.956623681125439e-05, + "loss": 8.575281143188477, + "step": 225, + "token_acc": 0.016570919559520274 + }, + { + "epoch": 0.1325124596892407, + "grad_norm": 2.30292100939367, + "learning_rate": 3.974208675263775e-05, + "loss": 8.574335098266602, + "step": 226, + "token_acc": 0.017459028310447343 + }, + { + "epoch": 0.13309879800644972, + "grad_norm": 6.818015262946809, + "learning_rate": 3.99179366940211e-05, + "loss": 8.623686790466309, + "step": 227, + "token_acc": 0.016416210808624765 + }, + { + "epoch": 0.13368513632365875, + "grad_norm": 8.081805549191621, + "learning_rate": 4.0093786635404456e-05, + "loss": 8.6261625289917, + "step": 228, + "token_acc": 0.015715693644919895 + }, + { + "epoch": 0.13427147464086778, + "grad_norm": 2.9986313751668847, + "learning_rate": 4.02696365767878e-05, + "loss": 8.618754386901855, + "step": 229, + "token_acc": 0.016192745002832402 + }, + { + "epoch": 0.1348578129580768, + "grad_norm": 4.477099581502254, + "learning_rate": 4.044548651817116e-05, + "loss": 8.553512573242188, + "step": 230, + "token_acc": 0.0174617371039869 + }, + { + "epoch": 0.13544415127528583, + "grad_norm": 5.3811405918676245, + "learning_rate": 4.062133645955451e-05, + "loss": 8.630196571350098, + "step": 231, + "token_acc": 0.01684603615051477 + }, + { + "epoch": 0.13603048959249486, + "grad_norm": 2.3900266067121794, + "learning_rate": 4.0797186400937866e-05, + "loss": 8.560566902160645, + "step": 232, + "token_acc": 0.01697804930027705 + }, + { + "epoch": 0.1366168279097039, + "grad_norm": 3.2667017471281685, + "learning_rate": 4.097303634232122e-05, + "loss": 8.522208213806152, + "step": 233, + "token_acc": 0.017987175160310495 + }, + { + "epoch": 0.13720316622691292, + "grad_norm": 13.065247310607706, + "learning_rate": 4.114888628370457e-05, + "loss": 8.572076797485352, + "step": 234, + "token_acc": 0.01687565618236646 + }, + { + "epoch": 0.13778950454412195, + "grad_norm": 7.388370678270873, + "learning_rate": 4.132473622508792e-05, + "loss": 8.521028518676758, + "step": 235, + "token_acc": 0.01644102089325559 + }, + { + "epoch": 0.13837584286133098, + "grad_norm": 20.440602046224782, + "learning_rate": 4.150058616647128e-05, + "loss": 8.51113510131836, + "step": 236, + "token_acc": 0.017621761295818262 + }, + { + "epoch": 0.13896218117854, + "grad_norm": 18.541476919661964, + "learning_rate": 4.167643610785463e-05, + "loss": 8.62530517578125, + "step": 237, + "token_acc": 0.017359247213363224 + }, + { + "epoch": 0.13954851949574903, + "grad_norm": 2.5282089501337346, + "learning_rate": 4.185228604923798e-05, + "loss": 8.510663986206055, + "step": 238, + "token_acc": 0.019206657079544116 + }, + { + "epoch": 0.14013485781295806, + "grad_norm": 9.401277860536702, + "learning_rate": 4.202813599062133e-05, + "loss": 8.505946159362793, + "step": 239, + "token_acc": 0.018394363576213713 + }, + { + "epoch": 0.14072119613016712, + "grad_norm": 5.567415916578747, + "learning_rate": 4.220398593200469e-05, + "loss": 8.481810569763184, + "step": 240, + "token_acc": 0.017143819031710885 + }, + { + "epoch": 0.14130753444737615, + "grad_norm": 2.1675158960189513, + "learning_rate": 4.237983587338804e-05, + "loss": 8.493383407592773, + "step": 241, + "token_acc": 0.01822428262522767 + }, + { + "epoch": 0.14189387276458518, + "grad_norm": 8.929633214833405, + "learning_rate": 4.2555685814771396e-05, + "loss": 8.513069152832031, + "step": 242, + "token_acc": 0.017859501381529246 + }, + { + "epoch": 0.1424802110817942, + "grad_norm": 3.6897452741754173, + "learning_rate": 4.273153575615474e-05, + "loss": 8.453872680664062, + "step": 243, + "token_acc": 0.018653342204239025 + }, + { + "epoch": 0.14306654939900323, + "grad_norm": 4.625537063062409, + "learning_rate": 4.29073856975381e-05, + "loss": 8.47091293334961, + "step": 244, + "token_acc": 0.018603806974158858 + }, + { + "epoch": 0.14365288771621226, + "grad_norm": 7.870832093972414, + "learning_rate": 4.308323563892145e-05, + "loss": 8.564062118530273, + "step": 245, + "token_acc": 0.01740024713169218 + }, + { + "epoch": 0.1442392260334213, + "grad_norm": 4.259223115226154, + "learning_rate": 4.3259085580304806e-05, + "loss": 8.369973182678223, + "step": 246, + "token_acc": 0.01828073112807893 + }, + { + "epoch": 0.14482556435063032, + "grad_norm": 2.2867801139341593, + "learning_rate": 4.343493552168815e-05, + "loss": 8.349054336547852, + "step": 247, + "token_acc": 0.018252581127587885 + }, + { + "epoch": 0.14541190266783935, + "grad_norm": 11.668153759621728, + "learning_rate": 4.361078546307151e-05, + "loss": 8.439401626586914, + "step": 248, + "token_acc": 0.017820318677740622 + }, + { + "epoch": 0.14599824098504838, + "grad_norm": 2.1768025335983254, + "learning_rate": 4.378663540445486e-05, + "loss": 8.447366714477539, + "step": 249, + "token_acc": 0.018823249920810897 + }, + { + "epoch": 0.1465845793022574, + "grad_norm": 17.87671040443596, + "learning_rate": 4.3962485345838216e-05, + "loss": 8.423364639282227, + "step": 250, + "token_acc": 0.01875484276925797 + }, + { + "epoch": 0.14717091761946643, + "grad_norm": 14.547813900905627, + "learning_rate": 4.4138335287221574e-05, + "loss": 8.468502044677734, + "step": 251, + "token_acc": 0.01637811915792896 + }, + { + "epoch": 0.14775725593667546, + "grad_norm": 7.662344409516869, + "learning_rate": 4.431418522860492e-05, + "loss": 8.409443855285645, + "step": 252, + "token_acc": 0.018396897829317815 + }, + { + "epoch": 0.1483435942538845, + "grad_norm": 4.610852221268246, + "learning_rate": 4.449003516998827e-05, + "loss": 8.30703353881836, + "step": 253, + "token_acc": 0.018592773049497083 + }, + { + "epoch": 0.14892993257109352, + "grad_norm": 15.582364757774345, + "learning_rate": 4.466588511137163e-05, + "loss": 8.416754722595215, + "step": 254, + "token_acc": 0.01929136306630321 + }, + { + "epoch": 0.14951627088830255, + "grad_norm": 12.564851101593918, + "learning_rate": 4.4841735052754984e-05, + "loss": 8.483627319335938, + "step": 255, + "token_acc": 0.01938644858848068 + }, + { + "epoch": 0.15010260920551158, + "grad_norm": 8.015261268760222, + "learning_rate": 4.501758499413833e-05, + "loss": 8.404739379882812, + "step": 256, + "token_acc": 0.01865865619087942 + }, + { + "epoch": 0.1506889475227206, + "grad_norm": 7.364009400186939, + "learning_rate": 4.5193434935521686e-05, + "loss": 8.311761856079102, + "step": 257, + "token_acc": 0.019295205673469283 + }, + { + "epoch": 0.15127528583992964, + "grad_norm": 11.539319319363312, + "learning_rate": 4.536928487690504e-05, + "loss": 8.299919128417969, + "step": 258, + "token_acc": 0.02016530879220357 + }, + { + "epoch": 0.15186162415713866, + "grad_norm": 8.916585738074229, + "learning_rate": 4.5545134818288395e-05, + "loss": 8.352657318115234, + "step": 259, + "token_acc": 0.01924070080372876 + }, + { + "epoch": 0.1524479624743477, + "grad_norm": 14.799399472136862, + "learning_rate": 4.572098475967174e-05, + "loss": 8.296621322631836, + "step": 260, + "token_acc": 0.018318659471740984 + }, + { + "epoch": 0.15303430079155672, + "grad_norm": 12.52200798352578, + "learning_rate": 4.5896834701055096e-05, + "loss": 8.391961097717285, + "step": 261, + "token_acc": 0.019134395123230914 + }, + { + "epoch": 0.15362063910876575, + "grad_norm": 7.726866027566938, + "learning_rate": 4.607268464243845e-05, + "loss": 8.309759140014648, + "step": 262, + "token_acc": 0.018757194540371648 + }, + { + "epoch": 0.15420697742597478, + "grad_norm": 6.663272791005106, + "learning_rate": 4.6248534583821805e-05, + "loss": 8.346975326538086, + "step": 263, + "token_acc": 0.019628551090403158 + }, + { + "epoch": 0.1547933157431838, + "grad_norm": 8.849650123022013, + "learning_rate": 4.6424384525205156e-05, + "loss": 8.265969276428223, + "step": 264, + "token_acc": 0.01917901182204468 + }, + { + "epoch": 0.15537965406039284, + "grad_norm": 6.278597013879949, + "learning_rate": 4.660023446658851e-05, + "loss": 8.262683868408203, + "step": 265, + "token_acc": 0.020237367940757707 + }, + { + "epoch": 0.15596599237760186, + "grad_norm": 14.83723807809333, + "learning_rate": 4.677608440797186e-05, + "loss": 8.24969482421875, + "step": 266, + "token_acc": 0.02095503230969343 + }, + { + "epoch": 0.1565523306948109, + "grad_norm": 14.013194643054666, + "learning_rate": 4.6951934349355215e-05, + "loss": 8.28728199005127, + "step": 267, + "token_acc": 0.020356644541047995 + }, + { + "epoch": 0.15713866901201995, + "grad_norm": 3.299461201611789, + "learning_rate": 4.7127784290738566e-05, + "loss": 8.28223705291748, + "step": 268, + "token_acc": 0.01932153093571812 + }, + { + "epoch": 0.15772500732922898, + "grad_norm": 1.4370879623196615, + "learning_rate": 4.730363423212192e-05, + "loss": 8.154370307922363, + "step": 269, + "token_acc": 0.020513134692529394 + }, + { + "epoch": 0.158311345646438, + "grad_norm": 11.690469326957365, + "learning_rate": 4.747948417350527e-05, + "loss": 8.307836532592773, + "step": 270, + "token_acc": 0.0186597039960265 + }, + { + "epoch": 0.15889768396364704, + "grad_norm": 7.034490509655638, + "learning_rate": 4.7655334114888626e-05, + "loss": 8.156492233276367, + "step": 271, + "token_acc": 0.020116473273681313 + }, + { + "epoch": 0.15948402228085606, + "grad_norm": 8.610409267769095, + "learning_rate": 4.7831184056271977e-05, + "loss": 8.175551414489746, + "step": 272, + "token_acc": 0.020806123220426027 + }, + { + "epoch": 0.1600703605980651, + "grad_norm": 4.911789439993146, + "learning_rate": 4.8007033997655334e-05, + "loss": 8.205076217651367, + "step": 273, + "token_acc": 0.019922229874769194 + }, + { + "epoch": 0.16065669891527412, + "grad_norm": 7.221215383403958, + "learning_rate": 4.818288393903868e-05, + "loss": 8.238790512084961, + "step": 274, + "token_acc": 0.019895751451924324 + }, + { + "epoch": 0.16124303723248315, + "grad_norm": 4.435402344576878, + "learning_rate": 4.8358733880422036e-05, + "loss": 8.225730895996094, + "step": 275, + "token_acc": 0.020369095309631113 + }, + { + "epoch": 0.16182937554969218, + "grad_norm": 6.71909090750322, + "learning_rate": 4.853458382180539e-05, + "loss": 8.196975708007812, + "step": 276, + "token_acc": 0.02137586040254431 + }, + { + "epoch": 0.1624157138669012, + "grad_norm": 3.6340838143635477, + "learning_rate": 4.8710433763188744e-05, + "loss": 8.132326126098633, + "step": 277, + "token_acc": 0.02181880979874629 + }, + { + "epoch": 0.16300205218411024, + "grad_norm": 3.374835225665365, + "learning_rate": 4.888628370457209e-05, + "loss": 8.0545654296875, + "step": 278, + "token_acc": 0.021444238143077304 + }, + { + "epoch": 0.16358839050131926, + "grad_norm": 12.31289490424, + "learning_rate": 4.9062133645955446e-05, + "loss": 8.135686874389648, + "step": 279, + "token_acc": 0.02138458539874521 + }, + { + "epoch": 0.1641747288185283, + "grad_norm": 9.090867008689703, + "learning_rate": 4.92379835873388e-05, + "loss": 8.082733154296875, + "step": 280, + "token_acc": 0.021298596585837062 + }, + { + "epoch": 0.16476106713573732, + "grad_norm": 13.052408170393587, + "learning_rate": 4.9413833528722155e-05, + "loss": 8.079601287841797, + "step": 281, + "token_acc": 0.021224666252589126 + }, + { + "epoch": 0.16534740545294635, + "grad_norm": 11.778781185258065, + "learning_rate": 4.958968347010551e-05, + "loss": 8.064671516418457, + "step": 282, + "token_acc": 0.021834183530466955 + }, + { + "epoch": 0.16593374377015538, + "grad_norm": 6.78001123641741, + "learning_rate": 4.9765533411488857e-05, + "loss": 8.138179779052734, + "step": 283, + "token_acc": 0.0203531588851116 + }, + { + "epoch": 0.1665200820873644, + "grad_norm": 5.049128069374832, + "learning_rate": 4.994138335287221e-05, + "loss": 8.085536003112793, + "step": 284, + "token_acc": 0.022625034070080593 + }, + { + "epoch": 0.16710642040457344, + "grad_norm": 12.528670847425603, + "learning_rate": 5.0117233294255565e-05, + "loss": 8.023820877075195, + "step": 285, + "token_acc": 0.022043720754737036 + }, + { + "epoch": 0.16769275872178246, + "grad_norm": 10.784200550540923, + "learning_rate": 5.029308323563892e-05, + "loss": 8.108606338500977, + "step": 286, + "token_acc": 0.02092908479595299 + }, + { + "epoch": 0.1682790970389915, + "grad_norm": 8.126861750504569, + "learning_rate": 5.046893317702227e-05, + "loss": 7.985041618347168, + "step": 287, + "token_acc": 0.02380128160747117 + }, + { + "epoch": 0.16886543535620052, + "grad_norm": 6.561806312125951, + "learning_rate": 5.0644783118405625e-05, + "loss": 8.01936149597168, + "step": 288, + "token_acc": 0.0228410804370893 + }, + { + "epoch": 0.16945177367340955, + "grad_norm": 7.933481262999368, + "learning_rate": 5.0820633059788975e-05, + "loss": 8.02853012084961, + "step": 289, + "token_acc": 0.022237328872628023 + }, + { + "epoch": 0.17003811199061858, + "grad_norm": 4.366074984576223, + "learning_rate": 5.099648300117233e-05, + "loss": 7.944267749786377, + "step": 290, + "token_acc": 0.025122139500665743 + }, + { + "epoch": 0.1706244503078276, + "grad_norm": 11.692846671580012, + "learning_rate": 5.117233294255568e-05, + "loss": 7.985602378845215, + "step": 291, + "token_acc": 0.023169997823331328 + }, + { + "epoch": 0.17121078862503664, + "grad_norm": 13.074716716989984, + "learning_rate": 5.1348182883939035e-05, + "loss": 7.920740604400635, + "step": 292, + "token_acc": 0.024088712879455885 + }, + { + "epoch": 0.17179712694224566, + "grad_norm": 3.668348138002314, + "learning_rate": 5.1524032825322386e-05, + "loss": 7.948727607727051, + "step": 293, + "token_acc": 0.024311563090047828 + }, + { + "epoch": 0.1723834652594547, + "grad_norm": 3.6593951371713347, + "learning_rate": 5.1699882766705743e-05, + "loss": 7.921501159667969, + "step": 294, + "token_acc": 0.02397616188662831 + }, + { + "epoch": 0.17296980357666372, + "grad_norm": 2.3311775966786583, + "learning_rate": 5.1875732708089094e-05, + "loss": 7.8770036697387695, + "step": 295, + "token_acc": 0.02581199573513264 + }, + { + "epoch": 0.17355614189387278, + "grad_norm": 6.422375969880502, + "learning_rate": 5.2051582649472445e-05, + "loss": 7.884852409362793, + "step": 296, + "token_acc": 0.024368458420172687 + }, + { + "epoch": 0.1741424802110818, + "grad_norm": 5.166149248909966, + "learning_rate": 5.2227432590855796e-05, + "loss": 7.842950820922852, + "step": 297, + "token_acc": 0.024369453211405143 + }, + { + "epoch": 0.17472881852829084, + "grad_norm": 5.864410777685613, + "learning_rate": 5.2403282532239154e-05, + "loss": 7.882455825805664, + "step": 298, + "token_acc": 0.0243029159866849 + }, + { + "epoch": 0.17531515684549986, + "grad_norm": 4.555465681072166, + "learning_rate": 5.2579132473622505e-05, + "loss": 7.719220161437988, + "step": 299, + "token_acc": 0.026509222169598015 + }, + { + "epoch": 0.1759014951627089, + "grad_norm": 8.146444251304837, + "learning_rate": 5.2754982415005856e-05, + "loss": 7.7608819007873535, + "step": 300, + "token_acc": 0.02517310504945128 + }, + { + "epoch": 0.17648783347991792, + "grad_norm": 5.81969392025838, + "learning_rate": 5.2930832356389206e-05, + "loss": 7.8868088722229, + "step": 301, + "token_acc": 0.026045193782157562 + }, + { + "epoch": 0.17707417179712695, + "grad_norm": 3.55087871225099, + "learning_rate": 5.3106682297772564e-05, + "loss": 7.767951488494873, + "step": 302, + "token_acc": 0.026804178446754007 + }, + { + "epoch": 0.17766051011433598, + "grad_norm": 6.103333882605994, + "learning_rate": 5.3282532239155915e-05, + "loss": 7.703026294708252, + "step": 303, + "token_acc": 0.028081661460994327 + }, + { + "epoch": 0.178246848431545, + "grad_norm": 6.364365303635605, + "learning_rate": 5.345838218053927e-05, + "loss": 7.696715354919434, + "step": 304, + "token_acc": 0.02772256186845454 + }, + { + "epoch": 0.17883318674875404, + "grad_norm": 3.1244894707851656, + "learning_rate": 5.363423212192262e-05, + "loss": 7.673195838928223, + "step": 305, + "token_acc": 0.027334839978056535 + }, + { + "epoch": 0.17941952506596306, + "grad_norm": 2.245808538247048, + "learning_rate": 5.3810082063305974e-05, + "loss": 7.595062255859375, + "step": 306, + "token_acc": 0.029556739893409595 + }, + { + "epoch": 0.1800058633831721, + "grad_norm": 3.2670214584944497, + "learning_rate": 5.3985932004689325e-05, + "loss": 7.662883758544922, + "step": 307, + "token_acc": 0.030627175805047868 + }, + { + "epoch": 0.18059220170038112, + "grad_norm": 9.105293586837126, + "learning_rate": 5.416178194607268e-05, + "loss": 7.673460483551025, + "step": 308, + "token_acc": 0.027661091777773858 + }, + { + "epoch": 0.18117854001759015, + "grad_norm": 3.1321173873360406, + "learning_rate": 5.433763188745603e-05, + "loss": 7.641384124755859, + "step": 309, + "token_acc": 0.0288999058869512 + }, + { + "epoch": 0.18176487833479918, + "grad_norm": 13.919280211384155, + "learning_rate": 5.4513481828839385e-05, + "loss": 7.5895843505859375, + "step": 310, + "token_acc": 0.031850296366024974 + }, + { + "epoch": 0.1823512166520082, + "grad_norm": 10.78428048946607, + "learning_rate": 5.4689331770222736e-05, + "loss": 7.651038646697998, + "step": 311, + "token_acc": 0.03031201206764231 + }, + { + "epoch": 0.18293755496921724, + "grad_norm": 11.317942213652632, + "learning_rate": 5.486518171160609e-05, + "loss": 7.543618202209473, + "step": 312, + "token_acc": 0.028990082072378468 + }, + { + "epoch": 0.18352389328642627, + "grad_norm": 10.367792510592034, + "learning_rate": 5.504103165298945e-05, + "loss": 7.542943954467773, + "step": 313, + "token_acc": 0.03029279538609945 + }, + { + "epoch": 0.1841102316036353, + "grad_norm": 6.268839393388123, + "learning_rate": 5.5216881594372795e-05, + "loss": 7.507347106933594, + "step": 314, + "token_acc": 0.03205096252579529 + }, + { + "epoch": 0.18469656992084432, + "grad_norm": 4.508204916844255, + "learning_rate": 5.5392731535756146e-05, + "loss": 7.416807174682617, + "step": 315, + "token_acc": 0.03401838061716707 + }, + { + "epoch": 0.18528290823805335, + "grad_norm": 11.750848656420674, + "learning_rate": 5.5568581477139504e-05, + "loss": 7.428244113922119, + "step": 316, + "token_acc": 0.032806324110671935 + }, + { + "epoch": 0.18586924655526238, + "grad_norm": 11.627313030276985, + "learning_rate": 5.574443141852286e-05, + "loss": 7.453977108001709, + "step": 317, + "token_acc": 0.03260713590193003 + }, + { + "epoch": 0.1864555848724714, + "grad_norm": 5.214110239696652, + "learning_rate": 5.5920281359906205e-05, + "loss": 7.3778076171875, + "step": 318, + "token_acc": 0.03553857399851728 + }, + { + "epoch": 0.18704192318968044, + "grad_norm": 6.391817140754896, + "learning_rate": 5.609613130128956e-05, + "loss": 7.440142631530762, + "step": 319, + "token_acc": 0.03374119201973295 + }, + { + "epoch": 0.18762826150688947, + "grad_norm": 6.405246099739629, + "learning_rate": 5.6271981242672914e-05, + "loss": 7.383110523223877, + "step": 320, + "token_acc": 0.03369989912707122 + }, + { + "epoch": 0.1882145998240985, + "grad_norm": 4.008618115738209, + "learning_rate": 5.644783118405627e-05, + "loss": 7.389673709869385, + "step": 321, + "token_acc": 0.03433148960518787 + }, + { + "epoch": 0.18880093814130752, + "grad_norm": 5.767811435642842, + "learning_rate": 5.662368112543962e-05, + "loss": 7.326108932495117, + "step": 322, + "token_acc": 0.0385379304515945 + }, + { + "epoch": 0.18938727645851655, + "grad_norm": 3.7583662533508426, + "learning_rate": 5.679953106682297e-05, + "loss": 7.279698848724365, + "step": 323, + "token_acc": 0.03957132602866975 + }, + { + "epoch": 0.18997361477572558, + "grad_norm": 5.943633506455475, + "learning_rate": 5.6975381008206324e-05, + "loss": 7.280203342437744, + "step": 324, + "token_acc": 0.0367412308614768 + }, + { + "epoch": 0.19055995309293464, + "grad_norm": 3.2342148152791133, + "learning_rate": 5.715123094958968e-05, + "loss": 7.163398742675781, + "step": 325, + "token_acc": 0.04174379956665334 + }, + { + "epoch": 0.19114629141014366, + "grad_norm": 3.165610295503623, + "learning_rate": 5.732708089097303e-05, + "loss": 7.16325044631958, + "step": 326, + "token_acc": 0.04046002494111126 + }, + { + "epoch": 0.1917326297273527, + "grad_norm": 5.039046477812103, + "learning_rate": 5.7502930832356384e-05, + "loss": 7.215337753295898, + "step": 327, + "token_acc": 0.03968510953556506 + }, + { + "epoch": 0.19231896804456172, + "grad_norm": 2.883439081513902, + "learning_rate": 5.7678780773739735e-05, + "loss": 7.11888313293457, + "step": 328, + "token_acc": 0.04250615736031269 + }, + { + "epoch": 0.19290530636177075, + "grad_norm": 3.6553171107398628, + "learning_rate": 5.785463071512309e-05, + "loss": 7.125880241394043, + "step": 329, + "token_acc": 0.04175203327745701 + }, + { + "epoch": 0.19349164467897978, + "grad_norm": 5.058030098319214, + "learning_rate": 5.803048065650644e-05, + "loss": 7.135167121887207, + "step": 330, + "token_acc": 0.04496769124697356 + }, + { + "epoch": 0.1940779829961888, + "grad_norm": 3.816963267149248, + "learning_rate": 5.8206330597889794e-05, + "loss": 7.082459449768066, + "step": 331, + "token_acc": 0.04591925420991124 + }, + { + "epoch": 0.19466432131339784, + "grad_norm": 4.971782718161779, + "learning_rate": 5.8382180539273145e-05, + "loss": 7.1755266189575195, + "step": 332, + "token_acc": 0.04363740940406607 + }, + { + "epoch": 0.19525065963060687, + "grad_norm": 4.059838755757486, + "learning_rate": 5.85580304806565e-05, + "loss": 7.031644344329834, + "step": 333, + "token_acc": 0.04727827833332024 + }, + { + "epoch": 0.1958369979478159, + "grad_norm": 6.9386590373189, + "learning_rate": 5.8733880422039853e-05, + "loss": 7.099215507507324, + "step": 334, + "token_acc": 0.045780148259895674 + }, + { + "epoch": 0.19642333626502492, + "grad_norm": 2.6967921812083, + "learning_rate": 5.890973036342321e-05, + "loss": 6.985239028930664, + "step": 335, + "token_acc": 0.05013862557090652 + }, + { + "epoch": 0.19700967458223395, + "grad_norm": 7.550407977501261, + "learning_rate": 5.9085580304806555e-05, + "loss": 7.04837703704834, + "step": 336, + "token_acc": 0.04612652287519834 + }, + { + "epoch": 0.19759601289944298, + "grad_norm": 5.27521125963567, + "learning_rate": 5.926143024618991e-05, + "loss": 6.9652886390686035, + "step": 337, + "token_acc": 0.051093080386549805 + }, + { + "epoch": 0.198182351216652, + "grad_norm": 3.6158798804507555, + "learning_rate": 5.9437280187573264e-05, + "loss": 6.937475204467773, + "step": 338, + "token_acc": 0.05021848416495554 + }, + { + "epoch": 0.19876868953386104, + "grad_norm": 6.222175978899776, + "learning_rate": 5.961313012895662e-05, + "loss": 7.00045108795166, + "step": 339, + "token_acc": 0.04891469585423589 + }, + { + "epoch": 0.19935502785107007, + "grad_norm": 3.1054397194678796, + "learning_rate": 5.9788980070339966e-05, + "loss": 6.8834381103515625, + "step": 340, + "token_acc": 0.0526119639685359 + }, + { + "epoch": 0.1999413661682791, + "grad_norm": 4.397389351686467, + "learning_rate": 5.996483001172332e-05, + "loss": 6.9578094482421875, + "step": 341, + "token_acc": 0.052191083494864225 + }, + { + "epoch": 0.20052770448548812, + "grad_norm": 4.801319634690989, + "learning_rate": 6.0140679953106674e-05, + "loss": 6.844263076782227, + "step": 342, + "token_acc": 0.052490422443043486 + }, + { + "epoch": 0.20111404280269715, + "grad_norm": 4.767657258047003, + "learning_rate": 6.031652989449003e-05, + "loss": 6.938053131103516, + "step": 343, + "token_acc": 0.05282566016014473 + }, + { + "epoch": 0.20170038111990618, + "grad_norm": 2.253603185039558, + "learning_rate": 6.049237983587339e-05, + "loss": 6.819204807281494, + "step": 344, + "token_acc": 0.05629140825095781 + }, + { + "epoch": 0.2022867194371152, + "grad_norm": 4.344731956548509, + "learning_rate": 6.0668229777256734e-05, + "loss": 6.8875885009765625, + "step": 345, + "token_acc": 0.05337461936760417 + }, + { + "epoch": 0.20287305775432424, + "grad_norm": 2.811385796889861, + "learning_rate": 6.0844079718640084e-05, + "loss": 6.836453437805176, + "step": 346, + "token_acc": 0.05516626219443596 + }, + { + "epoch": 0.20345939607153327, + "grad_norm": 6.092241745161295, + "learning_rate": 6.101992966002344e-05, + "loss": 6.767938613891602, + "step": 347, + "token_acc": 0.058459332297080176 + }, + { + "epoch": 0.2040457343887423, + "grad_norm": 3.9864307005875705, + "learning_rate": 6.11957796014068e-05, + "loss": 6.770617485046387, + "step": 348, + "token_acc": 0.05724495126315673 + }, + { + "epoch": 0.20463207270595132, + "grad_norm": 4.058901327550995, + "learning_rate": 6.137162954279014e-05, + "loss": 6.76531982421875, + "step": 349, + "token_acc": 0.06027002080049777 + }, + { + "epoch": 0.20521841102316035, + "grad_norm": 4.317791212637859, + "learning_rate": 6.15474794841735e-05, + "loss": 6.773094177246094, + "step": 350, + "token_acc": 0.062060392762690104 + }, + { + "epoch": 0.20580474934036938, + "grad_norm": 4.934932155666583, + "learning_rate": 6.172332942555685e-05, + "loss": 6.6705732345581055, + "step": 351, + "token_acc": 0.06350626808100289 + }, + { + "epoch": 0.2063910876575784, + "grad_norm": 3.9845913931971464, + "learning_rate": 6.18991793669402e-05, + "loss": 6.6311445236206055, + "step": 352, + "token_acc": 0.06489040217171262 + }, + { + "epoch": 0.20697742597478747, + "grad_norm": 5.705320434840597, + "learning_rate": 6.207502930832357e-05, + "loss": 6.754546165466309, + "step": 353, + "token_acc": 0.06055406773701478 + }, + { + "epoch": 0.2075637642919965, + "grad_norm": 3.9814825127524904, + "learning_rate": 6.22508792497069e-05, + "loss": 6.669569969177246, + "step": 354, + "token_acc": 0.0641871223260956 + }, + { + "epoch": 0.20815010260920552, + "grad_norm": 2.932258869299944, + "learning_rate": 6.242672919109027e-05, + "loss": 6.554462432861328, + "step": 355, + "token_acc": 0.06948275862068966 + }, + { + "epoch": 0.20873644092641455, + "grad_norm": 4.052589265141277, + "learning_rate": 6.260257913247362e-05, + "loss": 6.661326885223389, + "step": 356, + "token_acc": 0.06407248499617613 + }, + { + "epoch": 0.20932277924362358, + "grad_norm": 2.5844799515788988, + "learning_rate": 6.277842907385697e-05, + "loss": 6.592467308044434, + "step": 357, + "token_acc": 0.06882969230088795 + }, + { + "epoch": 0.2099091175608326, + "grad_norm": 6.63740309439479, + "learning_rate": 6.295427901524032e-05, + "loss": 6.533319473266602, + "step": 358, + "token_acc": 0.07381255312297078 + }, + { + "epoch": 0.21049545587804164, + "grad_norm": 2.867880931416568, + "learning_rate": 6.313012895662367e-05, + "loss": 6.496272563934326, + "step": 359, + "token_acc": 0.07298163480118769 + }, + { + "epoch": 0.21108179419525067, + "grad_norm": 6.466092282670102, + "learning_rate": 6.330597889800702e-05, + "loss": 6.492397308349609, + "step": 360, + "token_acc": 0.0708952157789965 + }, + { + "epoch": 0.2116681325124597, + "grad_norm": 4.081236904055039, + "learning_rate": 6.348182883939039e-05, + "loss": 6.564023494720459, + "step": 361, + "token_acc": 0.07080278501869934 + }, + { + "epoch": 0.21225447082966872, + "grad_norm": 4.576791166092956, + "learning_rate": 6.365767878077374e-05, + "loss": 6.539773941040039, + "step": 362, + "token_acc": 0.07262635282723645 + }, + { + "epoch": 0.21284080914687775, + "grad_norm": 3.4615545149694227, + "learning_rate": 6.383352872215709e-05, + "loss": 6.419950485229492, + "step": 363, + "token_acc": 0.07525542701156895 + }, + { + "epoch": 0.21342714746408678, + "grad_norm": 3.8621845560128123, + "learning_rate": 6.400937866354044e-05, + "loss": 6.485685348510742, + "step": 364, + "token_acc": 0.0747659922549649 + }, + { + "epoch": 0.2140134857812958, + "grad_norm": 4.310845577707348, + "learning_rate": 6.418522860492379e-05, + "loss": 6.360536575317383, + "step": 365, + "token_acc": 0.08212773393915768 + }, + { + "epoch": 0.21459982409850484, + "grad_norm": 5.130310485378813, + "learning_rate": 6.436107854630714e-05, + "loss": 6.332820892333984, + "step": 366, + "token_acc": 0.08399961197565657 + }, + { + "epoch": 0.21518616241571387, + "grad_norm": 3.2627227667341243, + "learning_rate": 6.45369284876905e-05, + "loss": 6.339569091796875, + "step": 367, + "token_acc": 0.0825173213119373 + }, + { + "epoch": 0.2157725007329229, + "grad_norm": 2.7698115072089378, + "learning_rate": 6.471277842907384e-05, + "loss": 6.3140997886657715, + "step": 368, + "token_acc": 0.08358076882747544 + }, + { + "epoch": 0.21635883905013192, + "grad_norm": 6.237769593166118, + "learning_rate": 6.488862837045721e-05, + "loss": 6.258779525756836, + "step": 369, + "token_acc": 0.0876622606762058 + }, + { + "epoch": 0.21694517736734095, + "grad_norm": 3.198082732767584, + "learning_rate": 6.506447831184056e-05, + "loss": 6.240322589874268, + "step": 370, + "token_acc": 0.08722307407603236 + }, + { + "epoch": 0.21753151568454998, + "grad_norm": 5.419124343030608, + "learning_rate": 6.524032825322391e-05, + "loss": 6.224976539611816, + "step": 371, + "token_acc": 0.09026953041111344 + }, + { + "epoch": 0.218117854001759, + "grad_norm": 2.691783564878796, + "learning_rate": 6.541617819460726e-05, + "loss": 6.274032115936279, + "step": 372, + "token_acc": 0.08869360367395016 + }, + { + "epoch": 0.21870419231896804, + "grad_norm": 6.321056109320655, + "learning_rate": 6.559202813599061e-05, + "loss": 6.243311882019043, + "step": 373, + "token_acc": 0.08884785339860503 + }, + { + "epoch": 0.21929053063617707, + "grad_norm": 3.5963317047768624, + "learning_rate": 6.576787807737396e-05, + "loss": 6.225800037384033, + "step": 374, + "token_acc": 0.08838464533626929 + }, + { + "epoch": 0.2198768689533861, + "grad_norm": 4.891786070067984, + "learning_rate": 6.594372801875733e-05, + "loss": 6.2503886222839355, + "step": 375, + "token_acc": 0.08695025951783629 + }, + { + "epoch": 0.22046320727059512, + "grad_norm": 3.3030936613989357, + "learning_rate": 6.611957796014067e-05, + "loss": 6.132002353668213, + "step": 376, + "token_acc": 0.09638771588071487 + }, + { + "epoch": 0.22104954558780415, + "grad_norm": 5.179555144508683, + "learning_rate": 6.629542790152403e-05, + "loss": 6.224666595458984, + "step": 377, + "token_acc": 0.09155244630836488 + }, + { + "epoch": 0.22163588390501318, + "grad_norm": 4.359283222832193, + "learning_rate": 6.647127784290738e-05, + "loss": 6.115947246551514, + "step": 378, + "token_acc": 0.09782691594585567 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 3.2069770676956164, + "learning_rate": 6.664712778429073e-05, + "loss": 6.088150978088379, + "step": 379, + "token_acc": 0.09868489873210612 + }, + { + "epoch": 0.22280856053943124, + "grad_norm": 5.21906238950017, + "learning_rate": 6.682297772567408e-05, + "loss": 6.070379257202148, + "step": 380, + "token_acc": 0.09922988287031165 + }, + { + "epoch": 0.2233948988566403, + "grad_norm": 3.671361985579552, + "learning_rate": 6.699882766705743e-05, + "loss": 6.007410049438477, + "step": 381, + "token_acc": 0.10148445547954252 + }, + { + "epoch": 0.22398123717384932, + "grad_norm": 3.6539275815743, + "learning_rate": 6.71746776084408e-05, + "loss": 5.959519386291504, + "step": 382, + "token_acc": 0.10463915681433661 + }, + { + "epoch": 0.22456757549105835, + "grad_norm": 3.484047399329108, + "learning_rate": 6.735052754982415e-05, + "loss": 5.968883514404297, + "step": 383, + "token_acc": 0.10419276129566506 + }, + { + "epoch": 0.22515391380826738, + "grad_norm": 3.7392798077416334, + "learning_rate": 6.75263774912075e-05, + "loss": 5.93845272064209, + "step": 384, + "token_acc": 0.107687439169029 + }, + { + "epoch": 0.2257402521254764, + "grad_norm": 5.104027979643277, + "learning_rate": 6.770222743259085e-05, + "loss": 5.954387664794922, + "step": 385, + "token_acc": 0.10229585508398396 + }, + { + "epoch": 0.22632659044268544, + "grad_norm": 3.2295895983045044, + "learning_rate": 6.78780773739742e-05, + "loss": 5.954471588134766, + "step": 386, + "token_acc": 0.10523225769795812 + }, + { + "epoch": 0.22691292875989447, + "grad_norm": 3.902262975674632, + "learning_rate": 6.805392731535755e-05, + "loss": 5.940810203552246, + "step": 387, + "token_acc": 0.10337094696979714 + }, + { + "epoch": 0.2274992670771035, + "grad_norm": 4.215680276073613, + "learning_rate": 6.822977725674092e-05, + "loss": 5.91622257232666, + "step": 388, + "token_acc": 0.11124847764272017 + }, + { + "epoch": 0.22808560539431252, + "grad_norm": 4.02224878232534, + "learning_rate": 6.840562719812425e-05, + "loss": 5.936064720153809, + "step": 389, + "token_acc": 0.10599353059489497 + }, + { + "epoch": 0.22867194371152155, + "grad_norm": 4.400738551378376, + "learning_rate": 6.858147713950762e-05, + "loss": 5.819920063018799, + "step": 390, + "token_acc": 0.11496178107801813 + }, + { + "epoch": 0.22925828202873058, + "grad_norm": 3.106468815757425, + "learning_rate": 6.875732708089097e-05, + "loss": 5.793968200683594, + "step": 391, + "token_acc": 0.11380289997024727 + }, + { + "epoch": 0.2298446203459396, + "grad_norm": 4.270459329469786, + "learning_rate": 6.893317702227432e-05, + "loss": 5.831222057342529, + "step": 392, + "token_acc": 0.1124405064729474 + }, + { + "epoch": 0.23043095866314864, + "grad_norm": 3.667073317503492, + "learning_rate": 6.910902696365767e-05, + "loss": 5.7944536209106445, + "step": 393, + "token_acc": 0.11555909335349523 + }, + { + "epoch": 0.23101729698035767, + "grad_norm": 4.715840322471299, + "learning_rate": 6.928487690504102e-05, + "loss": 5.698849678039551, + "step": 394, + "token_acc": 0.11793997132776834 + }, + { + "epoch": 0.2316036352975667, + "grad_norm": 3.5271256105819915, + "learning_rate": 6.946072684642437e-05, + "loss": 5.790862083435059, + "step": 395, + "token_acc": 0.11205064554684319 + }, + { + "epoch": 0.23218997361477572, + "grad_norm": 4.123248336581656, + "learning_rate": 6.963657678780774e-05, + "loss": 5.726106643676758, + "step": 396, + "token_acc": 0.122620957309185 + }, + { + "epoch": 0.23277631193198475, + "grad_norm": 4.601396240828345, + "learning_rate": 6.981242672919109e-05, + "loss": 5.763605117797852, + "step": 397, + "token_acc": 0.11514154754465278 + }, + { + "epoch": 0.23336265024919378, + "grad_norm": 4.1784116657872366, + "learning_rate": 6.998827667057444e-05, + "loss": 5.816640853881836, + "step": 398, + "token_acc": 0.11001271044238153 + }, + { + "epoch": 0.2339489885664028, + "grad_norm": 4.303445038580127, + "learning_rate": 7.016412661195779e-05, + "loss": 5.6869001388549805, + "step": 399, + "token_acc": 0.1205205334365583 + }, + { + "epoch": 0.23453532688361184, + "grad_norm": 3.483165080467426, + "learning_rate": 7.033997655334114e-05, + "loss": 5.675357818603516, + "step": 400, + "token_acc": 0.12406015037593984 + }, + { + "epoch": 0.23512166520082087, + "grad_norm": 4.513334435815259, + "learning_rate": 7.051582649472449e-05, + "loss": 5.637521266937256, + "step": 401, + "token_acc": 0.12292262927553806 + }, + { + "epoch": 0.2357080035180299, + "grad_norm": 4.018562496654571, + "learning_rate": 7.069167643610786e-05, + "loss": 5.609130382537842, + "step": 402, + "token_acc": 0.1258798253374222 + }, + { + "epoch": 0.23629434183523892, + "grad_norm": 4.310385199445542, + "learning_rate": 7.08675263774912e-05, + "loss": 5.596505165100098, + "step": 403, + "token_acc": 0.12747931954290223 + }, + { + "epoch": 0.23688068015244795, + "grad_norm": 3.6719739018035673, + "learning_rate": 7.104337631887456e-05, + "loss": 5.603360176086426, + "step": 404, + "token_acc": 0.12725657856703648 + }, + { + "epoch": 0.23746701846965698, + "grad_norm": 5.107645944979772, + "learning_rate": 7.121922626025791e-05, + "loss": 5.512840270996094, + "step": 405, + "token_acc": 0.13123216097307128 + }, + { + "epoch": 0.238053356786866, + "grad_norm": 3.4242288333948783, + "learning_rate": 7.139507620164126e-05, + "loss": 5.566375732421875, + "step": 406, + "token_acc": 0.12973804470499303 + }, + { + "epoch": 0.23863969510407504, + "grad_norm": 5.634245072157221, + "learning_rate": 7.157092614302461e-05, + "loss": 5.549043655395508, + "step": 407, + "token_acc": 0.13178647372061392 + }, + { + "epoch": 0.23922603342128407, + "grad_norm": 3.1258971889919236, + "learning_rate": 7.174677608440796e-05, + "loss": 5.5727949142456055, + "step": 408, + "token_acc": 0.12761968317523872 + }, + { + "epoch": 0.23981237173849312, + "grad_norm": 5.159396424110232, + "learning_rate": 7.192262602579131e-05, + "loss": 5.5011444091796875, + "step": 409, + "token_acc": 0.1339217887080706 + }, + { + "epoch": 0.24039871005570215, + "grad_norm": 3.651002171915324, + "learning_rate": 7.209847596717468e-05, + "loss": 5.50866174697876, + "step": 410, + "token_acc": 0.13396123861211842 + }, + { + "epoch": 0.24098504837291118, + "grad_norm": 3.511854270534502, + "learning_rate": 7.227432590855801e-05, + "loss": 5.360819339752197, + "step": 411, + "token_acc": 0.1451115018560384 + }, + { + "epoch": 0.2415713866901202, + "grad_norm": 3.8741646298104424, + "learning_rate": 7.245017584994138e-05, + "loss": 5.452428817749023, + "step": 412, + "token_acc": 0.1357357762727176 + }, + { + "epoch": 0.24215772500732924, + "grad_norm": 4.140739124810105, + "learning_rate": 7.262602579132473e-05, + "loss": 5.4548258781433105, + "step": 413, + "token_acc": 0.13496192185614472 + }, + { + "epoch": 0.24274406332453827, + "grad_norm": 3.6336401396883113, + "learning_rate": 7.280187573270808e-05, + "loss": 5.409458160400391, + "step": 414, + "token_acc": 0.1404517432057197 + }, + { + "epoch": 0.2433304016417473, + "grad_norm": 4.455997416997323, + "learning_rate": 7.297772567409144e-05, + "loss": 5.440122127532959, + "step": 415, + "token_acc": 0.13411181413925255 + }, + { + "epoch": 0.24391673995895632, + "grad_norm": 3.997954407390372, + "learning_rate": 7.315357561547478e-05, + "loss": 5.495820045471191, + "step": 416, + "token_acc": 0.13153217390851768 + }, + { + "epoch": 0.24450307827616535, + "grad_norm": 2.6517978433218174, + "learning_rate": 7.332942555685815e-05, + "loss": 5.326896667480469, + "step": 417, + "token_acc": 0.14230757201388672 + }, + { + "epoch": 0.24508941659337438, + "grad_norm": 4.808739982288075, + "learning_rate": 7.35052754982415e-05, + "loss": 5.362764835357666, + "step": 418, + "token_acc": 0.1434285236929573 + }, + { + "epoch": 0.2456757549105834, + "grad_norm": 4.112335927290701, + "learning_rate": 7.368112543962485e-05, + "loss": 5.388121604919434, + "step": 419, + "token_acc": 0.13707269712610917 + }, + { + "epoch": 0.24626209322779244, + "grad_norm": 4.197959879073062, + "learning_rate": 7.38569753810082e-05, + "loss": 5.376265525817871, + "step": 420, + "token_acc": 0.1362363219270727 + }, + { + "epoch": 0.24684843154500147, + "grad_norm": 3.5180398232718897, + "learning_rate": 7.403282532239155e-05, + "loss": 5.385644435882568, + "step": 421, + "token_acc": 0.14158413430974454 + }, + { + "epoch": 0.2474347698622105, + "grad_norm": 3.4469219080041453, + "learning_rate": 7.42086752637749e-05, + "loss": 5.353095054626465, + "step": 422, + "token_acc": 0.14258298020324836 + }, + { + "epoch": 0.24802110817941952, + "grad_norm": 5.808916479823406, + "learning_rate": 7.438452520515827e-05, + "loss": 5.38712739944458, + "step": 423, + "token_acc": 0.13573487786979768 + }, + { + "epoch": 0.24860744649662855, + "grad_norm": 2.6387248735386915, + "learning_rate": 7.456037514654162e-05, + "loss": 5.203732490539551, + "step": 424, + "token_acc": 0.15196587265665865 + }, + { + "epoch": 0.24919378481383758, + "grad_norm": 5.867979613408349, + "learning_rate": 7.473622508792497e-05, + "loss": 5.249689102172852, + "step": 425, + "token_acc": 0.15209264427618951 + }, + { + "epoch": 0.2497801231310466, + "grad_norm": 3.5276269628564294, + "learning_rate": 7.491207502930832e-05, + "loss": 5.266202449798584, + "step": 426, + "token_acc": 0.14987937820826758 + }, + { + "epoch": 0.25036646144825564, + "grad_norm": 3.946052632053937, + "learning_rate": 7.508792497069167e-05, + "loss": 5.274470806121826, + "step": 427, + "token_acc": 0.14590072504182933 + }, + { + "epoch": 0.2509527997654647, + "grad_norm": 3.662742158072335, + "learning_rate": 7.526377491207502e-05, + "loss": 5.274651050567627, + "step": 428, + "token_acc": 0.14401235857865766 + }, + { + "epoch": 0.2515391380826737, + "grad_norm": 4.264731892009234, + "learning_rate": 7.543962485345838e-05, + "loss": 5.256073951721191, + "step": 429, + "token_acc": 0.15129322948510063 + }, + { + "epoch": 0.25212547639988275, + "grad_norm": 2.697176414371289, + "learning_rate": 7.561547479484174e-05, + "loss": 5.273626804351807, + "step": 430, + "token_acc": 0.147494960395838 + }, + { + "epoch": 0.25271181471709175, + "grad_norm": 3.4506169814872174, + "learning_rate": 7.579132473622507e-05, + "loss": 5.143132209777832, + "step": 431, + "token_acc": 0.15700888715082084 + }, + { + "epoch": 0.2532981530343008, + "grad_norm": 4.122652348529465, + "learning_rate": 7.596717467760842e-05, + "loss": 5.217945098876953, + "step": 432, + "token_acc": 0.1505449056450857 + }, + { + "epoch": 0.2538844913515098, + "grad_norm": 2.290202740182317, + "learning_rate": 7.614302461899179e-05, + "loss": 5.189580917358398, + "step": 433, + "token_acc": 0.15147783566518605 + }, + { + "epoch": 0.25447082966871887, + "grad_norm": 6.086241792160422, + "learning_rate": 7.631887456037514e-05, + "loss": 5.193511962890625, + "step": 434, + "token_acc": 0.14781076057130063 + }, + { + "epoch": 0.25505716798592787, + "grad_norm": 3.937639776278552, + "learning_rate": 7.649472450175849e-05, + "loss": 5.153887748718262, + "step": 435, + "token_acc": 0.15459014119955417 + }, + { + "epoch": 0.2556435063031369, + "grad_norm": 3.5701669774395466, + "learning_rate": 7.667057444314184e-05, + "loss": 5.151963233947754, + "step": 436, + "token_acc": 0.1553613392837893 + }, + { + "epoch": 0.2562298446203459, + "grad_norm": 4.321933004097814, + "learning_rate": 7.68464243845252e-05, + "loss": 5.219295501708984, + "step": 437, + "token_acc": 0.14973951439553573 + }, + { + "epoch": 0.256816182937555, + "grad_norm": 3.6446538960209045, + "learning_rate": 7.702227432590856e-05, + "loss": 5.178834438323975, + "step": 438, + "token_acc": 0.15094032488008002 + }, + { + "epoch": 0.257402521254764, + "grad_norm": 4.215823977064856, + "learning_rate": 7.71981242672919e-05, + "loss": 5.084681987762451, + "step": 439, + "token_acc": 0.1602725085422162 + }, + { + "epoch": 0.25798885957197304, + "grad_norm": 2.082208848443483, + "learning_rate": 7.737397420867524e-05, + "loss": 5.073691368103027, + "step": 440, + "token_acc": 0.16210726514031104 + }, + { + "epoch": 0.25857519788918204, + "grad_norm": 4.609460695706422, + "learning_rate": 7.754982415005861e-05, + "loss": 5.116462707519531, + "step": 441, + "token_acc": 0.15574350719150967 + }, + { + "epoch": 0.2591615362063911, + "grad_norm": 3.525751510753823, + "learning_rate": 7.772567409144196e-05, + "loss": 5.100196838378906, + "step": 442, + "token_acc": 0.15798017903801853 + }, + { + "epoch": 0.2597478745236001, + "grad_norm": 4.617431093703994, + "learning_rate": 7.790152403282531e-05, + "loss": 5.1339311599731445, + "step": 443, + "token_acc": 0.15189041921971075 + }, + { + "epoch": 0.26033421284080915, + "grad_norm": 3.017863480202891, + "learning_rate": 7.807737397420867e-05, + "loss": 5.0896759033203125, + "step": 444, + "token_acc": 0.15756112126815827 + }, + { + "epoch": 0.26092055115801815, + "grad_norm": 4.171071570802497, + "learning_rate": 7.825322391559203e-05, + "loss": 5.082404613494873, + "step": 445, + "token_acc": 0.15682236355408383 + }, + { + "epoch": 0.2615068894752272, + "grad_norm": 3.3636820336741913, + "learning_rate": 7.842907385697538e-05, + "loss": 5.070326805114746, + "step": 446, + "token_acc": 0.15744733466252453 + }, + { + "epoch": 0.2620932277924362, + "grad_norm": 3.3012431566827765, + "learning_rate": 7.860492379835873e-05, + "loss": 5.013023376464844, + "step": 447, + "token_acc": 0.1633589272139496 + }, + { + "epoch": 0.26267956610964527, + "grad_norm": 3.845746490923582, + "learning_rate": 7.878077373974209e-05, + "loss": 5.1147918701171875, + "step": 448, + "token_acc": 0.1536516467588527 + }, + { + "epoch": 0.26326590442685427, + "grad_norm": 3.5345488842199626, + "learning_rate": 7.895662368112543e-05, + "loss": 4.982331275939941, + "step": 449, + "token_acc": 0.1635154152537532 + }, + { + "epoch": 0.2638522427440633, + "grad_norm": 3.3299318537006655, + "learning_rate": 7.913247362250878e-05, + "loss": 5.0843048095703125, + "step": 450, + "token_acc": 0.1566409435481122 + }, + { + "epoch": 0.2644385810612723, + "grad_norm": 3.8323982942537427, + "learning_rate": 7.930832356389213e-05, + "loss": 5.060731410980225, + "step": 451, + "token_acc": 0.15935043929874754 + }, + { + "epoch": 0.2650249193784814, + "grad_norm": 4.121507885756696, + "learning_rate": 7.94841735052755e-05, + "loss": 5.041284084320068, + "step": 452, + "token_acc": 0.16059973702967037 + }, + { + "epoch": 0.26561125769569044, + "grad_norm": 3.5580531202180463, + "learning_rate": 7.966002344665885e-05, + "loss": 4.961620330810547, + "step": 453, + "token_acc": 0.16602636059784323 + }, + { + "epoch": 0.26619759601289944, + "grad_norm": 3.0163193561626094, + "learning_rate": 7.98358733880422e-05, + "loss": 4.878084659576416, + "step": 454, + "token_acc": 0.17350987909780022 + }, + { + "epoch": 0.2667839343301085, + "grad_norm": 3.431317820756719, + "learning_rate": 8.001172332942555e-05, + "loss": 5.0134687423706055, + "step": 455, + "token_acc": 0.16012941459456173 + }, + { + "epoch": 0.2673702726473175, + "grad_norm": 2.705360648318704, + "learning_rate": 8.018757327080891e-05, + "loss": 4.891533374786377, + "step": 456, + "token_acc": 0.17029746225505943 + }, + { + "epoch": 0.26795661096452655, + "grad_norm": 5.1314399238445985, + "learning_rate": 8.036342321219226e-05, + "loss": 4.998048782348633, + "step": 457, + "token_acc": 0.16485212848348013 + }, + { + "epoch": 0.26854294928173555, + "grad_norm": 2.3647801017291785, + "learning_rate": 8.05392731535756e-05, + "loss": 4.95503568649292, + "step": 458, + "token_acc": 0.16324241382047144 + }, + { + "epoch": 0.2691292875989446, + "grad_norm": 5.2296717763972715, + "learning_rate": 8.071512309495895e-05, + "loss": 5.009551048278809, + "step": 459, + "token_acc": 0.1602560937901933 + }, + { + "epoch": 0.2697156259161536, + "grad_norm": 2.9545150306653385, + "learning_rate": 8.089097303634232e-05, + "loss": 4.97419548034668, + "step": 460, + "token_acc": 0.16263180369871563 + }, + { + "epoch": 0.27030196423336267, + "grad_norm": 4.945318371027142, + "learning_rate": 8.106682297772567e-05, + "loss": 5.022619724273682, + "step": 461, + "token_acc": 0.15620776778894177 + }, + { + "epoch": 0.27088830255057167, + "grad_norm": 3.397431079817508, + "learning_rate": 8.124267291910902e-05, + "loss": 4.894867897033691, + "step": 462, + "token_acc": 0.17118931692385772 + }, + { + "epoch": 0.2714746408677807, + "grad_norm": 4.150211893682385, + "learning_rate": 8.141852286049237e-05, + "loss": 4.851693153381348, + "step": 463, + "token_acc": 0.1745461052905577 + }, + { + "epoch": 0.2720609791849897, + "grad_norm": 3.0270975998642644, + "learning_rate": 8.159437280187573e-05, + "loss": 4.882989883422852, + "step": 464, + "token_acc": 0.1707711409927879 + }, + { + "epoch": 0.2726473175021988, + "grad_norm": 2.977653736872163, + "learning_rate": 8.177022274325908e-05, + "loss": 4.971857070922852, + "step": 465, + "token_acc": 0.16176777654636276 + }, + { + "epoch": 0.2732336558194078, + "grad_norm": 2.944303587003063, + "learning_rate": 8.194607268464243e-05, + "loss": 4.874970436096191, + "step": 466, + "token_acc": 0.16954215285843582 + }, + { + "epoch": 0.27381999413661684, + "grad_norm": 3.1896761394478617, + "learning_rate": 8.212192262602577e-05, + "loss": 4.909939289093018, + "step": 467, + "token_acc": 0.165068006112229 + }, + { + "epoch": 0.27440633245382584, + "grad_norm": 4.293542865739274, + "learning_rate": 8.229777256740914e-05, + "loss": 4.90077018737793, + "step": 468, + "token_acc": 0.17014762781262913 + }, + { + "epoch": 0.2749926707710349, + "grad_norm": 3.1330889330046467, + "learning_rate": 8.247362250879249e-05, + "loss": 4.843530654907227, + "step": 469, + "token_acc": 0.16903433113478775 + }, + { + "epoch": 0.2755790090882439, + "grad_norm": 4.1621931176916265, + "learning_rate": 8.264947245017584e-05, + "loss": 4.883255958557129, + "step": 470, + "token_acc": 0.1669339839505141 + }, + { + "epoch": 0.27616534740545295, + "grad_norm": 3.8399961219958905, + "learning_rate": 8.282532239155919e-05, + "loss": 4.845980644226074, + "step": 471, + "token_acc": 0.1738849117154105 + }, + { + "epoch": 0.27675168572266196, + "grad_norm": 2.9752735036838027, + "learning_rate": 8.300117233294255e-05, + "loss": 4.86611270904541, + "step": 472, + "token_acc": 0.16837167979722756 + }, + { + "epoch": 0.277338024039871, + "grad_norm": 4.740099255306814, + "learning_rate": 8.31770222743259e-05, + "loss": 4.873641490936279, + "step": 473, + "token_acc": 0.17010540902164664 + }, + { + "epoch": 0.27792436235708, + "grad_norm": 2.470868235822677, + "learning_rate": 8.335287221570926e-05, + "loss": 4.788944244384766, + "step": 474, + "token_acc": 0.17623830305281293 + }, + { + "epoch": 0.27851070067428907, + "grad_norm": 4.622013550828369, + "learning_rate": 8.352872215709262e-05, + "loss": 4.74179744720459, + "step": 475, + "token_acc": 0.17933142703676203 + }, + { + "epoch": 0.27909703899149807, + "grad_norm": 2.8544668474720765, + "learning_rate": 8.370457209847596e-05, + "loss": 4.844178676605225, + "step": 476, + "token_acc": 0.17099172926625691 + }, + { + "epoch": 0.2796833773087071, + "grad_norm": 3.7744684942917344, + "learning_rate": 8.388042203985931e-05, + "loss": 4.800683975219727, + "step": 477, + "token_acc": 0.17339959664316332 + }, + { + "epoch": 0.2802697156259161, + "grad_norm": 3.405329217039666, + "learning_rate": 8.405627198124266e-05, + "loss": 4.812726020812988, + "step": 478, + "token_acc": 0.1753202502074898 + }, + { + "epoch": 0.2808560539431252, + "grad_norm": 4.9716612594809115, + "learning_rate": 8.423212192262602e-05, + "loss": 4.8752336502075195, + "step": 479, + "token_acc": 0.16625593114356127 + }, + { + "epoch": 0.28144239226033424, + "grad_norm": 3.3820338181300476, + "learning_rate": 8.440797186400937e-05, + "loss": 4.74002742767334, + "step": 480, + "token_acc": 0.17751355588946233 + }, + { + "epoch": 0.28202873057754324, + "grad_norm": 3.9900300042135575, + "learning_rate": 8.458382180539273e-05, + "loss": 4.752828121185303, + "step": 481, + "token_acc": 0.1753159773799286 + }, + { + "epoch": 0.2826150688947523, + "grad_norm": 3.4269369834902768, + "learning_rate": 8.475967174677608e-05, + "loss": 4.7765607833862305, + "step": 482, + "token_acc": 0.17440990121973074 + }, + { + "epoch": 0.2832014072119613, + "grad_norm": 4.231019460263849, + "learning_rate": 8.493552168815944e-05, + "loss": 4.792545318603516, + "step": 483, + "token_acc": 0.17406913421442466 + }, + { + "epoch": 0.28378774552917035, + "grad_norm": 3.139852211027999, + "learning_rate": 8.511137162954279e-05, + "loss": 4.7755303382873535, + "step": 484, + "token_acc": 0.17380265764877958 + }, + { + "epoch": 0.28437408384637936, + "grad_norm": 3.592729782124692, + "learning_rate": 8.528722157092613e-05, + "loss": 4.770051956176758, + "step": 485, + "token_acc": 0.17371479612439794 + }, + { + "epoch": 0.2849604221635884, + "grad_norm": 3.6187401118272366, + "learning_rate": 8.546307151230948e-05, + "loss": 4.7032694816589355, + "step": 486, + "token_acc": 0.1809716962696539 + }, + { + "epoch": 0.2855467604807974, + "grad_norm": 3.0045234882401974, + "learning_rate": 8.563892145369284e-05, + "loss": 4.725955963134766, + "step": 487, + "token_acc": 0.17817942676699738 + }, + { + "epoch": 0.28613309879800647, + "grad_norm": 4.53269932950948, + "learning_rate": 8.58147713950762e-05, + "loss": 4.784139633178711, + "step": 488, + "token_acc": 0.1743480132779322 + }, + { + "epoch": 0.28671943711521547, + "grad_norm": 2.6017795957109873, + "learning_rate": 8.599062133645955e-05, + "loss": 4.644981384277344, + "step": 489, + "token_acc": 0.18411112787487527 + }, + { + "epoch": 0.2873057754324245, + "grad_norm": 4.297472803697688, + "learning_rate": 8.61664712778429e-05, + "loss": 4.752431869506836, + "step": 490, + "token_acc": 0.17491059019496108 + }, + { + "epoch": 0.2878921137496335, + "grad_norm": 2.6511703891836964, + "learning_rate": 8.634232121922626e-05, + "loss": 4.645848274230957, + "step": 491, + "token_acc": 0.1850725632069195 + }, + { + "epoch": 0.2884784520668426, + "grad_norm": 4.107964009200134, + "learning_rate": 8.651817116060961e-05, + "loss": 4.729785919189453, + "step": 492, + "token_acc": 0.17807644837625158 + }, + { + "epoch": 0.2890647903840516, + "grad_norm": 2.9920451555349845, + "learning_rate": 8.669402110199296e-05, + "loss": 4.704058647155762, + "step": 493, + "token_acc": 0.18044468256094295 + }, + { + "epoch": 0.28965112870126064, + "grad_norm": 2.9469871810814134, + "learning_rate": 8.68698710433763e-05, + "loss": 4.713854789733887, + "step": 494, + "token_acc": 0.17756162788189878 + }, + { + "epoch": 0.29023746701846964, + "grad_norm": 4.06770303189972, + "learning_rate": 8.704572098475966e-05, + "loss": 4.729300498962402, + "step": 495, + "token_acc": 0.17551730736409854 + }, + { + "epoch": 0.2908238053356787, + "grad_norm": 2.7396804316485883, + "learning_rate": 8.722157092614302e-05, + "loss": 4.679933071136475, + "step": 496, + "token_acc": 0.17935259463524694 + }, + { + "epoch": 0.2914101436528877, + "grad_norm": 2.384751615316691, + "learning_rate": 8.739742086752637e-05, + "loss": 4.717489242553711, + "step": 497, + "token_acc": 0.1777613188759721 + }, + { + "epoch": 0.29199648197009676, + "grad_norm": 4.735842023000457, + "learning_rate": 8.757327080890972e-05, + "loss": 4.666316032409668, + "step": 498, + "token_acc": 0.18178123094720716 + }, + { + "epoch": 0.29258282028730576, + "grad_norm": 3.0220471186635507, + "learning_rate": 8.774912075029308e-05, + "loss": 4.643878936767578, + "step": 499, + "token_acc": 0.18439965365922964 + }, + { + "epoch": 0.2931691586045148, + "grad_norm": 4.441331901875158, + "learning_rate": 8.792497069167643e-05, + "loss": 4.796988487243652, + "step": 500, + "token_acc": 0.16914572075667747 + }, + { + "epoch": 0.2937554969217238, + "grad_norm": 2.7759859802199425, + "learning_rate": 8.810082063305978e-05, + "loss": 4.621725082397461, + "step": 501, + "token_acc": 0.18621704262064828 + }, + { + "epoch": 0.29434183523893287, + "grad_norm": 3.667398896824923, + "learning_rate": 8.827667057444315e-05, + "loss": 4.643711090087891, + "step": 502, + "token_acc": 0.18315541211519365 + }, + { + "epoch": 0.29492817355614187, + "grad_norm": 3.627269345593771, + "learning_rate": 8.845252051582649e-05, + "loss": 4.585537910461426, + "step": 503, + "token_acc": 0.19074013340669527 + }, + { + "epoch": 0.2955145118733509, + "grad_norm": 2.589392057406107, + "learning_rate": 8.862837045720984e-05, + "loss": 4.666231155395508, + "step": 504, + "token_acc": 0.17902322230917128 + }, + { + "epoch": 0.2961008501905599, + "grad_norm": 3.8257322006077916, + "learning_rate": 8.880422039859319e-05, + "loss": 4.708887100219727, + "step": 505, + "token_acc": 0.1744964024146941 + }, + { + "epoch": 0.296687188507769, + "grad_norm": 2.6419895463335266, + "learning_rate": 8.898007033997654e-05, + "loss": 4.63877010345459, + "step": 506, + "token_acc": 0.18197642962997812 + }, + { + "epoch": 0.297273526824978, + "grad_norm": 3.9906103928536814, + "learning_rate": 8.91559202813599e-05, + "loss": 4.671177387237549, + "step": 507, + "token_acc": 0.1801969474719116 + }, + { + "epoch": 0.29785986514218704, + "grad_norm": 3.152919983469968, + "learning_rate": 8.933177022274325e-05, + "loss": 4.592166423797607, + "step": 508, + "token_acc": 0.18710155788229335 + }, + { + "epoch": 0.2984462034593961, + "grad_norm": 3.655488570590753, + "learning_rate": 8.95076201641266e-05, + "loss": 4.644621849060059, + "step": 509, + "token_acc": 0.18176177812277072 + }, + { + "epoch": 0.2990325417766051, + "grad_norm": 2.750159953084596, + "learning_rate": 8.968347010550997e-05, + "loss": 4.562835693359375, + "step": 510, + "token_acc": 0.18830252762930944 + }, + { + "epoch": 0.29961888009381415, + "grad_norm": 2.4937239782422487, + "learning_rate": 8.98593200468933e-05, + "loss": 4.600914001464844, + "step": 511, + "token_acc": 0.1866240176996692 + }, + { + "epoch": 0.30020521841102316, + "grad_norm": 3.957279725521061, + "learning_rate": 9.003516998827666e-05, + "loss": 4.687961101531982, + "step": 512, + "token_acc": 0.17401908059621032 + }, + { + "epoch": 0.3007915567282322, + "grad_norm": 3.1854409145444658, + "learning_rate": 9.021101992966001e-05, + "loss": 4.646778106689453, + "step": 513, + "token_acc": 0.1782020477069379 + }, + { + "epoch": 0.3013778950454412, + "grad_norm": 3.0475452320689795, + "learning_rate": 9.038686987104337e-05, + "loss": 4.586252212524414, + "step": 514, + "token_acc": 0.18421430642140973 + }, + { + "epoch": 0.30196423336265027, + "grad_norm": 4.192017323997992, + "learning_rate": 9.056271981242672e-05, + "loss": 4.605769634246826, + "step": 515, + "token_acc": 0.1835988645528915 + }, + { + "epoch": 0.30255057167985927, + "grad_norm": 2.647703459193893, + "learning_rate": 9.073856975381007e-05, + "loss": 4.662216663360596, + "step": 516, + "token_acc": 0.1794005688948381 + }, + { + "epoch": 0.3031369099970683, + "grad_norm": 3.863217520699813, + "learning_rate": 9.091441969519342e-05, + "loss": 4.577106475830078, + "step": 517, + "token_acc": 0.1861207437025398 + }, + { + "epoch": 0.3037232483142773, + "grad_norm": 2.5968035759892323, + "learning_rate": 9.109026963657679e-05, + "loss": 4.558673858642578, + "step": 518, + "token_acc": 0.18617459820496765 + }, + { + "epoch": 0.3043095866314864, + "grad_norm": 4.1088138179372855, + "learning_rate": 9.126611957796014e-05, + "loss": 4.632473945617676, + "step": 519, + "token_acc": 0.1817299846101913 + }, + { + "epoch": 0.3048959249486954, + "grad_norm": 2.8407541010955017, + "learning_rate": 9.144196951934348e-05, + "loss": 4.575881481170654, + "step": 520, + "token_acc": 0.1831301395875154 + }, + { + "epoch": 0.30548226326590444, + "grad_norm": 3.525659096633153, + "learning_rate": 9.161781946072683e-05, + "loss": 4.6428117752075195, + "step": 521, + "token_acc": 0.1788675985954233 + }, + { + "epoch": 0.30606860158311344, + "grad_norm": 3.491119122494435, + "learning_rate": 9.179366940211019e-05, + "loss": 4.5430474281311035, + "step": 522, + "token_acc": 0.188283048651937 + }, + { + "epoch": 0.3066549399003225, + "grad_norm": 2.1428754929118288, + "learning_rate": 9.196951934349354e-05, + "loss": 4.5450439453125, + "step": 523, + "token_acc": 0.18679540229885058 + }, + { + "epoch": 0.3072412782175315, + "grad_norm": 3.0113952438674327, + "learning_rate": 9.21453692848769e-05, + "loss": 4.616668701171875, + "step": 524, + "token_acc": 0.1794623708766547 + }, + { + "epoch": 0.30782761653474056, + "grad_norm": 3.6984238294448684, + "learning_rate": 9.232121922626025e-05, + "loss": 4.533224582672119, + "step": 525, + "token_acc": 0.18899855325414847 + }, + { + "epoch": 0.30841395485194956, + "grad_norm": 3.3527459968983324, + "learning_rate": 9.249706916764361e-05, + "loss": 4.5262064933776855, + "step": 526, + "token_acc": 0.1895146528468351 + }, + { + "epoch": 0.3090002931691586, + "grad_norm": 2.9330124270233995, + "learning_rate": 9.267291910902696e-05, + "loss": 4.551050662994385, + "step": 527, + "token_acc": 0.1870796960986278 + }, + { + "epoch": 0.3095866314863676, + "grad_norm": 3.1240841176918908, + "learning_rate": 9.284876905041031e-05, + "loss": 4.535248756408691, + "step": 528, + "token_acc": 0.189337813691762 + }, + { + "epoch": 0.31017296980357667, + "grad_norm": 2.5427880866609045, + "learning_rate": 9.302461899179365e-05, + "loss": 4.5670037269592285, + "step": 529, + "token_acc": 0.18567923509948453 + }, + { + "epoch": 0.31075930812078567, + "grad_norm": 3.3224454254455917, + "learning_rate": 9.320046893317701e-05, + "loss": 4.515107154846191, + "step": 530, + "token_acc": 0.18920715739291882 + }, + { + "epoch": 0.3113456464379947, + "grad_norm": 3.0015687472127315, + "learning_rate": 9.337631887456036e-05, + "loss": 4.516177177429199, + "step": 531, + "token_acc": 0.1877613323520593 + }, + { + "epoch": 0.31193198475520373, + "grad_norm": 3.8973902201992665, + "learning_rate": 9.355216881594372e-05, + "loss": 4.48250675201416, + "step": 532, + "token_acc": 0.19162787497386688 + }, + { + "epoch": 0.3125183230724128, + "grad_norm": 3.5247420708446597, + "learning_rate": 9.372801875732707e-05, + "loss": 4.455173492431641, + "step": 533, + "token_acc": 0.19637005033275398 + }, + { + "epoch": 0.3131046613896218, + "grad_norm": 3.1246786854744526, + "learning_rate": 9.390386869871043e-05, + "loss": 4.451881408691406, + "step": 534, + "token_acc": 0.19472246072902177 + }, + { + "epoch": 0.31369099970683084, + "grad_norm": 3.4385940497905843, + "learning_rate": 9.407971864009378e-05, + "loss": 4.48879337310791, + "step": 535, + "token_acc": 0.19036605683808225 + }, + { + "epoch": 0.3142773380240399, + "grad_norm": 2.597097818650952, + "learning_rate": 9.425556858147713e-05, + "loss": 4.520144462585449, + "step": 536, + "token_acc": 0.18691259070500538 + }, + { + "epoch": 0.3148636763412489, + "grad_norm": 3.4226212241298195, + "learning_rate": 9.44314185228605e-05, + "loss": 4.540506362915039, + "step": 537, + "token_acc": 0.18508584126606056 + }, + { + "epoch": 0.31545001465845796, + "grad_norm": 2.4824914064908863, + "learning_rate": 9.460726846424383e-05, + "loss": 4.465728282928467, + "step": 538, + "token_acc": 0.1915556646554954 + }, + { + "epoch": 0.31603635297566696, + "grad_norm": 2.882341396541977, + "learning_rate": 9.478311840562719e-05, + "loss": 4.470610618591309, + "step": 539, + "token_acc": 0.19236570611700032 + }, + { + "epoch": 0.316622691292876, + "grad_norm": 2.8894471474624734, + "learning_rate": 9.495896834701054e-05, + "loss": 4.502852439880371, + "step": 540, + "token_acc": 0.19015903572175072 + }, + { + "epoch": 0.317209029610085, + "grad_norm": 3.023208244491889, + "learning_rate": 9.51348182883939e-05, + "loss": 4.464916229248047, + "step": 541, + "token_acc": 0.19146902867502907 + }, + { + "epoch": 0.31779536792729407, + "grad_norm": 2.6554894722122824, + "learning_rate": 9.531066822977725e-05, + "loss": 4.520737171173096, + "step": 542, + "token_acc": 0.1874107119148145 + }, + { + "epoch": 0.31838170624450307, + "grad_norm": 2.6524765319037744, + "learning_rate": 9.54865181711606e-05, + "loss": 4.5033159255981445, + "step": 543, + "token_acc": 0.18648980221769396 + }, + { + "epoch": 0.3189680445617121, + "grad_norm": 4.00175608469235, + "learning_rate": 9.566236811254395e-05, + "loss": 4.572734832763672, + "step": 544, + "token_acc": 0.18185341252883927 + }, + { + "epoch": 0.31955438287892113, + "grad_norm": 1.9363148550412852, + "learning_rate": 9.583821805392732e-05, + "loss": 4.3937578201293945, + "step": 545, + "token_acc": 0.19903716700733623 + }, + { + "epoch": 0.3201407211961302, + "grad_norm": 4.791396426586729, + "learning_rate": 9.601406799531067e-05, + "loss": 4.483650207519531, + "step": 546, + "token_acc": 0.18751971428280712 + }, + { + "epoch": 0.3207270595133392, + "grad_norm": 2.479850389374486, + "learning_rate": 9.6189917936694e-05, + "loss": 4.4839067459106445, + "step": 547, + "token_acc": 0.19110425840509074 + }, + { + "epoch": 0.32131339783054824, + "grad_norm": 4.057648818161033, + "learning_rate": 9.636576787807736e-05, + "loss": 4.555606842041016, + "step": 548, + "token_acc": 0.18468672920757026 + }, + { + "epoch": 0.32189973614775724, + "grad_norm": 3.0281361500272883, + "learning_rate": 9.654161781946072e-05, + "loss": 4.48748779296875, + "step": 549, + "token_acc": 0.1893791359047858 + }, + { + "epoch": 0.3224860744649663, + "grad_norm": 2.533216036098531, + "learning_rate": 9.671746776084407e-05, + "loss": 4.413464546203613, + "step": 550, + "token_acc": 0.19599664254347804 + }, + { + "epoch": 0.3230724127821753, + "grad_norm": 3.4476353025632274, + "learning_rate": 9.689331770222742e-05, + "loss": 4.490583419799805, + "step": 551, + "token_acc": 0.18889616934157516 + }, + { + "epoch": 0.32365875109938436, + "grad_norm": 2.2956474462733985, + "learning_rate": 9.706916764361077e-05, + "loss": 4.4333600997924805, + "step": 552, + "token_acc": 0.19354196538820573 + }, + { + "epoch": 0.32424508941659336, + "grad_norm": 3.4186757712042337, + "learning_rate": 9.724501758499414e-05, + "loss": 4.487241744995117, + "step": 553, + "token_acc": 0.1880563631111547 + }, + { + "epoch": 0.3248314277338024, + "grad_norm": 3.0509457750185494, + "learning_rate": 9.742086752637749e-05, + "loss": 4.4048943519592285, + "step": 554, + "token_acc": 0.19973247110884276 + }, + { + "epoch": 0.3254177660510114, + "grad_norm": 2.4010549397705123, + "learning_rate": 9.759671746776084e-05, + "loss": 4.403026580810547, + "step": 555, + "token_acc": 0.1987846912851113 + }, + { + "epoch": 0.32600410436822047, + "grad_norm": 3.5560588153925865, + "learning_rate": 9.777256740914418e-05, + "loss": 4.4723663330078125, + "step": 556, + "token_acc": 0.18985294391046498 + }, + { + "epoch": 0.32659044268542947, + "grad_norm": 3.58796477016802, + "learning_rate": 9.794841735052754e-05, + "loss": 4.515393257141113, + "step": 557, + "token_acc": 0.18431624659840234 + }, + { + "epoch": 0.32717678100263853, + "grad_norm": 3.286244235333189, + "learning_rate": 9.812426729191089e-05, + "loss": 4.417842864990234, + "step": 558, + "token_acc": 0.19316545292982115 + }, + { + "epoch": 0.32776311931984753, + "grad_norm": 3.4206552730028688, + "learning_rate": 9.830011723329424e-05, + "loss": 4.427712440490723, + "step": 559, + "token_acc": 0.19389070662139743 + }, + { + "epoch": 0.3283494576370566, + "grad_norm": 2.912858358910191, + "learning_rate": 9.84759671746776e-05, + "loss": 4.458911418914795, + "step": 560, + "token_acc": 0.19085674411960243 + }, + { + "epoch": 0.3289357959542656, + "grad_norm": 2.9886374417871826, + "learning_rate": 9.865181711606096e-05, + "loss": 4.424046993255615, + "step": 561, + "token_acc": 0.19432127840272603 + }, + { + "epoch": 0.32952213427147464, + "grad_norm": 3.031565975255019, + "learning_rate": 9.882766705744431e-05, + "loss": 4.379822731018066, + "step": 562, + "token_acc": 0.19693805093364478 + }, + { + "epoch": 0.33010847258868364, + "grad_norm": 2.217911618496524, + "learning_rate": 9.900351699882766e-05, + "loss": 4.356549263000488, + "step": 563, + "token_acc": 0.20240716012847812 + }, + { + "epoch": 0.3306948109058927, + "grad_norm": 3.542208484689449, + "learning_rate": 9.917936694021102e-05, + "loss": 4.406285285949707, + "step": 564, + "token_acc": 0.19587714060267428 + }, + { + "epoch": 0.33128114922310176, + "grad_norm": 2.8100336992529793, + "learning_rate": 9.935521688159436e-05, + "loss": 4.383203029632568, + "step": 565, + "token_acc": 0.1975497224701587 + }, + { + "epoch": 0.33186748754031076, + "grad_norm": 3.047854864923898, + "learning_rate": 9.953106682297771e-05, + "loss": 4.409743309020996, + "step": 566, + "token_acc": 0.19337355691375438 + }, + { + "epoch": 0.3324538258575198, + "grad_norm": 2.57546646636222, + "learning_rate": 9.970691676436106e-05, + "loss": 4.444423675537109, + "step": 567, + "token_acc": 0.19188884618979862 + }, + { + "epoch": 0.3330401641747288, + "grad_norm": 2.8645692339766526, + "learning_rate": 9.988276670574441e-05, + "loss": 4.416524887084961, + "step": 568, + "token_acc": 0.19409797988740699 + }, + { + "epoch": 0.33362650249193787, + "grad_norm": 2.4472261273565428, + "learning_rate": 0.00010005861664712778, + "loss": 4.395447254180908, + "step": 569, + "token_acc": 0.1946796231039515 + }, + { + "epoch": 0.33421284080914687, + "grad_norm": 2.631956417804853, + "learning_rate": 0.00010023446658851113, + "loss": 4.44894552230835, + "step": 570, + "token_acc": 0.18897370470265215 + }, + { + "epoch": 0.33479917912635593, + "grad_norm": 2.4865008225004246, + "learning_rate": 0.00010041031652989448, + "loss": 4.335729598999023, + "step": 571, + "token_acc": 0.20097561750288503 + }, + { + "epoch": 0.33538551744356493, + "grad_norm": 3.043697322532231, + "learning_rate": 0.00010058616647127785, + "loss": 4.373074054718018, + "step": 572, + "token_acc": 0.1970346345418742 + }, + { + "epoch": 0.335971855760774, + "grad_norm": 2.4409304224176256, + "learning_rate": 0.0001007620164126612, + "loss": 4.353499412536621, + "step": 573, + "token_acc": 0.19993152211994777 + }, + { + "epoch": 0.336558194077983, + "grad_norm": 3.301446026270895, + "learning_rate": 0.00010093786635404453, + "loss": 4.405664443969727, + "step": 574, + "token_acc": 0.19363844537156122 + }, + { + "epoch": 0.33714453239519204, + "grad_norm": 2.864402730813161, + "learning_rate": 0.00010111371629542788, + "loss": 4.3503851890563965, + "step": 575, + "token_acc": 0.20062480377580755 + }, + { + "epoch": 0.33773087071240104, + "grad_norm": 2.858342867160818, + "learning_rate": 0.00010128956623681125, + "loss": 4.364161968231201, + "step": 576, + "token_acc": 0.1978775154897844 + }, + { + "epoch": 0.3383172090296101, + "grad_norm": 3.1758831041141122, + "learning_rate": 0.0001014654161781946, + "loss": 4.376925468444824, + "step": 577, + "token_acc": 0.19712086473176674 + }, + { + "epoch": 0.3389035473468191, + "grad_norm": 3.220219510727813, + "learning_rate": 0.00010164126611957795, + "loss": 4.422541618347168, + "step": 578, + "token_acc": 0.19094777177834732 + }, + { + "epoch": 0.33948988566402816, + "grad_norm": 2.2530355822059134, + "learning_rate": 0.0001018171160609613, + "loss": 4.348875045776367, + "step": 579, + "token_acc": 0.19977448538088372 + }, + { + "epoch": 0.34007622398123716, + "grad_norm": 3.634942922257256, + "learning_rate": 0.00010199296600234467, + "loss": 4.317217826843262, + "step": 580, + "token_acc": 0.201233524626348 + }, + { + "epoch": 0.3406625622984462, + "grad_norm": 2.3007134675353265, + "learning_rate": 0.00010216881594372802, + "loss": 4.3593339920043945, + "step": 581, + "token_acc": 0.19616560671854927 + }, + { + "epoch": 0.3412489006156552, + "grad_norm": 3.2173056706541288, + "learning_rate": 0.00010234466588511135, + "loss": 4.379700660705566, + "step": 582, + "token_acc": 0.19660178935802858 + }, + { + "epoch": 0.34183523893286427, + "grad_norm": 2.549620696797565, + "learning_rate": 0.0001025205158264947, + "loss": 4.396203994750977, + "step": 583, + "token_acc": 0.1929887877702395 + }, + { + "epoch": 0.3424215772500733, + "grad_norm": 2.7627648261715705, + "learning_rate": 0.00010269636576787807, + "loss": 4.392889499664307, + "step": 584, + "token_acc": 0.1943292951280639 + }, + { + "epoch": 0.34300791556728233, + "grad_norm": 2.4908181813130406, + "learning_rate": 0.00010287221570926142, + "loss": 4.396252632141113, + "step": 585, + "token_acc": 0.1933978866250837 + }, + { + "epoch": 0.34359425388449133, + "grad_norm": 3.4710452873198543, + "learning_rate": 0.00010304806565064477, + "loss": 4.371337413787842, + "step": 586, + "token_acc": 0.19775096212354693 + }, + { + "epoch": 0.3441805922017004, + "grad_norm": 2.1366264885882886, + "learning_rate": 0.00010322391559202812, + "loss": 4.3373212814331055, + "step": 587, + "token_acc": 0.1990349207027401 + }, + { + "epoch": 0.3447669305189094, + "grad_norm": 2.793102114015419, + "learning_rate": 0.00010339976553341149, + "loss": 4.3192620277404785, + "step": 588, + "token_acc": 0.20043531213122015 + }, + { + "epoch": 0.34535326883611844, + "grad_norm": 3.2110398057106577, + "learning_rate": 0.00010357561547479484, + "loss": 4.3520426750183105, + "step": 589, + "token_acc": 0.19888331982015184 + }, + { + "epoch": 0.34593960715332744, + "grad_norm": 2.818657134738977, + "learning_rate": 0.00010375146541617819, + "loss": 4.319580078125, + "step": 590, + "token_acc": 0.20022879471458502 + }, + { + "epoch": 0.3465259454705365, + "grad_norm": 3.1142050834669304, + "learning_rate": 0.00010392731535756153, + "loss": 4.402172088623047, + "step": 591, + "token_acc": 0.19109598588234056 + }, + { + "epoch": 0.34711228378774556, + "grad_norm": 2.481377475388217, + "learning_rate": 0.00010410316529894489, + "loss": 4.254935264587402, + "step": 592, + "token_acc": 0.2041862762942354 + }, + { + "epoch": 0.34769862210495456, + "grad_norm": 2.7899472797326803, + "learning_rate": 0.00010427901524032824, + "loss": 4.301620960235596, + "step": 593, + "token_acc": 0.201591428676059 + }, + { + "epoch": 0.3482849604221636, + "grad_norm": 2.306420877176881, + "learning_rate": 0.00010445486518171159, + "loss": 4.326542854309082, + "step": 594, + "token_acc": 0.1977195497922805 + }, + { + "epoch": 0.3488712987393726, + "grad_norm": 3.790533075561843, + "learning_rate": 0.00010463071512309494, + "loss": 4.306258201599121, + "step": 595, + "token_acc": 0.19972011730456374 + }, + { + "epoch": 0.34945763705658167, + "grad_norm": 2.131097301711391, + "learning_rate": 0.00010480656506447831, + "loss": 4.290352821350098, + "step": 596, + "token_acc": 0.2039510487647645 + }, + { + "epoch": 0.3500439753737907, + "grad_norm": 3.4011241264489187, + "learning_rate": 0.00010498241500586166, + "loss": 4.348827362060547, + "step": 597, + "token_acc": 0.19694226298252568 + }, + { + "epoch": 0.35063031369099973, + "grad_norm": 2.3812567144661907, + "learning_rate": 0.00010515826494724501, + "loss": 4.334071159362793, + "step": 598, + "token_acc": 0.19651532832614105 + }, + { + "epoch": 0.35121665200820873, + "grad_norm": 3.361051181363213, + "learning_rate": 0.00010533411488862837, + "loss": 4.35249137878418, + "step": 599, + "token_acc": 0.19578733204581367 + }, + { + "epoch": 0.3518029903254178, + "grad_norm": 2.0689154004556807, + "learning_rate": 0.00010550996483001171, + "loss": 4.308032035827637, + "step": 600, + "token_acc": 0.2002745968991027 + }, + { + "epoch": 0.3523893286426268, + "grad_norm": 3.3945659225723444, + "learning_rate": 0.00010568581477139506, + "loss": 4.315279960632324, + "step": 601, + "token_acc": 0.19940086223738882 + }, + { + "epoch": 0.35297566695983584, + "grad_norm": 2.394583028279431, + "learning_rate": 0.00010586166471277841, + "loss": 4.375241756439209, + "step": 602, + "token_acc": 0.1922472512387402 + }, + { + "epoch": 0.35356200527704484, + "grad_norm": 2.5966869299785698, + "learning_rate": 0.00010603751465416176, + "loss": 4.277009963989258, + "step": 603, + "token_acc": 0.20603324495261147 + }, + { + "epoch": 0.3541483435942539, + "grad_norm": 2.7232463688132698, + "learning_rate": 0.00010621336459554513, + "loss": 4.286871910095215, + "step": 604, + "token_acc": 0.2023142725869977 + }, + { + "epoch": 0.3547346819114629, + "grad_norm": 2.8417721138990166, + "learning_rate": 0.00010638921453692848, + "loss": 4.277947425842285, + "step": 605, + "token_acc": 0.2047835391203939 + }, + { + "epoch": 0.35532102022867196, + "grad_norm": 2.682389888868531, + "learning_rate": 0.00010656506447831183, + "loss": 4.279004096984863, + "step": 606, + "token_acc": 0.204964619805314 + }, + { + "epoch": 0.35590735854588096, + "grad_norm": 2.5595296108724948, + "learning_rate": 0.0001067409144196952, + "loss": 4.303957939147949, + "step": 607, + "token_acc": 0.20128981673075075 + }, + { + "epoch": 0.35649369686309, + "grad_norm": 2.837959032860929, + "learning_rate": 0.00010691676436107855, + "loss": 4.348471641540527, + "step": 608, + "token_acc": 0.1935732804072 + }, + { + "epoch": 0.357080035180299, + "grad_norm": 2.2017623995570577, + "learning_rate": 0.00010709261430246188, + "loss": 4.283245086669922, + "step": 609, + "token_acc": 0.20343393742860674 + }, + { + "epoch": 0.35766637349750807, + "grad_norm": 2.4879621393285696, + "learning_rate": 0.00010726846424384523, + "loss": 4.3325886726379395, + "step": 610, + "token_acc": 0.19625486754644958 + }, + { + "epoch": 0.3582527118147171, + "grad_norm": 3.0133670311804273, + "learning_rate": 0.0001074443141852286, + "loss": 4.32450008392334, + "step": 611, + "token_acc": 0.19944296375266524 + }, + { + "epoch": 0.35883905013192613, + "grad_norm": 2.983908195578399, + "learning_rate": 0.00010762016412661195, + "loss": 4.303168296813965, + "step": 612, + "token_acc": 0.20124181467953536 + }, + { + "epoch": 0.35942538844913513, + "grad_norm": 2.4041157929692636, + "learning_rate": 0.0001077960140679953, + "loss": 4.294432640075684, + "step": 613, + "token_acc": 0.20261561643765524 + }, + { + "epoch": 0.3600117267663442, + "grad_norm": 3.153780987731585, + "learning_rate": 0.00010797186400937865, + "loss": 4.37457275390625, + "step": 614, + "token_acc": 0.19388188385159102 + }, + { + "epoch": 0.3605980650835532, + "grad_norm": 1.950667897766203, + "learning_rate": 0.00010814771395076202, + "loss": 4.259453773498535, + "step": 615, + "token_acc": 0.20556281620577127 + }, + { + "epoch": 0.36118440340076224, + "grad_norm": 3.157750730971191, + "learning_rate": 0.00010832356389214537, + "loss": 4.238499641418457, + "step": 616, + "token_acc": 0.20794253185145026 + }, + { + "epoch": 0.36177074171797124, + "grad_norm": 1.8891889691306176, + "learning_rate": 0.00010849941383352872, + "loss": 4.216545581817627, + "step": 617, + "token_acc": 0.20939565325467746 + }, + { + "epoch": 0.3623570800351803, + "grad_norm": 2.8123821881610946, + "learning_rate": 0.00010867526377491205, + "loss": 4.283915996551514, + "step": 618, + "token_acc": 0.20297554659149647 + }, + { + "epoch": 0.3629434183523893, + "grad_norm": 2.314371069394081, + "learning_rate": 0.00010885111371629542, + "loss": 4.32310676574707, + "step": 619, + "token_acc": 0.19798544100220078 + }, + { + "epoch": 0.36352975666959836, + "grad_norm": 2.444273779518254, + "learning_rate": 0.00010902696365767877, + "loss": 4.273144721984863, + "step": 620, + "token_acc": 0.19962574558496746 + }, + { + "epoch": 0.3641160949868074, + "grad_norm": 2.5626319954303383, + "learning_rate": 0.00010920281359906212, + "loss": 4.27290678024292, + "step": 621, + "token_acc": 0.20420688587209002 + }, + { + "epoch": 0.3647024333040164, + "grad_norm": 2.3094596676423484, + "learning_rate": 0.00010937866354044547, + "loss": 4.219508171081543, + "step": 622, + "token_acc": 0.20828157133759956 + }, + { + "epoch": 0.36528877162122547, + "grad_norm": 2.8523401518170703, + "learning_rate": 0.00010955451348182884, + "loss": 4.30706787109375, + "step": 623, + "token_acc": 0.2004713204445251 + }, + { + "epoch": 0.3658751099384345, + "grad_norm": 2.8177216268057608, + "learning_rate": 0.00010973036342321219, + "loss": 4.278560638427734, + "step": 624, + "token_acc": 0.20058306429364967 + }, + { + "epoch": 0.36646144825564353, + "grad_norm": 2.6895108643874464, + "learning_rate": 0.00010990621336459554, + "loss": 4.2217254638671875, + "step": 625, + "token_acc": 0.20855649584958172 + }, + { + "epoch": 0.36704778657285253, + "grad_norm": 2.7574646136296157, + "learning_rate": 0.0001100820633059789, + "loss": 4.265744209289551, + "step": 626, + "token_acc": 0.20324141228766132 + }, + { + "epoch": 0.3676341248900616, + "grad_norm": 2.7351237617082598, + "learning_rate": 0.00011025791324736224, + "loss": 4.2529072761535645, + "step": 627, + "token_acc": 0.20443375057449512 + }, + { + "epoch": 0.3682204632072706, + "grad_norm": 2.4151446067812237, + "learning_rate": 0.00011043376318874559, + "loss": 4.239971160888672, + "step": 628, + "token_acc": 0.20523713465038262 + }, + { + "epoch": 0.36880680152447964, + "grad_norm": 2.081580812922123, + "learning_rate": 0.00011060961313012894, + "loss": 4.287620544433594, + "step": 629, + "token_acc": 0.20006133661848102 + }, + { + "epoch": 0.36939313984168864, + "grad_norm": 2.8298152822135116, + "learning_rate": 0.00011078546307151229, + "loss": 4.236255168914795, + "step": 630, + "token_acc": 0.20463056475457536 + }, + { + "epoch": 0.3699794781588977, + "grad_norm": 2.5510842902255084, + "learning_rate": 0.00011096131301289566, + "loss": 4.258484363555908, + "step": 631, + "token_acc": 0.20242658123646584 + }, + { + "epoch": 0.3705658164761067, + "grad_norm": 2.6950367431077433, + "learning_rate": 0.00011113716295427901, + "loss": 4.230443954467773, + "step": 632, + "token_acc": 0.20410526425107228 + }, + { + "epoch": 0.37115215479331576, + "grad_norm": 2.1898464073824186, + "learning_rate": 0.00011131301289566236, + "loss": 4.242029190063477, + "step": 633, + "token_acc": 0.20153682109107485 + }, + { + "epoch": 0.37173849311052476, + "grad_norm": 2.53603820659676, + "learning_rate": 0.00011148886283704572, + "loss": 4.224470615386963, + "step": 634, + "token_acc": 0.206255272208163 + }, + { + "epoch": 0.3723248314277338, + "grad_norm": 2.3673784961554274, + "learning_rate": 0.00011166471277842907, + "loss": 4.228389739990234, + "step": 635, + "token_acc": 0.20586544294116096 + }, + { + "epoch": 0.3729111697449428, + "grad_norm": 2.3218680839207737, + "learning_rate": 0.00011184056271981241, + "loss": 4.2403340339660645, + "step": 636, + "token_acc": 0.20616360392140776 + }, + { + "epoch": 0.3734975080621519, + "grad_norm": 2.637923432956788, + "learning_rate": 0.00011201641266119576, + "loss": 4.205411911010742, + "step": 637, + "token_acc": 0.20736103108180387 + }, + { + "epoch": 0.3740838463793609, + "grad_norm": 2.579854481699513, + "learning_rate": 0.00011219226260257913, + "loss": 4.257320404052734, + "step": 638, + "token_acc": 0.20180283509459596 + }, + { + "epoch": 0.37467018469656993, + "grad_norm": 2.2090510651164434, + "learning_rate": 0.00011236811254396248, + "loss": 4.209136009216309, + "step": 639, + "token_acc": 0.20608312144419577 + }, + { + "epoch": 0.37525652301377893, + "grad_norm": 3.867673247689839, + "learning_rate": 0.00011254396248534583, + "loss": 4.302282333374023, + "step": 640, + "token_acc": 0.196326914892245 + }, + { + "epoch": 0.375842861330988, + "grad_norm": 2.1654413354163498, + "learning_rate": 0.00011271981242672918, + "loss": 4.222521781921387, + "step": 641, + "token_acc": 0.20369304518540127 + }, + { + "epoch": 0.376429199648197, + "grad_norm": 3.087594103609828, + "learning_rate": 0.00011289566236811254, + "loss": 4.248073101043701, + "step": 642, + "token_acc": 0.2020766406022564 + }, + { + "epoch": 0.37701553796540604, + "grad_norm": 2.1813258304419834, + "learning_rate": 0.0001130715123094959, + "loss": 4.154412746429443, + "step": 643, + "token_acc": 0.21357638275083968 + }, + { + "epoch": 0.37760187628261505, + "grad_norm": 3.3076625794336065, + "learning_rate": 0.00011324736225087924, + "loss": 4.20400333404541, + "step": 644, + "token_acc": 0.20735396905902806 + }, + { + "epoch": 0.3781882145998241, + "grad_norm": 2.644500244743364, + "learning_rate": 0.00011342321219226258, + "loss": 4.2519965171813965, + "step": 645, + "token_acc": 0.20269862100570762 + }, + { + "epoch": 0.3787745529170331, + "grad_norm": 2.6667352505531925, + "learning_rate": 0.00011359906213364595, + "loss": 4.311054229736328, + "step": 646, + "token_acc": 0.19621835768676682 + }, + { + "epoch": 0.37936089123424216, + "grad_norm": 2.317800667982946, + "learning_rate": 0.0001137749120750293, + "loss": 4.217830657958984, + "step": 647, + "token_acc": 0.20569010622603304 + }, + { + "epoch": 0.37994722955145116, + "grad_norm": 2.3337578059583532, + "learning_rate": 0.00011395076201641265, + "loss": 4.20598030090332, + "step": 648, + "token_acc": 0.20874728009092175 + }, + { + "epoch": 0.3805335678686602, + "grad_norm": 2.7678048866848206, + "learning_rate": 0.000114126611957796, + "loss": 4.308103561401367, + "step": 649, + "token_acc": 0.19802635305263916 + }, + { + "epoch": 0.3811199061858693, + "grad_norm": 2.3082928696944434, + "learning_rate": 0.00011430246189917936, + "loss": 4.234099864959717, + "step": 650, + "token_acc": 0.201914854157866 + }, + { + "epoch": 0.3817062445030783, + "grad_norm": 2.3968763914936235, + "learning_rate": 0.00011447831184056271, + "loss": 4.15907096862793, + "step": 651, + "token_acc": 0.21309123583573925 + }, + { + "epoch": 0.38229258282028733, + "grad_norm": 2.098436379878401, + "learning_rate": 0.00011465416178194607, + "loss": 4.287763595581055, + "step": 652, + "token_acc": 0.19773169624454057 + }, + { + "epoch": 0.38287892113749633, + "grad_norm": 2.993266576352787, + "learning_rate": 0.00011483001172332943, + "loss": 4.265036582946777, + "step": 653, + "token_acc": 0.20070331774496447 + }, + { + "epoch": 0.3834652594547054, + "grad_norm": 1.993385308869202, + "learning_rate": 0.00011500586166471277, + "loss": 4.185324668884277, + "step": 654, + "token_acc": 0.20959614325641318 + }, + { + "epoch": 0.3840515977719144, + "grad_norm": 2.805813696939635, + "learning_rate": 0.00011518171160609612, + "loss": 4.205728530883789, + "step": 655, + "token_acc": 0.2054641234455115 + }, + { + "epoch": 0.38463793608912344, + "grad_norm": 1.8986969693751168, + "learning_rate": 0.00011535756154747947, + "loss": 4.19041109085083, + "step": 656, + "token_acc": 0.2067963687612689 + }, + { + "epoch": 0.38522427440633245, + "grad_norm": 3.0643280073221306, + "learning_rate": 0.00011553341148886282, + "loss": 4.218249797821045, + "step": 657, + "token_acc": 0.20460666375971479 + }, + { + "epoch": 0.3858106127235415, + "grad_norm": 2.089830978419018, + "learning_rate": 0.00011570926143024618, + "loss": 4.217906951904297, + "step": 658, + "token_acc": 0.20533666437482614 + }, + { + "epoch": 0.3863969510407505, + "grad_norm": 2.36016733149989, + "learning_rate": 0.00011588511137162954, + "loss": 4.215489864349365, + "step": 659, + "token_acc": 0.20456186439097432 + }, + { + "epoch": 0.38698328935795956, + "grad_norm": 1.7515316899120266, + "learning_rate": 0.00011606096131301289, + "loss": 4.174760818481445, + "step": 660, + "token_acc": 0.20799731780232508 + }, + { + "epoch": 0.38756962767516856, + "grad_norm": 2.4015701846986444, + "learning_rate": 0.00011623681125439625, + "loss": 4.227086067199707, + "step": 661, + "token_acc": 0.20463953133450785 + }, + { + "epoch": 0.3881559659923776, + "grad_norm": 2.214148268420861, + "learning_rate": 0.00011641266119577959, + "loss": 4.22163200378418, + "step": 662, + "token_acc": 0.2040581809843447 + }, + { + "epoch": 0.3887423043095866, + "grad_norm": 2.5996513782251367, + "learning_rate": 0.00011658851113716294, + "loss": 4.125226974487305, + "step": 663, + "token_acc": 0.21308594008114287 + }, + { + "epoch": 0.3893286426267957, + "grad_norm": 2.4595480251748545, + "learning_rate": 0.00011676436107854629, + "loss": 4.155521869659424, + "step": 664, + "token_acc": 0.2104404733733833 + }, + { + "epoch": 0.3899149809440047, + "grad_norm": 2.9530630107298155, + "learning_rate": 0.00011694021101992964, + "loss": 4.183135032653809, + "step": 665, + "token_acc": 0.20813966313202137 + }, + { + "epoch": 0.39050131926121373, + "grad_norm": 2.155911546664777, + "learning_rate": 0.000117116060961313, + "loss": 4.197364330291748, + "step": 666, + "token_acc": 0.20681188405722106 + }, + { + "epoch": 0.39108765757842273, + "grad_norm": 2.5509361326367093, + "learning_rate": 0.00011729191090269636, + "loss": 4.22589111328125, + "step": 667, + "token_acc": 0.20329033424436102 + }, + { + "epoch": 0.3916739958956318, + "grad_norm": 2.2358115413515742, + "learning_rate": 0.00011746776084407971, + "loss": 4.122133255004883, + "step": 668, + "token_acc": 0.21357097298148756 + }, + { + "epoch": 0.3922603342128408, + "grad_norm": 2.8407879200869472, + "learning_rate": 0.00011764361078546307, + "loss": 4.207402229309082, + "step": 669, + "token_acc": 0.20472913509328908 + }, + { + "epoch": 0.39284667253004985, + "grad_norm": 1.8183668097342147, + "learning_rate": 0.00011781946072684642, + "loss": 4.237727165222168, + "step": 670, + "token_acc": 0.20184391156729947 + }, + { + "epoch": 0.39343301084725885, + "grad_norm": 3.066399712432076, + "learning_rate": 0.00011799531066822976, + "loss": 4.168529510498047, + "step": 671, + "token_acc": 0.20791175331945214 + }, + { + "epoch": 0.3940193491644679, + "grad_norm": 1.8797947561138206, + "learning_rate": 0.00011817116060961311, + "loss": 4.148011684417725, + "step": 672, + "token_acc": 0.21082481445175746 + }, + { + "epoch": 0.3946056874816769, + "grad_norm": 3.0675257288605158, + "learning_rate": 0.00011834701055099647, + "loss": 4.161212921142578, + "step": 673, + "token_acc": 0.21053175423257292 + }, + { + "epoch": 0.39519202579888596, + "grad_norm": 1.9745773494119627, + "learning_rate": 0.00011852286049237983, + "loss": 4.147210121154785, + "step": 674, + "token_acc": 0.21428515538399343 + }, + { + "epoch": 0.39577836411609496, + "grad_norm": 3.347622964535245, + "learning_rate": 0.00011869871043376318, + "loss": 4.205878257751465, + "step": 675, + "token_acc": 0.20560969431965126 + }, + { + "epoch": 0.396364702433304, + "grad_norm": 1.8890336780637331, + "learning_rate": 0.00011887456037514653, + "loss": 4.118740081787109, + "step": 676, + "token_acc": 0.21519614131410833 + }, + { + "epoch": 0.3969510407505131, + "grad_norm": 2.896736132578702, + "learning_rate": 0.00011905041031652989, + "loss": 4.152634620666504, + "step": 677, + "token_acc": 0.21091203352553353 + }, + { + "epoch": 0.3975373790677221, + "grad_norm": 2.4418785524177085, + "learning_rate": 0.00011922626025791324, + "loss": 4.120591640472412, + "step": 678, + "token_acc": 0.21206497987500625 + }, + { + "epoch": 0.39812371738493113, + "grad_norm": 2.6209867136785245, + "learning_rate": 0.0001194021101992966, + "loss": 4.203255653381348, + "step": 679, + "token_acc": 0.20534793897665768 + }, + { + "epoch": 0.39871005570214013, + "grad_norm": 2.745224729395226, + "learning_rate": 0.00011957796014067993, + "loss": 4.108470439910889, + "step": 680, + "token_acc": 0.21366317434515705 + }, + { + "epoch": 0.3992963940193492, + "grad_norm": 2.01585643774109, + "learning_rate": 0.0001197538100820633, + "loss": 4.250920295715332, + "step": 681, + "token_acc": 0.19864673880981756 + }, + { + "epoch": 0.3998827323365582, + "grad_norm": 2.302510751856157, + "learning_rate": 0.00011992966002344665, + "loss": 4.157665252685547, + "step": 682, + "token_acc": 0.206980312926534 + }, + { + "epoch": 0.40046907065376725, + "grad_norm": 2.4412374279570797, + "learning_rate": 0.00012010550996483, + "loss": 4.190195083618164, + "step": 683, + "token_acc": 0.20590817684878351 + }, + { + "epoch": 0.40105540897097625, + "grad_norm": 2.3821393502986323, + "learning_rate": 0.00012028135990621335, + "loss": 4.153336524963379, + "step": 684, + "token_acc": 0.2114520132006094 + }, + { + "epoch": 0.4016417472881853, + "grad_norm": 1.7445297299224478, + "learning_rate": 0.00012045720984759671, + "loss": 4.111824989318848, + "step": 685, + "token_acc": 0.21490644946029513 + }, + { + "epoch": 0.4022280856053943, + "grad_norm": 2.8060739284301577, + "learning_rate": 0.00012063305978898006, + "loss": 4.160216331481934, + "step": 686, + "token_acc": 0.20775726209173198 + }, + { + "epoch": 0.40281442392260336, + "grad_norm": 2.100722450230539, + "learning_rate": 0.00012080890973036341, + "loss": 4.152647018432617, + "step": 687, + "token_acc": 0.2080494564739898 + }, + { + "epoch": 0.40340076223981236, + "grad_norm": 2.3389537338761506, + "learning_rate": 0.00012098475967174678, + "loss": 4.160815715789795, + "step": 688, + "token_acc": 0.2103728859607417 + }, + { + "epoch": 0.4039871005570214, + "grad_norm": 1.9789338697041972, + "learning_rate": 0.00012116060961313012, + "loss": 4.152812480926514, + "step": 689, + "token_acc": 0.21057015699745346 + }, + { + "epoch": 0.4045734388742304, + "grad_norm": 2.6818345095100184, + "learning_rate": 0.00012133645955451347, + "loss": 4.12637996673584, + "step": 690, + "token_acc": 0.21129294216812977 + }, + { + "epoch": 0.4051597771914395, + "grad_norm": 1.6244029783563276, + "learning_rate": 0.00012151230949589682, + "loss": 4.156166076660156, + "step": 691, + "token_acc": 0.20764431291188676 + }, + { + "epoch": 0.4057461155086485, + "grad_norm": 3.167276504124015, + "learning_rate": 0.00012168815943728017, + "loss": 4.206124305725098, + "step": 692, + "token_acc": 0.20199026921829388 + }, + { + "epoch": 0.40633245382585753, + "grad_norm": 1.8681993380613036, + "learning_rate": 0.00012186400937866353, + "loss": 4.181087017059326, + "step": 693, + "token_acc": 0.2067197459318676 + }, + { + "epoch": 0.40691879214306653, + "grad_norm": 2.9568119691184727, + "learning_rate": 0.00012203985932004688, + "loss": 4.21808385848999, + "step": 694, + "token_acc": 0.20395758325103489 + }, + { + "epoch": 0.4075051304602756, + "grad_norm": 2.0223093452670757, + "learning_rate": 0.00012221570926143025, + "loss": 4.093393802642822, + "step": 695, + "token_acc": 0.21467751036989294 + }, + { + "epoch": 0.4080914687774846, + "grad_norm": 2.5376842035994196, + "learning_rate": 0.0001223915592028136, + "loss": 4.146512508392334, + "step": 696, + "token_acc": 0.2089104967012327 + }, + { + "epoch": 0.40867780709469365, + "grad_norm": 2.400952595156885, + "learning_rate": 0.00012256740914419695, + "loss": 4.119112014770508, + "step": 697, + "token_acc": 0.2138407156976025 + }, + { + "epoch": 0.40926414541190265, + "grad_norm": 2.051615767217494, + "learning_rate": 0.00012274325908558027, + "loss": 4.1630353927612305, + "step": 698, + "token_acc": 0.20607875209548546 + }, + { + "epoch": 0.4098504837291117, + "grad_norm": 2.4070556767455265, + "learning_rate": 0.00012291910902696365, + "loss": 4.065162658691406, + "step": 699, + "token_acc": 0.2178281688708309 + }, + { + "epoch": 0.4104368220463207, + "grad_norm": 1.7767561279855022, + "learning_rate": 0.000123094958968347, + "loss": 4.160017967224121, + "step": 700, + "token_acc": 0.20539266823583685 + }, + { + "epoch": 0.41102316036352976, + "grad_norm": 2.1998659108616163, + "learning_rate": 0.00012327080890973035, + "loss": 4.164645671844482, + "step": 701, + "token_acc": 0.20756730548799607 + }, + { + "epoch": 0.41160949868073876, + "grad_norm": 2.475840659601759, + "learning_rate": 0.0001234466588511137, + "loss": 4.168675422668457, + "step": 702, + "token_acc": 0.20841045210664227 + }, + { + "epoch": 0.4121958369979478, + "grad_norm": 2.0596512499590083, + "learning_rate": 0.00012362250879249706, + "loss": 4.1293182373046875, + "step": 703, + "token_acc": 0.21049262945697367 + }, + { + "epoch": 0.4127821753151568, + "grad_norm": 2.0376017958400836, + "learning_rate": 0.0001237983587338804, + "loss": 4.152271270751953, + "step": 704, + "token_acc": 0.20728880788457588 + }, + { + "epoch": 0.4133685136323659, + "grad_norm": 2.10338543770458, + "learning_rate": 0.00012397420867526376, + "loss": 4.191035747528076, + "step": 705, + "token_acc": 0.20290883662190864 + }, + { + "epoch": 0.41395485194957493, + "grad_norm": 2.4527919996484133, + "learning_rate": 0.00012415005861664714, + "loss": 4.105487823486328, + "step": 706, + "token_acc": 0.21175861728189577 + }, + { + "epoch": 0.41454119026678393, + "grad_norm": 2.076112620592698, + "learning_rate": 0.00012432590855803046, + "loss": 4.096744060516357, + "step": 707, + "token_acc": 0.21373644682541718 + }, + { + "epoch": 0.415127528583993, + "grad_norm": 2.4501405078458998, + "learning_rate": 0.0001245017584994138, + "loss": 4.080202102661133, + "step": 708, + "token_acc": 0.2151697631678618 + }, + { + "epoch": 0.415713866901202, + "grad_norm": 2.7005766434794767, + "learning_rate": 0.00012467760844079716, + "loss": 4.129821300506592, + "step": 709, + "token_acc": 0.21251860720988933 + }, + { + "epoch": 0.41630020521841105, + "grad_norm": 2.134171405553577, + "learning_rate": 0.00012485345838218054, + "loss": 4.124048233032227, + "step": 710, + "token_acc": 0.21154408364384392 + }, + { + "epoch": 0.41688654353562005, + "grad_norm": 2.4148764514565535, + "learning_rate": 0.0001250293083235639, + "loss": 4.112054824829102, + "step": 711, + "token_acc": 0.21169364507636662 + }, + { + "epoch": 0.4174728818528291, + "grad_norm": 2.0495547010381676, + "learning_rate": 0.00012520515826494724, + "loss": 4.157209396362305, + "step": 712, + "token_acc": 0.20692587091511516 + }, + { + "epoch": 0.4180592201700381, + "grad_norm": 2.3772134147936392, + "learning_rate": 0.0001253810082063306, + "loss": 4.160642623901367, + "step": 713, + "token_acc": 0.20857684472166943 + }, + { + "epoch": 0.41864555848724716, + "grad_norm": 2.3651051237165013, + "learning_rate": 0.00012555685814771394, + "loss": 4.122062683105469, + "step": 714, + "token_acc": 0.20909662902728834 + }, + { + "epoch": 0.41923189680445616, + "grad_norm": 2.122922504207128, + "learning_rate": 0.0001257327080890973, + "loss": 4.099079132080078, + "step": 715, + "token_acc": 0.21240977833447983 + }, + { + "epoch": 0.4198182351216652, + "grad_norm": 2.437035772579928, + "learning_rate": 0.00012590855803048064, + "loss": 4.065517425537109, + "step": 716, + "token_acc": 0.21513740691195213 + }, + { + "epoch": 0.4204045734388742, + "grad_norm": 1.8086811220859031, + "learning_rate": 0.000126084407971864, + "loss": 4.120763778686523, + "step": 717, + "token_acc": 0.21111116827261925 + }, + { + "epoch": 0.4209909117560833, + "grad_norm": 3.16220286536545, + "learning_rate": 0.00012626025791324735, + "loss": 4.158533573150635, + "step": 718, + "token_acc": 0.20581005527464008 + }, + { + "epoch": 0.4215772500732923, + "grad_norm": 1.679460962803792, + "learning_rate": 0.0001264361078546307, + "loss": 4.08233642578125, + "step": 719, + "token_acc": 0.2153272678999072 + }, + { + "epoch": 0.42216358839050133, + "grad_norm": 2.804379712588806, + "learning_rate": 0.00012661195779601405, + "loss": 4.186943054199219, + "step": 720, + "token_acc": 0.2050246496088657 + }, + { + "epoch": 0.42274992670771033, + "grad_norm": 2.391357394029273, + "learning_rate": 0.0001267878077373974, + "loss": 4.042555809020996, + "step": 721, + "token_acc": 0.21814060011994127 + }, + { + "epoch": 0.4233362650249194, + "grad_norm": 2.126805986126612, + "learning_rate": 0.00012696365767878078, + "loss": 4.137605667114258, + "step": 722, + "token_acc": 0.20761405227816362 + }, + { + "epoch": 0.4239226033421284, + "grad_norm": 2.316979942646839, + "learning_rate": 0.00012713950762016413, + "loss": 4.13259220123291, + "step": 723, + "token_acc": 0.21073668832495607 + }, + { + "epoch": 0.42450894165933745, + "grad_norm": 1.8990744805574709, + "learning_rate": 0.00012731535756154748, + "loss": 4.040066719055176, + "step": 724, + "token_acc": 0.21839272368776733 + }, + { + "epoch": 0.42509527997654645, + "grad_norm": 2.564686818764265, + "learning_rate": 0.0001274912075029308, + "loss": 4.155131816864014, + "step": 725, + "token_acc": 0.20508214090890017 + }, + { + "epoch": 0.4256816182937555, + "grad_norm": 2.3381168743589287, + "learning_rate": 0.00012766705744431418, + "loss": 4.170192718505859, + "step": 726, + "token_acc": 0.2039467182354088 + }, + { + "epoch": 0.4262679566109645, + "grad_norm": 2.5315902542143087, + "learning_rate": 0.00012784290738569753, + "loss": 4.092622756958008, + "step": 727, + "token_acc": 0.21180726637490152 + }, + { + "epoch": 0.42685429492817356, + "grad_norm": 2.1824803687804173, + "learning_rate": 0.00012801875732708088, + "loss": 4.109076499938965, + "step": 728, + "token_acc": 0.21181478696422426 + }, + { + "epoch": 0.42744063324538256, + "grad_norm": 2.036872391261052, + "learning_rate": 0.00012819460726846423, + "loss": 4.081572532653809, + "step": 729, + "token_acc": 0.21358976825722592 + }, + { + "epoch": 0.4280269715625916, + "grad_norm": 1.9012882818076624, + "learning_rate": 0.00012837045720984758, + "loss": 4.100808620452881, + "step": 730, + "token_acc": 0.21383671182236574 + }, + { + "epoch": 0.4286133098798006, + "grad_norm": 2.2856031050832324, + "learning_rate": 0.00012854630715123093, + "loss": 4.118395805358887, + "step": 731, + "token_acc": 0.2106478149100257 + }, + { + "epoch": 0.4291996481970097, + "grad_norm": 2.127030448882872, + "learning_rate": 0.00012872215709261429, + "loss": 4.081466197967529, + "step": 732, + "token_acc": 0.21411614365840226 + }, + { + "epoch": 0.42978598651421873, + "grad_norm": 2.415004072002858, + "learning_rate": 0.00012889800703399766, + "loss": 4.068760871887207, + "step": 733, + "token_acc": 0.21585425691862434 + }, + { + "epoch": 0.43037232483142773, + "grad_norm": 2.7470040560462947, + "learning_rate": 0.000129073856975381, + "loss": 4.063611030578613, + "step": 734, + "token_acc": 0.21337652724079142 + }, + { + "epoch": 0.4309586631486368, + "grad_norm": 1.8332247268435393, + "learning_rate": 0.00012924970691676434, + "loss": 4.081305503845215, + "step": 735, + "token_acc": 0.21493379559620845 + }, + { + "epoch": 0.4315450014658458, + "grad_norm": 2.3766773644982737, + "learning_rate": 0.0001294255568581477, + "loss": 4.039984226226807, + "step": 736, + "token_acc": 0.21901808893073973 + }, + { + "epoch": 0.43213133978305485, + "grad_norm": 1.925927827075631, + "learning_rate": 0.00012960140679953107, + "loss": 4.093031883239746, + "step": 737, + "token_acc": 0.21356167080821198 + }, + { + "epoch": 0.43271767810026385, + "grad_norm": 2.7307736873525905, + "learning_rate": 0.00012977725674091442, + "loss": 4.111199378967285, + "step": 738, + "token_acc": 0.21000319009541285 + }, + { + "epoch": 0.4333040164174729, + "grad_norm": 2.0236355732370246, + "learning_rate": 0.00012995310668229777, + "loss": 4.092278480529785, + "step": 739, + "token_acc": 0.21215870362034595 + }, + { + "epoch": 0.4338903547346819, + "grad_norm": 2.711432149669485, + "learning_rate": 0.00013012895662368112, + "loss": 4.053503513336182, + "step": 740, + "token_acc": 0.21752965989981546 + }, + { + "epoch": 0.43447669305189096, + "grad_norm": 1.9142003930487304, + "learning_rate": 0.00013030480656506447, + "loss": 4.143960952758789, + "step": 741, + "token_acc": 0.2083737747581514 + }, + { + "epoch": 0.43506303136909996, + "grad_norm": 2.4806757993224045, + "learning_rate": 0.00013048065650644782, + "loss": 4.038522243499756, + "step": 742, + "token_acc": 0.21859457056322765 + }, + { + "epoch": 0.435649369686309, + "grad_norm": 1.9143472174167695, + "learning_rate": 0.00013065650644783117, + "loss": 4.072566986083984, + "step": 743, + "token_acc": 0.21407456476829098 + }, + { + "epoch": 0.436235708003518, + "grad_norm": 1.9707957125227447, + "learning_rate": 0.00013083235638921452, + "loss": 4.032047271728516, + "step": 744, + "token_acc": 0.21724656823796615 + }, + { + "epoch": 0.4368220463207271, + "grad_norm": 2.096178781940885, + "learning_rate": 0.00013100820633059787, + "loss": 4.087066650390625, + "step": 745, + "token_acc": 0.21413080960956357 + }, + { + "epoch": 0.4374083846379361, + "grad_norm": 2.224574019360642, + "learning_rate": 0.00013118405627198123, + "loss": 4.112011432647705, + "step": 746, + "token_acc": 0.20901167998047712 + }, + { + "epoch": 0.43799472295514513, + "grad_norm": 1.8161649689655095, + "learning_rate": 0.00013135990621336458, + "loss": 4.048702239990234, + "step": 747, + "token_acc": 0.21569269355158074 + }, + { + "epoch": 0.43858106127235413, + "grad_norm": 2.298111949720826, + "learning_rate": 0.00013153575615474793, + "loss": 4.138697147369385, + "step": 748, + "token_acc": 0.20531864787507698 + }, + { + "epoch": 0.4391673995895632, + "grad_norm": 2.1554211028757497, + "learning_rate": 0.0001317116060961313, + "loss": 4.076339244842529, + "step": 749, + "token_acc": 0.2136930807484062 + }, + { + "epoch": 0.4397537379067722, + "grad_norm": 2.1022149460459323, + "learning_rate": 0.00013188745603751466, + "loss": 4.093334197998047, + "step": 750, + "token_acc": 0.2121363986452981 + }, + { + "epoch": 0.44034007622398125, + "grad_norm": 2.184088089058623, + "learning_rate": 0.00013206330597889798, + "loss": 4.112935543060303, + "step": 751, + "token_acc": 0.20786556616614218 + }, + { + "epoch": 0.44092641454119025, + "grad_norm": 2.0327734832578823, + "learning_rate": 0.00013223915592028133, + "loss": 4.0729804039001465, + "step": 752, + "token_acc": 0.21243168701159407 + }, + { + "epoch": 0.4415127528583993, + "grad_norm": 2.355493907916843, + "learning_rate": 0.0001324150058616647, + "loss": 4.067388534545898, + "step": 753, + "token_acc": 0.21615596780896967 + }, + { + "epoch": 0.4420990911756083, + "grad_norm": 2.06448310854639, + "learning_rate": 0.00013259085580304806, + "loss": 4.034196853637695, + "step": 754, + "token_acc": 0.21728687740084712 + }, + { + "epoch": 0.44268542949281736, + "grad_norm": 2.417821533403658, + "learning_rate": 0.0001327667057444314, + "loss": 4.091578960418701, + "step": 755, + "token_acc": 0.20998901081399607 + }, + { + "epoch": 0.44327176781002636, + "grad_norm": 2.1743948418696575, + "learning_rate": 0.00013294255568581476, + "loss": 4.0719451904296875, + "step": 756, + "token_acc": 0.21568946121439436 + }, + { + "epoch": 0.4438581061272354, + "grad_norm": 2.0290993848132914, + "learning_rate": 0.0001331184056271981, + "loss": 4.0861968994140625, + "step": 757, + "token_acc": 0.211518461739941 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 2.1300144802947054, + "learning_rate": 0.00013329425556858146, + "loss": 4.091272354125977, + "step": 758, + "token_acc": 0.20996469162627446 + }, + { + "epoch": 0.4450307827616535, + "grad_norm": 2.099180534718681, + "learning_rate": 0.00013347010550996481, + "loss": 4.030974864959717, + "step": 759, + "token_acc": 0.2173153673304594 + }, + { + "epoch": 0.4456171210788625, + "grad_norm": 2.0165908506525287, + "learning_rate": 0.00013364595545134816, + "loss": 4.082425117492676, + "step": 760, + "token_acc": 0.2113496503858436 + }, + { + "epoch": 0.44620345939607153, + "grad_norm": 2.3107332528859645, + "learning_rate": 0.00013382180539273152, + "loss": 4.047661781311035, + "step": 761, + "token_acc": 0.21523623553626325 + }, + { + "epoch": 0.4467897977132806, + "grad_norm": 1.5887295852701293, + "learning_rate": 0.00013399765533411487, + "loss": 4.032632827758789, + "step": 762, + "token_acc": 0.2163661256842384 + }, + { + "epoch": 0.4473761360304896, + "grad_norm": 2.780848673395678, + "learning_rate": 0.00013417350527549822, + "loss": 4.060242176055908, + "step": 763, + "token_acc": 0.21551964482038496 + }, + { + "epoch": 0.44796247434769865, + "grad_norm": 1.7776689254813058, + "learning_rate": 0.0001343493552168816, + "loss": 4.008596897125244, + "step": 764, + "token_acc": 0.22028683387647935 + }, + { + "epoch": 0.44854881266490765, + "grad_norm": 2.439113486656821, + "learning_rate": 0.00013452520515826495, + "loss": 4.083590984344482, + "step": 765, + "token_acc": 0.21233907507907085 + }, + { + "epoch": 0.4491351509821167, + "grad_norm": 1.9902075433071043, + "learning_rate": 0.0001347010550996483, + "loss": 4.019782066345215, + "step": 766, + "token_acc": 0.21966962152321648 + }, + { + "epoch": 0.4497214892993257, + "grad_norm": 2.2612576620568463, + "learning_rate": 0.00013487690504103165, + "loss": 4.063292503356934, + "step": 767, + "token_acc": 0.2138527930567899 + }, + { + "epoch": 0.45030782761653476, + "grad_norm": 2.05662811076168, + "learning_rate": 0.000135052754982415, + "loss": 4.059133529663086, + "step": 768, + "token_acc": 0.21186494597202635 + }, + { + "epoch": 0.45089416593374376, + "grad_norm": 1.8830594810898171, + "learning_rate": 0.00013522860492379835, + "loss": 4.1018829345703125, + "step": 769, + "token_acc": 0.20955638328609022 + }, + { + "epoch": 0.4514805042509528, + "grad_norm": 2.331390668269172, + "learning_rate": 0.0001354044548651817, + "loss": 4.039198875427246, + "step": 770, + "token_acc": 0.21756219618677752 + }, + { + "epoch": 0.4520668425681618, + "grad_norm": 2.45959116003766, + "learning_rate": 0.00013558030480656505, + "loss": 4.031373500823975, + "step": 771, + "token_acc": 0.21664699392845296 + }, + { + "epoch": 0.4526531808853709, + "grad_norm": 2.1000476049218855, + "learning_rate": 0.0001357561547479484, + "loss": 3.9976091384887695, + "step": 772, + "token_acc": 0.2205907226521206 + }, + { + "epoch": 0.4532395192025799, + "grad_norm": 2.1774641066309153, + "learning_rate": 0.00013593200468933175, + "loss": 4.0507893562316895, + "step": 773, + "token_acc": 0.2165560684109642 + }, + { + "epoch": 0.45382585751978893, + "grad_norm": 1.8722671773171955, + "learning_rate": 0.0001361078546307151, + "loss": 4.077295780181885, + "step": 774, + "token_acc": 0.21288626930533328 + }, + { + "epoch": 0.45441219583699793, + "grad_norm": 2.3845653862423277, + "learning_rate": 0.00013628370457209846, + "loss": 4.086780548095703, + "step": 775, + "token_acc": 0.21148281264399593 + }, + { + "epoch": 0.454998534154207, + "grad_norm": 1.8095497490989678, + "learning_rate": 0.00013645955451348183, + "loss": 4.035080909729004, + "step": 776, + "token_acc": 0.21685192353776106 + }, + { + "epoch": 0.455584872471416, + "grad_norm": 2.318434675374333, + "learning_rate": 0.00013663540445486518, + "loss": 4.0308427810668945, + "step": 777, + "token_acc": 0.21576059770324024 + }, + { + "epoch": 0.45617121078862505, + "grad_norm": 2.0287197433760653, + "learning_rate": 0.0001368112543962485, + "loss": 4.029142379760742, + "step": 778, + "token_acc": 0.21721333636137516 + }, + { + "epoch": 0.45675754910583405, + "grad_norm": 2.475126604314923, + "learning_rate": 0.00013698710433763186, + "loss": 4.030572891235352, + "step": 779, + "token_acc": 0.21645246257494938 + }, + { + "epoch": 0.4573438874230431, + "grad_norm": 1.7161855550958882, + "learning_rate": 0.00013716295427901524, + "loss": 4.023125648498535, + "step": 780, + "token_acc": 0.2155937526460423 + }, + { + "epoch": 0.4579302257402521, + "grad_norm": 2.3769327424182167, + "learning_rate": 0.0001373388042203986, + "loss": 4.012986183166504, + "step": 781, + "token_acc": 0.2186782110085776 + }, + { + "epoch": 0.45851656405746116, + "grad_norm": 2.0624369022885913, + "learning_rate": 0.00013751465416178194, + "loss": 4.023934364318848, + "step": 782, + "token_acc": 0.21677102767909628 + }, + { + "epoch": 0.45910290237467016, + "grad_norm": 2.177195634816114, + "learning_rate": 0.0001376905041031653, + "loss": 4.044404029846191, + "step": 783, + "token_acc": 0.21445519159211984 + }, + { + "epoch": 0.4596892406918792, + "grad_norm": 2.2563262707201686, + "learning_rate": 0.00013786635404454864, + "loss": 4.055820941925049, + "step": 784, + "token_acc": 0.21395694057232478 + }, + { + "epoch": 0.4602755790090882, + "grad_norm": 2.4828346512876682, + "learning_rate": 0.000138042203985932, + "loss": 4.045603275299072, + "step": 785, + "token_acc": 0.21446117121626304 + }, + { + "epoch": 0.4608619173262973, + "grad_norm": 2.042905256187356, + "learning_rate": 0.00013821805392731534, + "loss": 3.9925954341888428, + "step": 786, + "token_acc": 0.2192796224314044 + }, + { + "epoch": 0.4614482556435063, + "grad_norm": 2.1699196072038323, + "learning_rate": 0.0001383939038686987, + "loss": 4.077231407165527, + "step": 787, + "token_acc": 0.210341239740475 + }, + { + "epoch": 0.46203459396071533, + "grad_norm": 1.668827224585959, + "learning_rate": 0.00013856975381008204, + "loss": 4.014451026916504, + "step": 788, + "token_acc": 0.21867900758189443 + }, + { + "epoch": 0.46262093227792433, + "grad_norm": 2.4213183982831046, + "learning_rate": 0.0001387456037514654, + "loss": 4.035038948059082, + "step": 789, + "token_acc": 0.21588062358957372 + }, + { + "epoch": 0.4632072705951334, + "grad_norm": 1.8628530124538194, + "learning_rate": 0.00013892145369284875, + "loss": 4.022771835327148, + "step": 790, + "token_acc": 0.21723966118584953 + }, + { + "epoch": 0.46379360891234245, + "grad_norm": 2.1894736172037446, + "learning_rate": 0.00013909730363423212, + "loss": 4.059389591217041, + "step": 791, + "token_acc": 0.21307001231043243 + }, + { + "epoch": 0.46437994722955145, + "grad_norm": 1.875901227982124, + "learning_rate": 0.00013927315357561547, + "loss": 3.989318370819092, + "step": 792, + "token_acc": 0.22080802675585284 + }, + { + "epoch": 0.4649662855467605, + "grad_norm": 2.6561148005205633, + "learning_rate": 0.00013944900351699883, + "loss": 4.044867992401123, + "step": 793, + "token_acc": 0.21448236026615256 + }, + { + "epoch": 0.4655526238639695, + "grad_norm": 1.7302015259189907, + "learning_rate": 0.00013962485345838218, + "loss": 4.015591621398926, + "step": 794, + "token_acc": 0.21665450972049155 + }, + { + "epoch": 0.46613896218117856, + "grad_norm": 2.3466255090535415, + "learning_rate": 0.00013980070339976553, + "loss": 4.007062911987305, + "step": 795, + "token_acc": 0.21982217586273153 + }, + { + "epoch": 0.46672530049838756, + "grad_norm": 2.0480849198278555, + "learning_rate": 0.00013997655334114888, + "loss": 4.026554107666016, + "step": 796, + "token_acc": 0.21854175107224186 + }, + { + "epoch": 0.4673116388155966, + "grad_norm": 2.100109175539408, + "learning_rate": 0.00014015240328253223, + "loss": 4.008111953735352, + "step": 797, + "token_acc": 0.2188090430190523 + }, + { + "epoch": 0.4678979771328056, + "grad_norm": 2.415989266838997, + "learning_rate": 0.00014032825322391558, + "loss": 3.945232629776001, + "step": 798, + "token_acc": 0.22416460155930695 + }, + { + "epoch": 0.4684843154500147, + "grad_norm": 1.7068560656200347, + "learning_rate": 0.00014050410316529893, + "loss": 4.011207580566406, + "step": 799, + "token_acc": 0.21589051259290665 + }, + { + "epoch": 0.4690706537672237, + "grad_norm": 2.1426999987947446, + "learning_rate": 0.00014067995310668228, + "loss": 4.03364372253418, + "step": 800, + "token_acc": 0.21555533808562363 + }, + { + "epoch": 0.46965699208443273, + "grad_norm": 1.8094369551718885, + "learning_rate": 0.00014085580304806563, + "loss": 4.028426170349121, + "step": 801, + "token_acc": 0.21638462670394498 + }, + { + "epoch": 0.47024333040164173, + "grad_norm": 1.845072807159866, + "learning_rate": 0.00014103165298944898, + "loss": 4.014816761016846, + "step": 802, + "token_acc": 0.21553700107652607 + }, + { + "epoch": 0.4708296687188508, + "grad_norm": 2.2812061529373335, + "learning_rate": 0.00014120750293083236, + "loss": 4.042592525482178, + "step": 803, + "token_acc": 0.21553874828794584 + }, + { + "epoch": 0.4714160070360598, + "grad_norm": 1.702609737313762, + "learning_rate": 0.0001413833528722157, + "loss": 4.070908546447754, + "step": 804, + "token_acc": 0.21147798536627446 + }, + { + "epoch": 0.47200234535326885, + "grad_norm": 2.5034058601638933, + "learning_rate": 0.00014155920281359904, + "loss": 4.0488786697387695, + "step": 805, + "token_acc": 0.21286942351624663 + }, + { + "epoch": 0.47258868367047785, + "grad_norm": 1.7491534141236686, + "learning_rate": 0.0001417350527549824, + "loss": 3.9564085006713867, + "step": 806, + "token_acc": 0.22351506149152733 + }, + { + "epoch": 0.4731750219876869, + "grad_norm": 2.6368328801590466, + "learning_rate": 0.00014191090269636576, + "loss": 4.078041076660156, + "step": 807, + "token_acc": 0.2092960879751978 + }, + { + "epoch": 0.4737613603048959, + "grad_norm": 1.7049492759816158, + "learning_rate": 0.00014208675263774912, + "loss": 4.031122207641602, + "step": 808, + "token_acc": 0.21409589593522838 + }, + { + "epoch": 0.47434769862210496, + "grad_norm": 2.175849449615241, + "learning_rate": 0.00014226260257913247, + "loss": 4.014800071716309, + "step": 809, + "token_acc": 0.21730702544387703 + }, + { + "epoch": 0.47493403693931396, + "grad_norm": 1.6008167829662476, + "learning_rate": 0.00014243845252051582, + "loss": 3.988290786743164, + "step": 810, + "token_acc": 0.2181024523077002 + }, + { + "epoch": 0.475520375256523, + "grad_norm": 1.9951804005247435, + "learning_rate": 0.00014261430246189917, + "loss": 4.050525188446045, + "step": 811, + "token_acc": 0.21434982865196386 + }, + { + "epoch": 0.476106713573732, + "grad_norm": 1.9175919238515464, + "learning_rate": 0.00014279015240328252, + "loss": 4.008174419403076, + "step": 812, + "token_acc": 0.2191672900319906 + }, + { + "epoch": 0.4766930518909411, + "grad_norm": 2.287598029487112, + "learning_rate": 0.00014296600234466587, + "loss": 3.989607334136963, + "step": 813, + "token_acc": 0.21782770163820522 + }, + { + "epoch": 0.4772793902081501, + "grad_norm": 2.0477310229223096, + "learning_rate": 0.00014314185228604922, + "loss": 4.016474723815918, + "step": 814, + "token_acc": 0.21551566633214794 + }, + { + "epoch": 0.47786572852535913, + "grad_norm": 1.7085568657747554, + "learning_rate": 0.00014331770222743257, + "loss": 4.027948379516602, + "step": 815, + "token_acc": 0.21404618439295173 + }, + { + "epoch": 0.47845206684256814, + "grad_norm": 2.110394743573738, + "learning_rate": 0.00014349355216881592, + "loss": 4.040744781494141, + "step": 816, + "token_acc": 0.21310861903236142 + }, + { + "epoch": 0.4790384051597772, + "grad_norm": 2.2496327045668245, + "learning_rate": 0.00014366940211019927, + "loss": 4.01626443862915, + "step": 817, + "token_acc": 0.21752420540376136 + }, + { + "epoch": 0.47962474347698625, + "grad_norm": 1.8336540992901764, + "learning_rate": 0.00014384525205158262, + "loss": 3.9970903396606445, + "step": 818, + "token_acc": 0.21812795487287706 + }, + { + "epoch": 0.48021108179419525, + "grad_norm": 1.538716532624314, + "learning_rate": 0.000144021101992966, + "loss": 3.980809450149536, + "step": 819, + "token_acc": 0.2216139102236775 + }, + { + "epoch": 0.4807974201114043, + "grad_norm": 1.8764621654293472, + "learning_rate": 0.00014419695193434935, + "loss": 3.9870188236236572, + "step": 820, + "token_acc": 0.2181704702999422 + }, + { + "epoch": 0.4813837584286133, + "grad_norm": 2.1105175838345693, + "learning_rate": 0.0001443728018757327, + "loss": 4.027533531188965, + "step": 821, + "token_acc": 0.21529521725956624 + }, + { + "epoch": 0.48197009674582236, + "grad_norm": 1.9414430591555694, + "learning_rate": 0.00014454865181711603, + "loss": 4.012726783752441, + "step": 822, + "token_acc": 0.21794045963291417 + }, + { + "epoch": 0.48255643506303136, + "grad_norm": 1.7616903861393867, + "learning_rate": 0.0001447245017584994, + "loss": 3.963686943054199, + "step": 823, + "token_acc": 0.22228820780293837 + }, + { + "epoch": 0.4831427733802404, + "grad_norm": 1.9193893003192424, + "learning_rate": 0.00014490035169988276, + "loss": 3.987231731414795, + "step": 824, + "token_acc": 0.21857036266380822 + }, + { + "epoch": 0.4837291116974494, + "grad_norm": 2.8079381968028243, + "learning_rate": 0.0001450762016412661, + "loss": 4.059203147888184, + "step": 825, + "token_acc": 0.21176744366443004 + }, + { + "epoch": 0.4843154500146585, + "grad_norm": 1.321918916420202, + "learning_rate": 0.00014525205158264946, + "loss": 3.9564807415008545, + "step": 826, + "token_acc": 0.22537002932026354 + }, + { + "epoch": 0.4849017883318675, + "grad_norm": 2.1831183683445126, + "learning_rate": 0.0001454279015240328, + "loss": 3.943370819091797, + "step": 827, + "token_acc": 0.22395926813560396 + }, + { + "epoch": 0.48548812664907653, + "grad_norm": 1.8073220827352843, + "learning_rate": 0.00014560375146541616, + "loss": 4.050217628479004, + "step": 828, + "token_acc": 0.21223494574670754 + }, + { + "epoch": 0.48607446496628554, + "grad_norm": 2.0773636021346413, + "learning_rate": 0.0001457796014067995, + "loss": 4.001766204833984, + "step": 829, + "token_acc": 0.21755831781842078 + }, + { + "epoch": 0.4866608032834946, + "grad_norm": 1.8394367672718561, + "learning_rate": 0.0001459554513481829, + "loss": 3.990412712097168, + "step": 830, + "token_acc": 0.2188863208288564 + }, + { + "epoch": 0.4872471416007036, + "grad_norm": 1.9226135370128277, + "learning_rate": 0.0001461313012895662, + "loss": 3.954904556274414, + "step": 831, + "token_acc": 0.2220279589095575 + }, + { + "epoch": 0.48783347991791265, + "grad_norm": 2.1943672410085626, + "learning_rate": 0.00014630715123094956, + "loss": 4.047689914703369, + "step": 832, + "token_acc": 0.21354716951744082 + }, + { + "epoch": 0.48841981823512165, + "grad_norm": 1.6164406326378484, + "learning_rate": 0.00014648300117233291, + "loss": 3.950265645980835, + "step": 833, + "token_acc": 0.22424753026151406 + }, + { + "epoch": 0.4890061565523307, + "grad_norm": 2.4966985525759995, + "learning_rate": 0.0001466588511137163, + "loss": 3.9795613288879395, + "step": 834, + "token_acc": 0.21804594930371415 + }, + { + "epoch": 0.4895924948695397, + "grad_norm": 1.60918508625833, + "learning_rate": 0.00014683470105509964, + "loss": 3.9786455631256104, + "step": 835, + "token_acc": 0.2197452313378655 + }, + { + "epoch": 0.49017883318674876, + "grad_norm": 1.6759984207154885, + "learning_rate": 0.000147010550996483, + "loss": 3.938727378845215, + "step": 836, + "token_acc": 0.2215581564308025 + }, + { + "epoch": 0.49076517150395776, + "grad_norm": 2.1113533339907185, + "learning_rate": 0.00014718640093786635, + "loss": 4.035755157470703, + "step": 837, + "token_acc": 0.21481765655822055 + }, + { + "epoch": 0.4913515098211668, + "grad_norm": 2.0609562966744095, + "learning_rate": 0.0001473622508792497, + "loss": 3.9650158882141113, + "step": 838, + "token_acc": 0.222173579109063 + }, + { + "epoch": 0.4919378481383758, + "grad_norm": 1.791600221391203, + "learning_rate": 0.00014753810082063305, + "loss": 3.995253324508667, + "step": 839, + "token_acc": 0.21673336504024238 + }, + { + "epoch": 0.4925241864555849, + "grad_norm": 2.4579608974993046, + "learning_rate": 0.0001477139507620164, + "loss": 3.959195613861084, + "step": 840, + "token_acc": 0.22332642714560785 + }, + { + "epoch": 0.4931105247727939, + "grad_norm": 1.6621475859581218, + "learning_rate": 0.00014788980070339975, + "loss": 3.998112678527832, + "step": 841, + "token_acc": 0.21640121066498919 + }, + { + "epoch": 0.49369686309000294, + "grad_norm": 1.469163654760437, + "learning_rate": 0.0001480656506447831, + "loss": 3.97268009185791, + "step": 842, + "token_acc": 0.22018389029796032 + }, + { + "epoch": 0.49428320140721194, + "grad_norm": 2.30500065737085, + "learning_rate": 0.00014824150058616645, + "loss": 4.016376495361328, + "step": 843, + "token_acc": 0.2150182527541018 + }, + { + "epoch": 0.494869539724421, + "grad_norm": 1.7887059826943665, + "learning_rate": 0.0001484173505275498, + "loss": 3.9573819637298584, + "step": 844, + "token_acc": 0.22371168462373675 + }, + { + "epoch": 0.49545587804163, + "grad_norm": 2.3988451252707566, + "learning_rate": 0.00014859320046893315, + "loss": 3.932079315185547, + "step": 845, + "token_acc": 0.2243850140891421 + }, + { + "epoch": 0.49604221635883905, + "grad_norm": 1.5338282934520788, + "learning_rate": 0.00014876905041031653, + "loss": 3.955700397491455, + "step": 846, + "token_acc": 0.21945676654977947 + }, + { + "epoch": 0.4966285546760481, + "grad_norm": 1.9250776311428506, + "learning_rate": 0.00014894490035169988, + "loss": 4.006363868713379, + "step": 847, + "token_acc": 0.21841865037567587 + }, + { + "epoch": 0.4972148929932571, + "grad_norm": 1.4048102487483758, + "learning_rate": 0.00014912075029308323, + "loss": 3.9912896156311035, + "step": 848, + "token_acc": 0.21928597364317906 + }, + { + "epoch": 0.49780123131046616, + "grad_norm": 2.1756339607300013, + "learning_rate": 0.00014929660023446656, + "loss": 4.036858558654785, + "step": 849, + "token_acc": 0.21234694548563163 + }, + { + "epoch": 0.49838756962767516, + "grad_norm": 1.5716772729800585, + "learning_rate": 0.00014947245017584993, + "loss": 4.011584281921387, + "step": 850, + "token_acc": 0.2162639758639049 + }, + { + "epoch": 0.4989739079448842, + "grad_norm": 1.949847969488178, + "learning_rate": 0.00014964830011723329, + "loss": 3.9427664279937744, + "step": 851, + "token_acc": 0.22413635735390378 + }, + { + "epoch": 0.4995602462620932, + "grad_norm": 1.9896352603864982, + "learning_rate": 0.00014982415005861664, + "loss": 4.000300407409668, + "step": 852, + "token_acc": 0.21806170210052417 + }, + { + "epoch": 0.5001465845793023, + "grad_norm": 1.7644891475355713, + "learning_rate": 0.00015, + "loss": 3.979973793029785, + "step": 853, + "token_acc": 0.22028406398483627 + }, + { + "epoch": 0.5007329228965113, + "grad_norm": 2.2017802075397044, + "learning_rate": 0.00015017584994138334, + "loss": 3.9872255325317383, + "step": 854, + "token_acc": 0.21750855616181755 + }, + { + "epoch": 0.5013192612137203, + "grad_norm": 1.9348175449343203, + "learning_rate": 0.0001503516998827667, + "loss": 4.005254745483398, + "step": 855, + "token_acc": 0.21751924966692182 + }, + { + "epoch": 0.5019055995309294, + "grad_norm": 1.8726980600246896, + "learning_rate": 0.00015052754982415004, + "loss": 3.987555980682373, + "step": 856, + "token_acc": 0.21767533062651193 + }, + { + "epoch": 0.5024919378481384, + "grad_norm": 1.990956672328526, + "learning_rate": 0.00015070339976553342, + "loss": 3.960458517074585, + "step": 857, + "token_acc": 0.22118701835268162 + }, + { + "epoch": 0.5030782761653474, + "grad_norm": 1.8878931629909304, + "learning_rate": 0.00015087924970691677, + "loss": 3.9807324409484863, + "step": 858, + "token_acc": 0.21681157292974515 + }, + { + "epoch": 0.5036646144825564, + "grad_norm": 2.1481393464654372, + "learning_rate": 0.00015105509964830012, + "loss": 4.004087448120117, + "step": 859, + "token_acc": 0.21796587704822173 + }, + { + "epoch": 0.5042509527997655, + "grad_norm": 1.704303098304262, + "learning_rate": 0.00015123094958968347, + "loss": 3.969203472137451, + "step": 860, + "token_acc": 0.21856859915087284 + }, + { + "epoch": 0.5048372911169745, + "grad_norm": 1.7188916071284426, + "learning_rate": 0.00015140679953106682, + "loss": 3.929818630218506, + "step": 861, + "token_acc": 0.22368165993378253 + }, + { + "epoch": 0.5054236294341835, + "grad_norm": 1.8650604888006674, + "learning_rate": 0.00015158264947245014, + "loss": 3.9721601009368896, + "step": 862, + "token_acc": 0.219752282996307 + }, + { + "epoch": 0.5060099677513925, + "grad_norm": 2.0548758658245267, + "learning_rate": 0.0001517584994138335, + "loss": 3.990267276763916, + "step": 863, + "token_acc": 0.21548263359781408 + }, + { + "epoch": 0.5065963060686016, + "grad_norm": 1.5728431121297184, + "learning_rate": 0.00015193434935521685, + "loss": 4.012087345123291, + "step": 864, + "token_acc": 0.21419306105438632 + }, + { + "epoch": 0.5071826443858106, + "grad_norm": 1.8105860775151201, + "learning_rate": 0.0001521101992966002, + "loss": 3.946206569671631, + "step": 865, + "token_acc": 0.2221922390484231 + }, + { + "epoch": 0.5077689827030196, + "grad_norm": 1.5569200935759993, + "learning_rate": 0.00015228604923798358, + "loss": 3.9780349731445312, + "step": 866, + "token_acc": 0.219986240560004 + }, + { + "epoch": 0.5083553210202286, + "grad_norm": 1.8787609377561236, + "learning_rate": 0.00015246189917936693, + "loss": 3.9243876934051514, + "step": 867, + "token_acc": 0.22371271038121932 + }, + { + "epoch": 0.5089416593374377, + "grad_norm": 1.6512207327099913, + "learning_rate": 0.00015263774912075028, + "loss": 3.876498222351074, + "step": 868, + "token_acc": 0.22979409141109505 + }, + { + "epoch": 0.5095279976546467, + "grad_norm": 1.756021093611537, + "learning_rate": 0.00015281359906213363, + "loss": 3.99798846244812, + "step": 869, + "token_acc": 0.21573954844783863 + }, + { + "epoch": 0.5101143359718557, + "grad_norm": 1.718331963623993, + "learning_rate": 0.00015298944900351698, + "loss": 3.944492816925049, + "step": 870, + "token_acc": 0.2218630920848912 + }, + { + "epoch": 0.5107006742890647, + "grad_norm": 1.7128046050177161, + "learning_rate": 0.00015316529894490033, + "loss": 3.983834743499756, + "step": 871, + "token_acc": 0.21704382516494622 + }, + { + "epoch": 0.5112870126062738, + "grad_norm": 1.8836001278405181, + "learning_rate": 0.00015334114888628368, + "loss": 3.883991003036499, + "step": 872, + "token_acc": 0.23023657659451197 + }, + { + "epoch": 0.5118733509234829, + "grad_norm": 1.8942648121354546, + "learning_rate": 0.00015351699882766706, + "loss": 3.985807180404663, + "step": 873, + "token_acc": 0.21714366547539296 + }, + { + "epoch": 0.5124596892406919, + "grad_norm": 2.0609604959415804, + "learning_rate": 0.0001536928487690504, + "loss": 3.9133596420288086, + "step": 874, + "token_acc": 0.2254088136198552 + }, + { + "epoch": 0.513046027557901, + "grad_norm": 1.7925070501652656, + "learning_rate": 0.00015386869871043376, + "loss": 3.8894381523132324, + "step": 875, + "token_acc": 0.22630443301120085 + }, + { + "epoch": 0.51363236587511, + "grad_norm": 1.979811186356441, + "learning_rate": 0.0001540445486518171, + "loss": 3.910457134246826, + "step": 876, + "token_acc": 0.22494236041874377 + }, + { + "epoch": 0.514218704192319, + "grad_norm": 2.2961128449422707, + "learning_rate": 0.00015422039859320046, + "loss": 3.9257307052612305, + "step": 877, + "token_acc": 0.2224986092346817 + }, + { + "epoch": 0.514805042509528, + "grad_norm": 2.08409492393019, + "learning_rate": 0.0001543962485345838, + "loss": 3.9230222702026367, + "step": 878, + "token_acc": 0.2227323400969341 + }, + { + "epoch": 0.5153913808267371, + "grad_norm": 1.8807552297427133, + "learning_rate": 0.00015457209847596716, + "loss": 3.9706668853759766, + "step": 879, + "token_acc": 0.2191473859669354 + }, + { + "epoch": 0.5159777191439461, + "grad_norm": 2.150173111629756, + "learning_rate": 0.0001547479484173505, + "loss": 3.8807334899902344, + "step": 880, + "token_acc": 0.22980057458007425 + }, + { + "epoch": 0.5165640574611551, + "grad_norm": 1.7047678488505607, + "learning_rate": 0.00015492379835873387, + "loss": 3.9462296962738037, + "step": 881, + "token_acc": 0.22053098532321688 + }, + { + "epoch": 0.5171503957783641, + "grad_norm": 1.9234628884001572, + "learning_rate": 0.00015509964830011722, + "loss": 3.891421318054199, + "step": 882, + "token_acc": 0.22498364604857382 + }, + { + "epoch": 0.5177367340955732, + "grad_norm": 1.7389757320470693, + "learning_rate": 0.00015527549824150057, + "loss": 3.930666446685791, + "step": 883, + "token_acc": 0.22066262914143214 + }, + { + "epoch": 0.5183230724127822, + "grad_norm": 2.142997139557562, + "learning_rate": 0.00015545134818288392, + "loss": 3.8838438987731934, + "step": 884, + "token_acc": 0.22805118803282837 + }, + { + "epoch": 0.5189094107299912, + "grad_norm": 1.68542605805146, + "learning_rate": 0.00015562719812426727, + "loss": 3.9110770225524902, + "step": 885, + "token_acc": 0.22470971281123966 + }, + { + "epoch": 0.5194957490472002, + "grad_norm": 2.198383007322505, + "learning_rate": 0.00015580304806565062, + "loss": 3.9386661052703857, + "step": 886, + "token_acc": 0.22072635121799747 + }, + { + "epoch": 0.5200820873644093, + "grad_norm": 1.460136455490652, + "learning_rate": 0.00015597889800703397, + "loss": 3.934483528137207, + "step": 887, + "token_acc": 0.2227731440213931 + }, + { + "epoch": 0.5206684256816183, + "grad_norm": 2.0341208810793736, + "learning_rate": 0.00015615474794841735, + "loss": 3.849897861480713, + "step": 888, + "token_acc": 0.23172450308457923 + }, + { + "epoch": 0.5212547639988273, + "grad_norm": 1.7702890794575363, + "learning_rate": 0.0001563305978898007, + "loss": 3.8413984775543213, + "step": 889, + "token_acc": 0.23266234041201753 + }, + { + "epoch": 0.5218411023160363, + "grad_norm": 1.3172823022436198, + "learning_rate": 0.00015650644783118405, + "loss": 4.023441791534424, + "step": 890, + "token_acc": 0.2140231777076869 + }, + { + "epoch": 0.5224274406332454, + "grad_norm": 1.7987630372430532, + "learning_rate": 0.0001566822977725674, + "loss": 3.9418463706970215, + "step": 891, + "token_acc": 0.21902007098738144 + }, + { + "epoch": 0.5230137789504544, + "grad_norm": 1.5457812760931215, + "learning_rate": 0.00015685814771395075, + "loss": 3.8915979862213135, + "step": 892, + "token_acc": 0.22530290641492604 + }, + { + "epoch": 0.5236001172676634, + "grad_norm": 1.8524408684101006, + "learning_rate": 0.0001570339976553341, + "loss": 3.956329822540283, + "step": 893, + "token_acc": 0.21877838663433324 + }, + { + "epoch": 0.5241864555848724, + "grad_norm": 1.6202311706340022, + "learning_rate": 0.00015720984759671745, + "loss": 3.970872163772583, + "step": 894, + "token_acc": 0.2151157834919733 + }, + { + "epoch": 0.5247727939020815, + "grad_norm": 1.7296810237840348, + "learning_rate": 0.0001573856975381008, + "loss": 3.884525775909424, + "step": 895, + "token_acc": 0.22611281955704657 + }, + { + "epoch": 0.5253591322192905, + "grad_norm": 1.9700508466213726, + "learning_rate": 0.00015756154747948418, + "loss": 3.886685371398926, + "step": 896, + "token_acc": 0.22721692152073364 + }, + { + "epoch": 0.5259454705364995, + "grad_norm": 1.6643932546054272, + "learning_rate": 0.00015773739742086753, + "loss": 3.9216156005859375, + "step": 897, + "token_acc": 0.22382920825500358 + }, + { + "epoch": 0.5265318088537085, + "grad_norm": 2.0422673315712836, + "learning_rate": 0.00015791324736225086, + "loss": 3.9734396934509277, + "step": 898, + "token_acc": 0.21721680088589893 + }, + { + "epoch": 0.5271181471709177, + "grad_norm": 1.8285500570488256, + "learning_rate": 0.0001580890973036342, + "loss": 3.9177701473236084, + "step": 899, + "token_acc": 0.22320209577067432 + }, + { + "epoch": 0.5277044854881267, + "grad_norm": 1.892132984584803, + "learning_rate": 0.00015826494724501756, + "loss": 3.8765602111816406, + "step": 900, + "token_acc": 0.2274972571764112 + }, + { + "epoch": 0.5282908238053357, + "grad_norm": 2.0480632835267034, + "learning_rate": 0.0001584407971864009, + "loss": 3.8768796920776367, + "step": 901, + "token_acc": 0.22793550519704706 + }, + { + "epoch": 0.5288771621225447, + "grad_norm": 1.8469982288869577, + "learning_rate": 0.00015861664712778426, + "loss": 3.9207754135131836, + "step": 902, + "token_acc": 0.2237546468401487 + }, + { + "epoch": 0.5294635004397538, + "grad_norm": 1.8078244233765264, + "learning_rate": 0.0001587924970691676, + "loss": 3.962942361831665, + "step": 903, + "token_acc": 0.21719667952510122 + }, + { + "epoch": 0.5300498387569628, + "grad_norm": 2.0112665418099884, + "learning_rate": 0.000158968347010551, + "loss": 3.9658801555633545, + "step": 904, + "token_acc": 0.21717460219436382 + }, + { + "epoch": 0.5306361770741718, + "grad_norm": 1.5803050145359165, + "learning_rate": 0.00015914419695193434, + "loss": 3.930394172668457, + "step": 905, + "token_acc": 0.2199539958041269 + }, + { + "epoch": 0.5312225153913809, + "grad_norm": 2.265693151130477, + "learning_rate": 0.0001593200468933177, + "loss": 3.926424264907837, + "step": 906, + "token_acc": 0.22296035584276616 + }, + { + "epoch": 0.5318088537085899, + "grad_norm": 1.6345979565422217, + "learning_rate": 0.00015949589683470104, + "loss": 3.871497392654419, + "step": 907, + "token_acc": 0.22907294205388415 + }, + { + "epoch": 0.5323951920257989, + "grad_norm": 1.8870322544597933, + "learning_rate": 0.0001596717467760844, + "loss": 3.909820079803467, + "step": 908, + "token_acc": 0.22357154341252924 + }, + { + "epoch": 0.5329815303430079, + "grad_norm": 1.7884835913229296, + "learning_rate": 0.00015984759671746774, + "loss": 3.866403818130493, + "step": 909, + "token_acc": 0.2279854950392159 + }, + { + "epoch": 0.533567868660217, + "grad_norm": 1.6228249739147715, + "learning_rate": 0.0001600234466588511, + "loss": 3.926779270172119, + "step": 910, + "token_acc": 0.22188930077975202 + }, + { + "epoch": 0.534154206977426, + "grad_norm": 2.0371392394497576, + "learning_rate": 0.00016019929660023447, + "loss": 3.983499526977539, + "step": 911, + "token_acc": 0.21486293120309652 + }, + { + "epoch": 0.534740545294635, + "grad_norm": 1.9346715671081824, + "learning_rate": 0.00016037514654161782, + "loss": 3.9338266849517822, + "step": 912, + "token_acc": 0.2199525671567353 + }, + { + "epoch": 0.535326883611844, + "grad_norm": 1.6310952632069726, + "learning_rate": 0.00016055099648300118, + "loss": 3.907869577407837, + "step": 913, + "token_acc": 0.22508152728216785 + }, + { + "epoch": 0.5359132219290531, + "grad_norm": 1.6824459491486203, + "learning_rate": 0.00016072684642438453, + "loss": 3.873809337615967, + "step": 914, + "token_acc": 0.2271099704744425 + }, + { + "epoch": 0.5364995602462621, + "grad_norm": 1.7378065073842361, + "learning_rate": 0.00016090269636576788, + "loss": 3.869752883911133, + "step": 915, + "token_acc": 0.22676870176028938 + }, + { + "epoch": 0.5370858985634711, + "grad_norm": 2.2672502519030124, + "learning_rate": 0.0001610785463071512, + "loss": 3.85251522064209, + "step": 916, + "token_acc": 0.22783467121248743 + }, + { + "epoch": 0.5376722368806801, + "grad_norm": 1.4910400619078958, + "learning_rate": 0.00016125439624853455, + "loss": 3.8842687606811523, + "step": 917, + "token_acc": 0.22513684256666794 + }, + { + "epoch": 0.5382585751978892, + "grad_norm": 2.1919057211015858, + "learning_rate": 0.0001614302461899179, + "loss": 3.872668981552124, + "step": 918, + "token_acc": 0.22686832740213522 + }, + { + "epoch": 0.5388449135150982, + "grad_norm": 1.6830724282109406, + "learning_rate": 0.00016160609613130125, + "loss": 3.909123420715332, + "step": 919, + "token_acc": 0.22325362460046513 + }, + { + "epoch": 0.5394312518323072, + "grad_norm": 2.350468600195527, + "learning_rate": 0.00016178194607268463, + "loss": 3.887495279312134, + "step": 920, + "token_acc": 0.22444137889215657 + }, + { + "epoch": 0.5400175901495162, + "grad_norm": 1.4240916291357377, + "learning_rate": 0.00016195779601406798, + "loss": 3.901359796524048, + "step": 921, + "token_acc": 0.22152898734381993 + }, + { + "epoch": 0.5406039284667253, + "grad_norm": 2.5578139137799214, + "learning_rate": 0.00016213364595545133, + "loss": 3.8996005058288574, + "step": 922, + "token_acc": 0.2224880568208536 + }, + { + "epoch": 0.5411902667839343, + "grad_norm": 1.654339863655892, + "learning_rate": 0.00016230949589683468, + "loss": 3.8932442665100098, + "step": 923, + "token_acc": 0.22398950144584012 + }, + { + "epoch": 0.5417766051011433, + "grad_norm": 2.305658898130935, + "learning_rate": 0.00016248534583821804, + "loss": 3.9399142265319824, + "step": 924, + "token_acc": 0.21933216359067673 + }, + { + "epoch": 0.5423629434183523, + "grad_norm": 1.5671466964981424, + "learning_rate": 0.00016266119577960139, + "loss": 3.9029433727264404, + "step": 925, + "token_acc": 0.2244351100811124 + }, + { + "epoch": 0.5429492817355615, + "grad_norm": 1.4155163165349693, + "learning_rate": 0.00016283704572098474, + "loss": 3.8714826107025146, + "step": 926, + "token_acc": 0.22622458346082167 + }, + { + "epoch": 0.5435356200527705, + "grad_norm": 1.8090848368862913, + "learning_rate": 0.00016301289566236812, + "loss": 3.9201483726501465, + "step": 927, + "token_acc": 0.22064769828416583 + }, + { + "epoch": 0.5441219583699795, + "grad_norm": 1.7077767009824116, + "learning_rate": 0.00016318874560375147, + "loss": 3.915027141571045, + "step": 928, + "token_acc": 0.21957110632731466 + }, + { + "epoch": 0.5447082966871885, + "grad_norm": 2.3053761811942692, + "learning_rate": 0.00016336459554513482, + "loss": 3.8680150508880615, + "step": 929, + "token_acc": 0.22908105340198773 + }, + { + "epoch": 0.5452946350043976, + "grad_norm": 1.7038704881339632, + "learning_rate": 0.00016354044548651817, + "loss": 3.8649606704711914, + "step": 930, + "token_acc": 0.22599985395977634 + }, + { + "epoch": 0.5458809733216066, + "grad_norm": 1.9590465692716896, + "learning_rate": 0.00016371629542790152, + "loss": 3.916379928588867, + "step": 931, + "token_acc": 0.22121331822576118 + }, + { + "epoch": 0.5464673116388156, + "grad_norm": 2.107097180155753, + "learning_rate": 0.00016389214536928487, + "loss": 3.860433578491211, + "step": 932, + "token_acc": 0.22758102385476603 + }, + { + "epoch": 0.5470536499560247, + "grad_norm": 2.020049755319126, + "learning_rate": 0.0001640679953106682, + "loss": 3.876265048980713, + "step": 933, + "token_acc": 0.2249887457394585 + }, + { + "epoch": 0.5476399882732337, + "grad_norm": 1.8651374267246572, + "learning_rate": 0.00016424384525205154, + "loss": 3.887596368789673, + "step": 934, + "token_acc": 0.22486311756928515 + }, + { + "epoch": 0.5482263265904427, + "grad_norm": 1.6270706204818088, + "learning_rate": 0.0001644196951934349, + "loss": 3.7863473892211914, + "step": 935, + "token_acc": 0.23526055088422362 + }, + { + "epoch": 0.5488126649076517, + "grad_norm": 1.8790362280306379, + "learning_rate": 0.00016459554513481827, + "loss": 3.892228603363037, + "step": 936, + "token_acc": 0.22367166868935198 + }, + { + "epoch": 0.5493990032248608, + "grad_norm": 1.6262894298631754, + "learning_rate": 0.00016477139507620162, + "loss": 3.8819103240966797, + "step": 937, + "token_acc": 0.22461407509378842 + }, + { + "epoch": 0.5499853415420698, + "grad_norm": 1.6835260779443002, + "learning_rate": 0.00016494724501758497, + "loss": 3.8713810443878174, + "step": 938, + "token_acc": 0.22596033205135427 + }, + { + "epoch": 0.5505716798592788, + "grad_norm": 1.6139327305337157, + "learning_rate": 0.00016512309495896833, + "loss": 3.8582606315612793, + "step": 939, + "token_acc": 0.22724204171955678 + }, + { + "epoch": 0.5511580181764878, + "grad_norm": 1.6328977273897982, + "learning_rate": 0.00016529894490035168, + "loss": 3.918926239013672, + "step": 940, + "token_acc": 0.21781373723104538 + }, + { + "epoch": 0.5517443564936969, + "grad_norm": 2.540165440159551, + "learning_rate": 0.00016547479484173503, + "loss": 3.8559579849243164, + "step": 941, + "token_acc": 0.2290715995555544 + }, + { + "epoch": 0.5523306948109059, + "grad_norm": 1.6070015667916382, + "learning_rate": 0.00016565064478311838, + "loss": 3.8953933715820312, + "step": 942, + "token_acc": 0.2232832507228418 + }, + { + "epoch": 0.5529170331281149, + "grad_norm": 2.6087940129834606, + "learning_rate": 0.00016582649472450176, + "loss": 3.8771557807922363, + "step": 943, + "token_acc": 0.22429230231061525 + }, + { + "epoch": 0.5535033714453239, + "grad_norm": 1.6772942708898626, + "learning_rate": 0.0001660023446658851, + "loss": 3.8471951484680176, + "step": 944, + "token_acc": 0.22528464797687553 + }, + { + "epoch": 0.554089709762533, + "grad_norm": 2.1197199190074536, + "learning_rate": 0.00016617819460726846, + "loss": 3.889328956604004, + "step": 945, + "token_acc": 0.22346470983584693 + }, + { + "epoch": 0.554676048079742, + "grad_norm": 1.446835203621188, + "learning_rate": 0.0001663540445486518, + "loss": 3.8504536151885986, + "step": 946, + "token_acc": 0.22807207216709605 + }, + { + "epoch": 0.555262386396951, + "grad_norm": 2.0554873327067367, + "learning_rate": 0.00016652989449003516, + "loss": 3.887838840484619, + "step": 947, + "token_acc": 0.22247060174613476 + }, + { + "epoch": 0.55584872471416, + "grad_norm": 1.4985531094150735, + "learning_rate": 0.0001667057444314185, + "loss": 3.8691744804382324, + "step": 948, + "token_acc": 0.22617778441134767 + }, + { + "epoch": 0.5564350630313691, + "grad_norm": 1.7896530441901979, + "learning_rate": 0.00016688159437280186, + "loss": 3.8619778156280518, + "step": 949, + "token_acc": 0.22637287973650477 + }, + { + "epoch": 0.5570214013485781, + "grad_norm": 1.6209176814096278, + "learning_rate": 0.00016705744431418524, + "loss": 3.883054256439209, + "step": 950, + "token_acc": 0.2237000823657571 + }, + { + "epoch": 0.5576077396657871, + "grad_norm": 2.033597766300496, + "learning_rate": 0.00016723329425556856, + "loss": 3.8718981742858887, + "step": 951, + "token_acc": 0.2251093113518304 + }, + { + "epoch": 0.5581940779829961, + "grad_norm": 2.042305998126389, + "learning_rate": 0.00016740914419695191, + "loss": 3.8430066108703613, + "step": 952, + "token_acc": 0.22733380394166275 + }, + { + "epoch": 0.5587804163002053, + "grad_norm": 1.4804159926717073, + "learning_rate": 0.00016758499413833527, + "loss": 3.889702796936035, + "step": 953, + "token_acc": 0.22307067936988267 + }, + { + "epoch": 0.5593667546174143, + "grad_norm": 1.9605146504877211, + "learning_rate": 0.00016776084407971862, + "loss": 3.805490732192993, + "step": 954, + "token_acc": 0.2299566384257852 + }, + { + "epoch": 0.5599530929346233, + "grad_norm": 1.329651864074579, + "learning_rate": 0.00016793669402110197, + "loss": 3.8096566200256348, + "step": 955, + "token_acc": 0.2306157819879429 + }, + { + "epoch": 0.5605394312518323, + "grad_norm": 2.1099994418681005, + "learning_rate": 0.00016811254396248532, + "loss": 3.752127170562744, + "step": 956, + "token_acc": 0.2341067109216535 + }, + { + "epoch": 0.5611257695690414, + "grad_norm": 1.6716877531090917, + "learning_rate": 0.00016828839390386867, + "loss": 3.792173147201538, + "step": 957, + "token_acc": 0.23246892109500805 + }, + { + "epoch": 0.5617121078862504, + "grad_norm": 1.776225589174403, + "learning_rate": 0.00016846424384525205, + "loss": 3.7778494358062744, + "step": 958, + "token_acc": 0.23393346090008563 + }, + { + "epoch": 0.5622984462034594, + "grad_norm": 2.0573797546130765, + "learning_rate": 0.0001686400937866354, + "loss": 3.80838680267334, + "step": 959, + "token_acc": 0.23018234395930243 + }, + { + "epoch": 0.5628847845206685, + "grad_norm": 2.627189945187957, + "learning_rate": 0.00016881594372801875, + "loss": 3.849771499633789, + "step": 960, + "token_acc": 0.22590771705431384 + }, + { + "epoch": 0.5634711228378775, + "grad_norm": 1.4022259144079376, + "learning_rate": 0.0001689917936694021, + "loss": 3.798753261566162, + "step": 961, + "token_acc": 0.23199926703602303 + }, + { + "epoch": 0.5640574611550865, + "grad_norm": 2.641507792972365, + "learning_rate": 0.00016916764361078545, + "loss": 3.8406167030334473, + "step": 962, + "token_acc": 0.22649577042069668 + }, + { + "epoch": 0.5646437994722955, + "grad_norm": 1.6975595373942463, + "learning_rate": 0.0001693434935521688, + "loss": 3.775881290435791, + "step": 963, + "token_acc": 0.2357917643034074 + }, + { + "epoch": 0.5652301377895046, + "grad_norm": 1.910419676601113, + "learning_rate": 0.00016951934349355215, + "loss": 3.7803802490234375, + "step": 964, + "token_acc": 0.2310104062999735 + }, + { + "epoch": 0.5658164761067136, + "grad_norm": 2.0456848023221528, + "learning_rate": 0.00016969519343493553, + "loss": 3.8123297691345215, + "step": 965, + "token_acc": 0.22908259662709057 + }, + { + "epoch": 0.5664028144239226, + "grad_norm": 2.0004497909113788, + "learning_rate": 0.00016987104337631888, + "loss": 3.8176627159118652, + "step": 966, + "token_acc": 0.23051391374577732 + }, + { + "epoch": 0.5669891527411316, + "grad_norm": 1.6749179829807868, + "learning_rate": 0.00017004689331770223, + "loss": 3.865586757659912, + "step": 967, + "token_acc": 0.22201220415835765 + }, + { + "epoch": 0.5675754910583407, + "grad_norm": 1.43314195297598, + "learning_rate": 0.00017022274325908558, + "loss": 3.8020176887512207, + "step": 968, + "token_acc": 0.22984268895728757 + }, + { + "epoch": 0.5681618293755497, + "grad_norm": 1.8604374951102054, + "learning_rate": 0.0001703985932004689, + "loss": 3.7938385009765625, + "step": 969, + "token_acc": 0.2336951736781865 + }, + { + "epoch": 0.5687481676927587, + "grad_norm": 2.058462608515411, + "learning_rate": 0.00017057444314185226, + "loss": 3.8569295406341553, + "step": 970, + "token_acc": 0.2216671669861014 + }, + { + "epoch": 0.5693345060099677, + "grad_norm": 1.5182828879918648, + "learning_rate": 0.0001707502930832356, + "loss": 3.7685647010803223, + "step": 971, + "token_acc": 0.23434252893801036 + }, + { + "epoch": 0.5699208443271768, + "grad_norm": 1.876005075206123, + "learning_rate": 0.00017092614302461896, + "loss": 3.760934829711914, + "step": 972, + "token_acc": 0.23547494620350445 + }, + { + "epoch": 0.5705071826443858, + "grad_norm": 1.5505384496975014, + "learning_rate": 0.0001711019929660023, + "loss": 3.8039684295654297, + "step": 973, + "token_acc": 0.22771953432640576 + }, + { + "epoch": 0.5710935209615948, + "grad_norm": 1.8705460505803075, + "learning_rate": 0.0001712778429073857, + "loss": 3.7606213092803955, + "step": 974, + "token_acc": 0.23188597824420837 + }, + { + "epoch": 0.5716798592788038, + "grad_norm": 1.7528434236491515, + "learning_rate": 0.00017145369284876904, + "loss": 3.761270523071289, + "step": 975, + "token_acc": 0.23444719557424942 + }, + { + "epoch": 0.5722661975960129, + "grad_norm": 2.0821731093703497, + "learning_rate": 0.0001716295427901524, + "loss": 3.7339396476745605, + "step": 976, + "token_acc": 0.2378353388722311 + }, + { + "epoch": 0.5728525359132219, + "grad_norm": 1.9691817522107247, + "learning_rate": 0.00017180539273153574, + "loss": 3.732114553451538, + "step": 977, + "token_acc": 0.238153206172103 + }, + { + "epoch": 0.5734388742304309, + "grad_norm": 2.4070037941293068, + "learning_rate": 0.0001719812426729191, + "loss": 3.7604122161865234, + "step": 978, + "token_acc": 0.23372010539116336 + }, + { + "epoch": 0.5740252125476399, + "grad_norm": 1.6260488737434329, + "learning_rate": 0.00017215709261430244, + "loss": 3.8127951622009277, + "step": 979, + "token_acc": 0.22550215952647382 + }, + { + "epoch": 0.574611550864849, + "grad_norm": 1.9366604197758266, + "learning_rate": 0.0001723329425556858, + "loss": 3.763200044631958, + "step": 980, + "token_acc": 0.23357217573221759 + }, + { + "epoch": 0.575197889182058, + "grad_norm": 2.0556119984650834, + "learning_rate": 0.00017250879249706917, + "loss": 3.8390207290649414, + "step": 981, + "token_acc": 0.2248840603714209 + }, + { + "epoch": 0.575784227499267, + "grad_norm": 1.772028914817992, + "learning_rate": 0.00017268464243845252, + "loss": 3.772087574005127, + "step": 982, + "token_acc": 0.23338080091563634 + }, + { + "epoch": 0.576370565816476, + "grad_norm": 1.4915034547494663, + "learning_rate": 0.00017286049237983587, + "loss": 3.791701078414917, + "step": 983, + "token_acc": 0.2299218870445552 + }, + { + "epoch": 0.5769569041336852, + "grad_norm": 1.7591559593946273, + "learning_rate": 0.00017303634232121922, + "loss": 3.71233868598938, + "step": 984, + "token_acc": 0.23830011919449615 + }, + { + "epoch": 0.5775432424508942, + "grad_norm": 1.7977195385366684, + "learning_rate": 0.00017321219226260257, + "loss": 3.7185568809509277, + "step": 985, + "token_acc": 0.23748368930705308 + }, + { + "epoch": 0.5781295807681032, + "grad_norm": 2.032938491014217, + "learning_rate": 0.00017338804220398593, + "loss": 3.732858657836914, + "step": 986, + "token_acc": 0.23777224019303106 + }, + { + "epoch": 0.5787159190853123, + "grad_norm": 1.6908459291620666, + "learning_rate": 0.00017356389214536925, + "loss": 3.807938814163208, + "step": 987, + "token_acc": 0.22720925697198205 + }, + { + "epoch": 0.5793022574025213, + "grad_norm": 1.9746385185012727, + "learning_rate": 0.0001737397420867526, + "loss": 3.789818286895752, + "step": 988, + "token_acc": 0.23036192354658744 + }, + { + "epoch": 0.5798885957197303, + "grad_norm": 1.7685715705150378, + "learning_rate": 0.00017391559202813595, + "loss": 3.7408945560455322, + "step": 989, + "token_acc": 0.23466769868750528 + }, + { + "epoch": 0.5804749340369393, + "grad_norm": 2.085513015081606, + "learning_rate": 0.00017409144196951933, + "loss": 3.779965400695801, + "step": 990, + "token_acc": 0.23080559853564314 + }, + { + "epoch": 0.5810612723541484, + "grad_norm": 1.97289393508748, + "learning_rate": 0.00017426729191090268, + "loss": 3.73659610748291, + "step": 991, + "token_acc": 0.23539004534212696 + }, + { + "epoch": 0.5816476106713574, + "grad_norm": 1.5719566316629405, + "learning_rate": 0.00017444314185228603, + "loss": 3.767144203186035, + "step": 992, + "token_acc": 0.231133577248078 + }, + { + "epoch": 0.5822339489885664, + "grad_norm": 2.221009110169234, + "learning_rate": 0.00017461899179366938, + "loss": 3.813162088394165, + "step": 993, + "token_acc": 0.22603453370172047 + }, + { + "epoch": 0.5828202873057754, + "grad_norm": 1.5729398388240954, + "learning_rate": 0.00017479484173505273, + "loss": 3.738103151321411, + "step": 994, + "token_acc": 0.23470266655182373 + }, + { + "epoch": 0.5834066256229845, + "grad_norm": 2.0343612603791104, + "learning_rate": 0.00017497069167643608, + "loss": 3.756946563720703, + "step": 995, + "token_acc": 0.2318222254497921 + }, + { + "epoch": 0.5839929639401935, + "grad_norm": 1.780455478426327, + "learning_rate": 0.00017514654161781943, + "loss": 3.763497829437256, + "step": 996, + "token_acc": 0.23103976772785306 + }, + { + "epoch": 0.5845793022574025, + "grad_norm": 1.6800980913892105, + "learning_rate": 0.0001753223915592028, + "loss": 3.753526210784912, + "step": 997, + "token_acc": 0.2321179654670696 + }, + { + "epoch": 0.5851656405746115, + "grad_norm": 2.265499265235378, + "learning_rate": 0.00017549824150058616, + "loss": 3.7691006660461426, + "step": 998, + "token_acc": 0.22900453955901426 + }, + { + "epoch": 0.5857519788918206, + "grad_norm": 1.6240050212863657, + "learning_rate": 0.00017567409144196951, + "loss": 3.7468395233154297, + "step": 999, + "token_acc": 0.23339079070451846 + }, + { + "epoch": 0.5863383172090296, + "grad_norm": 2.2018163113534612, + "learning_rate": 0.00017584994138335287, + "loss": 3.7238576412200928, + "step": 1000, + "token_acc": 0.23595961924924083 + }, + { + "epoch": 0.5869246555262386, + "grad_norm": 1.6378836151373348, + "learning_rate": 0.00017602579132473622, + "loss": 3.7456817626953125, + "step": 1001, + "token_acc": 0.23389593975459477 + }, + { + "epoch": 0.5875109938434476, + "grad_norm": 2.28179104386628, + "learning_rate": 0.00017620164126611957, + "loss": 3.812601089477539, + "step": 1002, + "token_acc": 0.22701729235274462 + }, + { + "epoch": 0.5880973321606567, + "grad_norm": 1.8069343466352854, + "learning_rate": 0.00017637749120750292, + "loss": 3.777170181274414, + "step": 1003, + "token_acc": 0.22912079498592827 + }, + { + "epoch": 0.5886836704778657, + "grad_norm": 1.8631877657548856, + "learning_rate": 0.0001765533411488863, + "loss": 3.7404391765594482, + "step": 1004, + "token_acc": 0.23333145963655635 + }, + { + "epoch": 0.5892700087950747, + "grad_norm": 1.7310119191640232, + "learning_rate": 0.0001767291910902696, + "loss": 3.7471632957458496, + "step": 1005, + "token_acc": 0.2318820998103318 + }, + { + "epoch": 0.5898563471122837, + "grad_norm": 1.6105529593792534, + "learning_rate": 0.00017690504103165297, + "loss": 3.741431713104248, + "step": 1006, + "token_acc": 0.23254195223558424 + }, + { + "epoch": 0.5904426854294929, + "grad_norm": 1.7639133298148444, + "learning_rate": 0.00017708089097303632, + "loss": 3.74269962310791, + "step": 1007, + "token_acc": 0.23368833585421506 + }, + { + "epoch": 0.5910290237467019, + "grad_norm": 2.024488540150968, + "learning_rate": 0.00017725674091441967, + "loss": 3.6688318252563477, + "step": 1008, + "token_acc": 0.241715951512101 + }, + { + "epoch": 0.5916153620639109, + "grad_norm": 2.0853110137348727, + "learning_rate": 0.00017743259085580302, + "loss": 3.6972496509552, + "step": 1009, + "token_acc": 0.23908795827385176 + }, + { + "epoch": 0.5922017003811199, + "grad_norm": 1.9697296507358653, + "learning_rate": 0.00017760844079718637, + "loss": 3.709904432296753, + "step": 1010, + "token_acc": 0.2352049776967273 + }, + { + "epoch": 0.592788038698329, + "grad_norm": 1.5305082980186744, + "learning_rate": 0.00017778429073856973, + "loss": 3.7503957748413086, + "step": 1011, + "token_acc": 0.2309733824521351 + }, + { + "epoch": 0.593374377015538, + "grad_norm": 1.7420866171252485, + "learning_rate": 0.00017796014067995308, + "loss": 3.666067600250244, + "step": 1012, + "token_acc": 0.2417316030365603 + }, + { + "epoch": 0.593960715332747, + "grad_norm": 1.754435111239744, + "learning_rate": 0.00017813599062133645, + "loss": 3.6906795501708984, + "step": 1013, + "token_acc": 0.23827355143870713 + }, + { + "epoch": 0.594547053649956, + "grad_norm": 1.5697168034186035, + "learning_rate": 0.0001783118405627198, + "loss": 3.729680061340332, + "step": 1014, + "token_acc": 0.2347010557082746 + }, + { + "epoch": 0.5951333919671651, + "grad_norm": 2.116488133678572, + "learning_rate": 0.00017848769050410316, + "loss": 3.6820173263549805, + "step": 1015, + "token_acc": 0.2389939750963643 + }, + { + "epoch": 0.5957197302843741, + "grad_norm": 1.8672920267123638, + "learning_rate": 0.0001786635404454865, + "loss": 3.724274158477783, + "step": 1016, + "token_acc": 0.2332079088786238 + }, + { + "epoch": 0.5963060686015831, + "grad_norm": 1.657482172222701, + "learning_rate": 0.00017883939038686986, + "loss": 3.7396392822265625, + "step": 1017, + "token_acc": 0.23240976745649644 + }, + { + "epoch": 0.5968924069187922, + "grad_norm": 2.088800459388359, + "learning_rate": 0.0001790152403282532, + "loss": 3.7212891578674316, + "step": 1018, + "token_acc": 0.23728237360001714 + }, + { + "epoch": 0.5974787452360012, + "grad_norm": 1.6670460566976946, + "learning_rate": 0.00017919109026963656, + "loss": 3.6965465545654297, + "step": 1019, + "token_acc": 0.23797888513069232 + }, + { + "epoch": 0.5980650835532102, + "grad_norm": 2.0990693646240994, + "learning_rate": 0.00017936694021101994, + "loss": 3.704850912094116, + "step": 1020, + "token_acc": 0.2369784629430917 + }, + { + "epoch": 0.5986514218704192, + "grad_norm": 1.555766524748678, + "learning_rate": 0.0001795427901524033, + "loss": 3.713559865951538, + "step": 1021, + "token_acc": 0.23499892092597552 + }, + { + "epoch": 0.5992377601876283, + "grad_norm": 1.9341881331459105, + "learning_rate": 0.0001797186400937866, + "loss": 3.6927831172943115, + "step": 1022, + "token_acc": 0.2388497459018265 + }, + { + "epoch": 0.5998240985048373, + "grad_norm": 1.5790932430517346, + "learning_rate": 0.00017989449003516996, + "loss": 3.6795473098754883, + "step": 1023, + "token_acc": 0.24034447449260904 + }, + { + "epoch": 0.6004104368220463, + "grad_norm": 2.1328626002951987, + "learning_rate": 0.00018007033997655331, + "loss": 3.679569721221924, + "step": 1024, + "token_acc": 0.2380116269532051 + }, + { + "epoch": 0.6009967751392553, + "grad_norm": 1.9213149808541714, + "learning_rate": 0.00018024618991793666, + "loss": 3.6670780181884766, + "step": 1025, + "token_acc": 0.23923851256260337 + }, + { + "epoch": 0.6015831134564644, + "grad_norm": 1.7268151431963263, + "learning_rate": 0.00018042203985932002, + "loss": 3.6794185638427734, + "step": 1026, + "token_acc": 0.2390296501440673 + }, + { + "epoch": 0.6021694517736734, + "grad_norm": 1.7862344592153199, + "learning_rate": 0.00018059788980070337, + "loss": 3.689814567565918, + "step": 1027, + "token_acc": 0.23739270313746164 + }, + { + "epoch": 0.6027557900908824, + "grad_norm": 1.408158527334921, + "learning_rate": 0.00018077373974208674, + "loss": 3.6734933853149414, + "step": 1028, + "token_acc": 0.2403965814432246 + }, + { + "epoch": 0.6033421284080914, + "grad_norm": 1.6313996795919927, + "learning_rate": 0.0001809495896834701, + "loss": 3.7151341438293457, + "step": 1029, + "token_acc": 0.23451085368897953 + }, + { + "epoch": 0.6039284667253005, + "grad_norm": 2.028145277378289, + "learning_rate": 0.00018112543962485345, + "loss": 3.6773934364318848, + "step": 1030, + "token_acc": 0.23917269304132632 + }, + { + "epoch": 0.6045148050425095, + "grad_norm": 1.7517553346273191, + "learning_rate": 0.0001813012895662368, + "loss": 3.7363381385803223, + "step": 1031, + "token_acc": 0.23308443964873776 + }, + { + "epoch": 0.6051011433597185, + "grad_norm": 2.3260384753709644, + "learning_rate": 0.00018147713950762015, + "loss": 3.682816982269287, + "step": 1032, + "token_acc": 0.23894650039905288 + }, + { + "epoch": 0.6056874816769275, + "grad_norm": 1.590409744856486, + "learning_rate": 0.0001816529894490035, + "loss": 3.698338508605957, + "step": 1033, + "token_acc": 0.23675260240352064 + }, + { + "epoch": 0.6062738199941367, + "grad_norm": 1.9647215600580257, + "learning_rate": 0.00018182883939038685, + "loss": 3.7348413467407227, + "step": 1034, + "token_acc": 0.23381419503750722 + }, + { + "epoch": 0.6068601583113457, + "grad_norm": 1.6307737572790548, + "learning_rate": 0.00018200468933177023, + "loss": 3.6462979316711426, + "step": 1035, + "token_acc": 0.24204562518929898 + }, + { + "epoch": 0.6074464966285547, + "grad_norm": 2.2673121932280202, + "learning_rate": 0.00018218053927315358, + "loss": 3.691218376159668, + "step": 1036, + "token_acc": 0.23765262246002258 + }, + { + "epoch": 0.6080328349457637, + "grad_norm": 1.3696256691750233, + "learning_rate": 0.00018235638921453693, + "loss": 3.693535089492798, + "step": 1037, + "token_acc": 0.23815732174201115 + }, + { + "epoch": 0.6086191732629728, + "grad_norm": 1.692119290208484, + "learning_rate": 0.00018253223915592028, + "loss": 3.665832757949829, + "step": 1038, + "token_acc": 0.2424768155079654 + }, + { + "epoch": 0.6092055115801818, + "grad_norm": 1.6373115228902064, + "learning_rate": 0.00018270808909730363, + "loss": 3.656285285949707, + "step": 1039, + "token_acc": 0.2417843345761936 + }, + { + "epoch": 0.6097918498973908, + "grad_norm": 1.5045690498071511, + "learning_rate": 0.00018288393903868696, + "loss": 3.6519041061401367, + "step": 1040, + "token_acc": 0.24315208174568248 + }, + { + "epoch": 0.6103781882145998, + "grad_norm": 1.472894308715283, + "learning_rate": 0.0001830597889800703, + "loss": 3.6748061180114746, + "step": 1041, + "token_acc": 0.24159138061626617 + }, + { + "epoch": 0.6109645265318089, + "grad_norm": 1.917801013938851, + "learning_rate": 0.00018323563892145366, + "loss": 3.6534295082092285, + "step": 1042, + "token_acc": 0.24099850572124537 + }, + { + "epoch": 0.6115508648490179, + "grad_norm": 1.2835467482420932, + "learning_rate": 0.000183411488862837, + "loss": 3.679368495941162, + "step": 1043, + "token_acc": 0.23895935009895655 + }, + { + "epoch": 0.6121372031662269, + "grad_norm": 2.029075993248839, + "learning_rate": 0.00018358733880422039, + "loss": 3.6091785430908203, + "step": 1044, + "token_acc": 0.2475746132420901 + }, + { + "epoch": 0.612723541483436, + "grad_norm": 1.9608272585512443, + "learning_rate": 0.00018376318874560374, + "loss": 3.6709554195404053, + "step": 1045, + "token_acc": 0.24014582389446734 + }, + { + "epoch": 0.613309879800645, + "grad_norm": 1.5798332645599171, + "learning_rate": 0.0001839390386869871, + "loss": 3.655616283416748, + "step": 1046, + "token_acc": 0.241416062746174 + }, + { + "epoch": 0.613896218117854, + "grad_norm": 2.2355287749237194, + "learning_rate": 0.00018411488862837044, + "loss": 3.693601131439209, + "step": 1047, + "token_acc": 0.23598704514745084 + }, + { + "epoch": 0.614482556435063, + "grad_norm": 1.591001188998099, + "learning_rate": 0.0001842907385697538, + "loss": 3.6287569999694824, + "step": 1048, + "token_acc": 0.24424157776493666 + }, + { + "epoch": 0.6150688947522721, + "grad_norm": 1.7694997501888765, + "learning_rate": 0.00018446658851113714, + "loss": 3.659522533416748, + "step": 1049, + "token_acc": 0.2394644289504535 + }, + { + "epoch": 0.6156552330694811, + "grad_norm": 1.9080366732285265, + "learning_rate": 0.0001846424384525205, + "loss": 3.722494125366211, + "step": 1050, + "token_acc": 0.23213453163904016 + }, + { + "epoch": 0.6162415713866901, + "grad_norm": 1.9899739540283183, + "learning_rate": 0.00018481828839390387, + "loss": 3.6280910968780518, + "step": 1051, + "token_acc": 0.24328213095055012 + }, + { + "epoch": 0.6168279097038991, + "grad_norm": 1.447630401827886, + "learning_rate": 0.00018499413833528722, + "loss": 3.714254856109619, + "step": 1052, + "token_acc": 0.23404759915592482 + }, + { + "epoch": 0.6174142480211082, + "grad_norm": 1.8101851488016054, + "learning_rate": 0.00018516998827667057, + "loss": 3.641073226928711, + "step": 1053, + "token_acc": 0.24086692705595122 + }, + { + "epoch": 0.6180005863383172, + "grad_norm": 1.4578780084340344, + "learning_rate": 0.00018534583821805392, + "loss": 3.684634208679199, + "step": 1054, + "token_acc": 0.2376295647930862 + }, + { + "epoch": 0.6185869246555262, + "grad_norm": 1.4171255394024795, + "learning_rate": 0.00018552168815943727, + "loss": 3.6276702880859375, + "step": 1055, + "token_acc": 0.2427771930878332 + }, + { + "epoch": 0.6191732629727352, + "grad_norm": 1.4895494083038259, + "learning_rate": 0.00018569753810082062, + "loss": 3.659527540206909, + "step": 1056, + "token_acc": 0.24059873154759448 + }, + { + "epoch": 0.6197596012899443, + "grad_norm": 1.8089027556266084, + "learning_rate": 0.00018587338804220397, + "loss": 3.662781000137329, + "step": 1057, + "token_acc": 0.24168235542258087 + }, + { + "epoch": 0.6203459396071533, + "grad_norm": 1.351720852943308, + "learning_rate": 0.0001860492379835873, + "loss": 3.641920566558838, + "step": 1058, + "token_acc": 0.2428657342359603 + }, + { + "epoch": 0.6209322779243623, + "grad_norm": 1.8125143419574568, + "learning_rate": 0.00018622508792497065, + "loss": 3.6598668098449707, + "step": 1059, + "token_acc": 0.24041304608178418 + }, + { + "epoch": 0.6215186162415713, + "grad_norm": 1.3673553699320966, + "learning_rate": 0.00018640093786635403, + "loss": 3.65216064453125, + "step": 1060, + "token_acc": 0.23978535367012135 + }, + { + "epoch": 0.6221049545587805, + "grad_norm": 2.2542466699915336, + "learning_rate": 0.00018657678780773738, + "loss": 3.6763997077941895, + "step": 1061, + "token_acc": 0.2391366067771996 + }, + { + "epoch": 0.6226912928759895, + "grad_norm": 1.3538918594347018, + "learning_rate": 0.00018675263774912073, + "loss": 3.6599977016448975, + "step": 1062, + "token_acc": 0.23938073813094643 + }, + { + "epoch": 0.6232776311931985, + "grad_norm": 1.4829739761686875, + "learning_rate": 0.00018692848769050408, + "loss": 3.636746644973755, + "step": 1063, + "token_acc": 0.24287322295444516 + }, + { + "epoch": 0.6238639695104075, + "grad_norm": 1.748101916192368, + "learning_rate": 0.00018710433763188743, + "loss": 3.645778179168701, + "step": 1064, + "token_acc": 0.24184308710219526 + }, + { + "epoch": 0.6244503078276166, + "grad_norm": 1.7216664092839264, + "learning_rate": 0.00018728018757327078, + "loss": 3.697787284851074, + "step": 1065, + "token_acc": 0.2372536064310098 + }, + { + "epoch": 0.6250366461448256, + "grad_norm": 2.0037505338687174, + "learning_rate": 0.00018745603751465413, + "loss": 3.711813449859619, + "step": 1066, + "token_acc": 0.23334084492106205 + }, + { + "epoch": 0.6256229844620346, + "grad_norm": 1.5125001231386015, + "learning_rate": 0.0001876318874560375, + "loss": 3.677464485168457, + "step": 1067, + "token_acc": 0.23677737437968593 + }, + { + "epoch": 0.6262093227792436, + "grad_norm": 1.7578623567318084, + "learning_rate": 0.00018780773739742086, + "loss": 3.688357353210449, + "step": 1068, + "token_acc": 0.2363699147964349 + }, + { + "epoch": 0.6267956610964527, + "grad_norm": 2.0244761456840568, + "learning_rate": 0.0001879835873388042, + "loss": 3.589259624481201, + "step": 1069, + "token_acc": 0.24947612333847974 + }, + { + "epoch": 0.6273819994136617, + "grad_norm": 1.555361145103413, + "learning_rate": 0.00018815943728018756, + "loss": 3.671065330505371, + "step": 1070, + "token_acc": 0.23687182823682498 + }, + { + "epoch": 0.6279683377308707, + "grad_norm": 1.951366711556471, + "learning_rate": 0.00018833528722157091, + "loss": 3.684986114501953, + "step": 1071, + "token_acc": 0.23795057469478242 + }, + { + "epoch": 0.6285546760480798, + "grad_norm": 1.3478851758346075, + "learning_rate": 0.00018851113716295426, + "loss": 3.6856579780578613, + "step": 1072, + "token_acc": 0.2373701700030501 + }, + { + "epoch": 0.6291410143652888, + "grad_norm": 1.6806400160384543, + "learning_rate": 0.00018868698710433762, + "loss": 3.6672234535217285, + "step": 1073, + "token_acc": 0.2386756230416714 + }, + { + "epoch": 0.6297273526824978, + "grad_norm": 1.4137867333499683, + "learning_rate": 0.000188862837045721, + "loss": 3.689209461212158, + "step": 1074, + "token_acc": 0.23769485812679747 + }, + { + "epoch": 0.6303136909997068, + "grad_norm": 1.6981345647704025, + "learning_rate": 0.00018903868698710434, + "loss": 3.621795654296875, + "step": 1075, + "token_acc": 0.24479109793827836 + }, + { + "epoch": 0.6309000293169159, + "grad_norm": 1.9866745076502357, + "learning_rate": 0.00018921453692848767, + "loss": 3.642712116241455, + "step": 1076, + "token_acc": 0.24154126055880443 + }, + { + "epoch": 0.6314863676341249, + "grad_norm": 1.976887452629786, + "learning_rate": 0.00018939038686987102, + "loss": 3.6317319869995117, + "step": 1077, + "token_acc": 0.24141695793897028 + }, + { + "epoch": 0.6320727059513339, + "grad_norm": 1.7329902202194336, + "learning_rate": 0.00018956623681125437, + "loss": 3.6605727672576904, + "step": 1078, + "token_acc": 0.24026168531492628 + }, + { + "epoch": 0.6326590442685429, + "grad_norm": 1.4726478368834104, + "learning_rate": 0.00018974208675263772, + "loss": 3.6747255325317383, + "step": 1079, + "token_acc": 0.23477749759247646 + }, + { + "epoch": 0.633245382585752, + "grad_norm": 1.7585814971059355, + "learning_rate": 0.00018991793669402107, + "loss": 3.6440324783325195, + "step": 1080, + "token_acc": 0.24026081611929131 + }, + { + "epoch": 0.633831720902961, + "grad_norm": 1.7674480720415773, + "learning_rate": 0.00019009378663540442, + "loss": 3.6040310859680176, + "step": 1081, + "token_acc": 0.2475406848066658 + }, + { + "epoch": 0.63441805922017, + "grad_norm": 1.5517546055726985, + "learning_rate": 0.0001902696365767878, + "loss": 3.6312811374664307, + "step": 1082, + "token_acc": 0.24221097517235882 + }, + { + "epoch": 0.635004397537379, + "grad_norm": 1.7430612793323368, + "learning_rate": 0.00019044548651817115, + "loss": 3.7135157585144043, + "step": 1083, + "token_acc": 0.23036504350489043 + }, + { + "epoch": 0.6355907358545881, + "grad_norm": 2.040381816418502, + "learning_rate": 0.0001906213364595545, + "loss": 3.657586097717285, + "step": 1084, + "token_acc": 0.24113301401118153 + }, + { + "epoch": 0.6361770741717971, + "grad_norm": 1.556818247580028, + "learning_rate": 0.00019079718640093785, + "loss": 3.674015760421753, + "step": 1085, + "token_acc": 0.2372200040603641 + }, + { + "epoch": 0.6367634124890061, + "grad_norm": 2.269620149573287, + "learning_rate": 0.0001909730363423212, + "loss": 3.6781411170959473, + "step": 1086, + "token_acc": 0.23656198212669913 + }, + { + "epoch": 0.6373497508062151, + "grad_norm": 1.7357260290331344, + "learning_rate": 0.00019114888628370456, + "loss": 3.654287338256836, + "step": 1087, + "token_acc": 0.2386494909949324 + }, + { + "epoch": 0.6379360891234243, + "grad_norm": 1.9773427735767597, + "learning_rate": 0.0001913247362250879, + "loss": 3.639333963394165, + "step": 1088, + "token_acc": 0.24030843219460793 + }, + { + "epoch": 0.6385224274406333, + "grad_norm": 1.5183334730919866, + "learning_rate": 0.00019150058616647126, + "loss": 3.646763324737549, + "step": 1089, + "token_acc": 0.24163768412438624 + }, + { + "epoch": 0.6391087657578423, + "grad_norm": 1.6136655963608397, + "learning_rate": 0.00019167643610785463, + "loss": 3.6259384155273438, + "step": 1090, + "token_acc": 0.24393340668031388 + }, + { + "epoch": 0.6396951040750513, + "grad_norm": 1.7046718930028641, + "learning_rate": 0.00019185228604923799, + "loss": 3.6054673194885254, + "step": 1091, + "token_acc": 0.24627198951426857 + }, + { + "epoch": 0.6402814423922604, + "grad_norm": 1.3540144432232415, + "learning_rate": 0.00019202813599062134, + "loss": 3.6118674278259277, + "step": 1092, + "token_acc": 0.24329294288664136 + }, + { + "epoch": 0.6408677807094694, + "grad_norm": 1.4413944342971239, + "learning_rate": 0.00019220398593200466, + "loss": 3.6929872035980225, + "step": 1093, + "token_acc": 0.2353969362149911 + }, + { + "epoch": 0.6414541190266784, + "grad_norm": 1.7950871278456584, + "learning_rate": 0.000192379835873388, + "loss": 3.623234748840332, + "step": 1094, + "token_acc": 0.24344740572953516 + }, + { + "epoch": 0.6420404573438874, + "grad_norm": 1.6697604715179724, + "learning_rate": 0.00019255568581477136, + "loss": 3.5792489051818848, + "step": 1095, + "token_acc": 0.2470687203421172 + }, + { + "epoch": 0.6426267956610965, + "grad_norm": 2.039536708982461, + "learning_rate": 0.0001927315357561547, + "loss": 3.612475872039795, + "step": 1096, + "token_acc": 0.24463368341136277 + }, + { + "epoch": 0.6432131339783055, + "grad_norm": 1.6127142992973633, + "learning_rate": 0.00019290738569753806, + "loss": 3.606299638748169, + "step": 1097, + "token_acc": 0.24438323038241083 + }, + { + "epoch": 0.6437994722955145, + "grad_norm": 1.7625034583598775, + "learning_rate": 0.00019308323563892144, + "loss": 3.6384198665618896, + "step": 1098, + "token_acc": 0.24053047297002822 + }, + { + "epoch": 0.6443858106127235, + "grad_norm": 1.2513562102792075, + "learning_rate": 0.0001932590855803048, + "loss": 3.6397864818573, + "step": 1099, + "token_acc": 0.24160905220037274 + }, + { + "epoch": 0.6449721489299326, + "grad_norm": 1.6512767352844167, + "learning_rate": 0.00019343493552168814, + "loss": 3.67323637008667, + "step": 1100, + "token_acc": 0.2357519688049545 + }, + { + "epoch": 0.6455584872471416, + "grad_norm": 1.8225156712466453, + "learning_rate": 0.0001936107854630715, + "loss": 3.6741366386413574, + "step": 1101, + "token_acc": 0.2391971488724046 + }, + { + "epoch": 0.6461448255643506, + "grad_norm": 1.6907654254018762, + "learning_rate": 0.00019378663540445485, + "loss": 3.6123390197753906, + "step": 1102, + "token_acc": 0.24138375590889558 + }, + { + "epoch": 0.6467311638815597, + "grad_norm": 1.7215367096404044, + "learning_rate": 0.0001939624853458382, + "loss": 3.6283833980560303, + "step": 1103, + "token_acc": 0.24226162319691608 + }, + { + "epoch": 0.6473175021987687, + "grad_norm": 1.3985995856220952, + "learning_rate": 0.00019413833528722155, + "loss": 3.6100409030914307, + "step": 1104, + "token_acc": 0.24666047537661515 + }, + { + "epoch": 0.6479038405159777, + "grad_norm": 2.0689539464013955, + "learning_rate": 0.00019431418522860493, + "loss": 3.6854305267333984, + "step": 1105, + "token_acc": 0.23523078526030633 + }, + { + "epoch": 0.6484901788331867, + "grad_norm": 1.2157216193055542, + "learning_rate": 0.00019449003516998828, + "loss": 3.612168312072754, + "step": 1106, + "token_acc": 0.24397058589628717 + }, + { + "epoch": 0.6490765171503958, + "grad_norm": 2.011225265689044, + "learning_rate": 0.00019466588511137163, + "loss": 3.645557165145874, + "step": 1107, + "token_acc": 0.24132552895656384 + }, + { + "epoch": 0.6496628554676048, + "grad_norm": 1.4751809413167396, + "learning_rate": 0.00019484173505275498, + "loss": 3.6708264350891113, + "step": 1108, + "token_acc": 0.23719865422073816 + }, + { + "epoch": 0.6502491937848138, + "grad_norm": 1.8382469128246788, + "learning_rate": 0.00019501758499413833, + "loss": 3.5735676288604736, + "step": 1109, + "token_acc": 0.2489281631097561 + }, + { + "epoch": 0.6508355321020228, + "grad_norm": 1.3940802207683847, + "learning_rate": 0.00019519343493552168, + "loss": 3.6366333961486816, + "step": 1110, + "token_acc": 0.24249216659694872 + }, + { + "epoch": 0.6514218704192319, + "grad_norm": 1.657597656876021, + "learning_rate": 0.000195369284876905, + "loss": 3.5903561115264893, + "step": 1111, + "token_acc": 0.244454957618766 + }, + { + "epoch": 0.6520082087364409, + "grad_norm": 1.6175099314586001, + "learning_rate": 0.00019554513481828835, + "loss": 3.6336400508880615, + "step": 1112, + "token_acc": 0.24179001828782218 + }, + { + "epoch": 0.6525945470536499, + "grad_norm": 1.470480683499547, + "learning_rate": 0.0001957209847596717, + "loss": 3.5976552963256836, + "step": 1113, + "token_acc": 0.24569830395401945 + }, + { + "epoch": 0.6531808853708589, + "grad_norm": 2.1813622949923626, + "learning_rate": 0.00019589683470105508, + "loss": 3.6614279747009277, + "step": 1114, + "token_acc": 0.24019228690003908 + }, + { + "epoch": 0.653767223688068, + "grad_norm": 1.5860689099835066, + "learning_rate": 0.00019607268464243843, + "loss": 3.63978910446167, + "step": 1115, + "token_acc": 0.24227566265819628 + }, + { + "epoch": 0.6543535620052771, + "grad_norm": 1.6480716077173239, + "learning_rate": 0.00019624853458382179, + "loss": 3.6350765228271484, + "step": 1116, + "token_acc": 0.23837553165774356 + }, + { + "epoch": 0.6549399003224861, + "grad_norm": 1.4176358370845636, + "learning_rate": 0.00019642438452520514, + "loss": 3.5681934356689453, + "step": 1117, + "token_acc": 0.24877584755013568 + }, + { + "epoch": 0.6555262386396951, + "grad_norm": 1.516793444920232, + "learning_rate": 0.0001966002344665885, + "loss": 3.673952102661133, + "step": 1118, + "token_acc": 0.2350283096839975 + }, + { + "epoch": 0.6561125769569042, + "grad_norm": 1.337234260253422, + "learning_rate": 0.00019677608440797184, + "loss": 3.6203818321228027, + "step": 1119, + "token_acc": 0.2439845069435977 + }, + { + "epoch": 0.6566989152741132, + "grad_norm": 1.2978707681197799, + "learning_rate": 0.0001969519343493552, + "loss": 3.5939743518829346, + "step": 1120, + "token_acc": 0.24607298556314597 + }, + { + "epoch": 0.6572852535913222, + "grad_norm": 1.6691223741547296, + "learning_rate": 0.00019712778429073857, + "loss": 3.6283607482910156, + "step": 1121, + "token_acc": 0.24155875527778564 + }, + { + "epoch": 0.6578715919085312, + "grad_norm": 1.5328612206431984, + "learning_rate": 0.00019730363423212192, + "loss": 3.661714553833008, + "step": 1122, + "token_acc": 0.23627960263029865 + }, + { + "epoch": 0.6584579302257403, + "grad_norm": 1.6956253356889974, + "learning_rate": 0.00019747948417350527, + "loss": 3.649235725402832, + "step": 1123, + "token_acc": 0.23973618822016637 + }, + { + "epoch": 0.6590442685429493, + "grad_norm": 1.385611834215062, + "learning_rate": 0.00019765533411488862, + "loss": 3.6056106090545654, + "step": 1124, + "token_acc": 0.24425279978638953 + }, + { + "epoch": 0.6596306068601583, + "grad_norm": 1.6153349029702886, + "learning_rate": 0.00019783118405627197, + "loss": 3.5426769256591797, + "step": 1125, + "token_acc": 0.2524058340488067 + }, + { + "epoch": 0.6602169451773673, + "grad_norm": 1.60034514549438, + "learning_rate": 0.00019800703399765532, + "loss": 3.610332489013672, + "step": 1126, + "token_acc": 0.24038042033142212 + }, + { + "epoch": 0.6608032834945764, + "grad_norm": 1.450280750845652, + "learning_rate": 0.00019818288393903867, + "loss": 3.6011390686035156, + "step": 1127, + "token_acc": 0.24470297132638846 + }, + { + "epoch": 0.6613896218117854, + "grad_norm": 1.5520459264059963, + "learning_rate": 0.00019835873388042205, + "loss": 3.645643472671509, + "step": 1128, + "token_acc": 0.24071357710635938 + }, + { + "epoch": 0.6619759601289944, + "grad_norm": 1.4093507282595426, + "learning_rate": 0.00019853458382180535, + "loss": 3.5918097496032715, + "step": 1129, + "token_acc": 0.24674497689983607 + }, + { + "epoch": 0.6625622984462035, + "grad_norm": 1.2870749759372013, + "learning_rate": 0.00019871043376318872, + "loss": 3.579008102416992, + "step": 1130, + "token_acc": 0.2460683717638452 + }, + { + "epoch": 0.6631486367634125, + "grad_norm": 1.5116505122615504, + "learning_rate": 0.00019888628370457208, + "loss": 3.627666473388672, + "step": 1131, + "token_acc": 0.2396118058513041 + }, + { + "epoch": 0.6637349750806215, + "grad_norm": 1.644916560866073, + "learning_rate": 0.00019906213364595543, + "loss": 3.6390559673309326, + "step": 1132, + "token_acc": 0.2402390395769362 + }, + { + "epoch": 0.6643213133978305, + "grad_norm": 1.7802925249319201, + "learning_rate": 0.00019923798358733878, + "loss": 3.6603431701660156, + "step": 1133, + "token_acc": 0.23859199223284577 + }, + { + "epoch": 0.6649076517150396, + "grad_norm": 1.8720354744807326, + "learning_rate": 0.00019941383352872213, + "loss": 3.652681827545166, + "step": 1134, + "token_acc": 0.23852036998233903 + }, + { + "epoch": 0.6654939900322486, + "grad_norm": 1.2546442608542687, + "learning_rate": 0.00019958968347010548, + "loss": 3.5934860706329346, + "step": 1135, + "token_acc": 0.24600937747587442 + }, + { + "epoch": 0.6660803283494576, + "grad_norm": 1.7291911612479556, + "learning_rate": 0.00019976553341148883, + "loss": 3.571781873703003, + "step": 1136, + "token_acc": 0.24974058375564373 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.356670588130926, + "learning_rate": 0.0001999413833528722, + "loss": 3.6166539192199707, + "step": 1137, + "token_acc": 0.2420671408410662 + }, + { + "epoch": 0.6672530049838757, + "grad_norm": 1.7900598056883277, + "learning_rate": 0.00020011723329425556, + "loss": 3.593308448791504, + "step": 1138, + "token_acc": 0.2445009106939989 + }, + { + "epoch": 0.6678393433010847, + "grad_norm": 1.3943622144762045, + "learning_rate": 0.0002002930832356389, + "loss": 3.6098599433898926, + "step": 1139, + "token_acc": 0.24256794400247142 + }, + { + "epoch": 0.6684256816182937, + "grad_norm": 1.7776906887504345, + "learning_rate": 0.00020046893317702226, + "loss": 3.6094508171081543, + "step": 1140, + "token_acc": 0.24432004608583488 + }, + { + "epoch": 0.6690120199355027, + "grad_norm": 1.5104074155476455, + "learning_rate": 0.0002006447831184056, + "loss": 3.618520736694336, + "step": 1141, + "token_acc": 0.24201057955666866 + }, + { + "epoch": 0.6695983582527119, + "grad_norm": 1.4851445853862546, + "learning_rate": 0.00020082063305978896, + "loss": 3.5834691524505615, + "step": 1142, + "token_acc": 0.24682063231896365 + }, + { + "epoch": 0.6701846965699209, + "grad_norm": 1.5474687686537965, + "learning_rate": 0.0002009964830011723, + "loss": 3.592369318008423, + "step": 1143, + "token_acc": 0.24598476895568652 + }, + { + "epoch": 0.6707710348871299, + "grad_norm": 1.5860675010366894, + "learning_rate": 0.0002011723329425557, + "loss": 3.6278696060180664, + "step": 1144, + "token_acc": 0.24052580886428082 + }, + { + "epoch": 0.6713573732043389, + "grad_norm": 1.6020826511503836, + "learning_rate": 0.00020134818288393904, + "loss": 3.6181278228759766, + "step": 1145, + "token_acc": 0.24164455708232685 + }, + { + "epoch": 0.671943711521548, + "grad_norm": 1.4014698945420845, + "learning_rate": 0.0002015240328253224, + "loss": 3.558528423309326, + "step": 1146, + "token_acc": 0.2504937928743287 + }, + { + "epoch": 0.672530049838757, + "grad_norm": 1.4833780015077336, + "learning_rate": 0.00020169988276670572, + "loss": 3.5694637298583984, + "step": 1147, + "token_acc": 0.24811560383281017 + }, + { + "epoch": 0.673116388155966, + "grad_norm": 1.5150151091636253, + "learning_rate": 0.00020187573270808907, + "loss": 3.621929168701172, + "step": 1148, + "token_acc": 0.24057901662878334 + }, + { + "epoch": 0.673702726473175, + "grad_norm": 1.413507895436423, + "learning_rate": 0.00020205158264947242, + "loss": 3.6022233963012695, + "step": 1149, + "token_acc": 0.242924326177315 + }, + { + "epoch": 0.6742890647903841, + "grad_norm": 1.5325210819111854, + "learning_rate": 0.00020222743259085577, + "loss": 3.613931894302368, + "step": 1150, + "token_acc": 0.24302687267349957 + }, + { + "epoch": 0.6748754031075931, + "grad_norm": 1.4181193199346183, + "learning_rate": 0.00020240328253223912, + "loss": 3.588520050048828, + "step": 1151, + "token_acc": 0.24631008162526843 + }, + { + "epoch": 0.6754617414248021, + "grad_norm": 1.8156740867821797, + "learning_rate": 0.0002025791324736225, + "loss": 3.585069179534912, + "step": 1152, + "token_acc": 0.24621286904183853 + }, + { + "epoch": 0.6760480797420111, + "grad_norm": 1.519099142119853, + "learning_rate": 0.00020275498241500585, + "loss": 3.588890552520752, + "step": 1153, + "token_acc": 0.24348914506900032 + }, + { + "epoch": 0.6766344180592202, + "grad_norm": 2.284298859116246, + "learning_rate": 0.0002029308323563892, + "loss": 3.6400070190429688, + "step": 1154, + "token_acc": 0.2388069264901405 + }, + { + "epoch": 0.6772207563764292, + "grad_norm": 1.412118723214527, + "learning_rate": 0.00020310668229777255, + "loss": 3.5892252922058105, + "step": 1155, + "token_acc": 0.2460910084248711 + }, + { + "epoch": 0.6778070946936382, + "grad_norm": 1.4038632447176342, + "learning_rate": 0.0002032825322391559, + "loss": 3.617854356765747, + "step": 1156, + "token_acc": 0.24162731584198796 + }, + { + "epoch": 0.6783934330108473, + "grad_norm": 1.4735367884794843, + "learning_rate": 0.00020345838218053925, + "loss": 3.5383706092834473, + "step": 1157, + "token_acc": 0.25039964451958663 + }, + { + "epoch": 0.6789797713280563, + "grad_norm": 1.3508446984252374, + "learning_rate": 0.0002036342321219226, + "loss": 3.6222965717315674, + "step": 1158, + "token_acc": 0.24018951750390588 + }, + { + "epoch": 0.6795661096452653, + "grad_norm": 1.6147453261994797, + "learning_rate": 0.00020381008206330598, + "loss": 3.589507579803467, + "step": 1159, + "token_acc": 0.2461748835834134 + }, + { + "epoch": 0.6801524479624743, + "grad_norm": 1.6299601848386351, + "learning_rate": 0.00020398593200468933, + "loss": 3.5018553733825684, + "step": 1160, + "token_acc": 0.25406549485889246 + }, + { + "epoch": 0.6807387862796834, + "grad_norm": 1.6419355929097839, + "learning_rate": 0.00020416178194607268, + "loss": 3.636781692504883, + "step": 1161, + "token_acc": 0.23880242611949928 + }, + { + "epoch": 0.6813251245968924, + "grad_norm": 1.4434573456199389, + "learning_rate": 0.00020433763188745603, + "loss": 3.5546317100524902, + "step": 1162, + "token_acc": 0.2501913386514349 + }, + { + "epoch": 0.6819114629141014, + "grad_norm": 1.7254479135607967, + "learning_rate": 0.00020451348182883939, + "loss": 3.614981174468994, + "step": 1163, + "token_acc": 0.24098654802627437 + }, + { + "epoch": 0.6824978012313104, + "grad_norm": 1.9072933428320171, + "learning_rate": 0.0002046893317702227, + "loss": 3.5855712890625, + "step": 1164, + "token_acc": 0.24768261233786498 + }, + { + "epoch": 0.6830841395485195, + "grad_norm": 1.2978073301636897, + "learning_rate": 0.00020486518171160606, + "loss": 3.6052474975585938, + "step": 1165, + "token_acc": 0.24182108715763506 + }, + { + "epoch": 0.6836704778657285, + "grad_norm": 1.5958120732363972, + "learning_rate": 0.0002050410316529894, + "loss": 3.599301338195801, + "step": 1166, + "token_acc": 0.24437615147621186 + }, + { + "epoch": 0.6842568161829375, + "grad_norm": 1.5180439439214437, + "learning_rate": 0.00020521688159437276, + "loss": 3.622044324874878, + "step": 1167, + "token_acc": 0.24223399113347088 + }, + { + "epoch": 0.6848431545001465, + "grad_norm": 1.4818090899694143, + "learning_rate": 0.00020539273153575614, + "loss": 3.617368698120117, + "step": 1168, + "token_acc": 0.24405604228664182 + }, + { + "epoch": 0.6854294928173557, + "grad_norm": 1.347070011833193, + "learning_rate": 0.0002055685814771395, + "loss": 3.596374273300171, + "step": 1169, + "token_acc": 0.2420882549587698 + }, + { + "epoch": 0.6860158311345647, + "grad_norm": 1.803849208222528, + "learning_rate": 0.00020574443141852284, + "loss": 3.563307285308838, + "step": 1170, + "token_acc": 0.24695189651165794 + }, + { + "epoch": 0.6866021694517737, + "grad_norm": 1.5364140968501123, + "learning_rate": 0.0002059202813599062, + "loss": 3.5895957946777344, + "step": 1171, + "token_acc": 0.2443650122895845 + }, + { + "epoch": 0.6871885077689827, + "grad_norm": 1.6945213045144898, + "learning_rate": 0.00020609613130128954, + "loss": 3.592796564102173, + "step": 1172, + "token_acc": 0.24319801449991763 + }, + { + "epoch": 0.6877748460861918, + "grad_norm": 1.584319083932552, + "learning_rate": 0.0002062719812426729, + "loss": 3.58577036857605, + "step": 1173, + "token_acc": 0.24424933687002653 + }, + { + "epoch": 0.6883611844034008, + "grad_norm": 1.5081160231626272, + "learning_rate": 0.00020644783118405624, + "loss": 3.591670036315918, + "step": 1174, + "token_acc": 0.24489660758771417 + }, + { + "epoch": 0.6889475227206098, + "grad_norm": 1.5839956729679818, + "learning_rate": 0.00020662368112543962, + "loss": 3.609102725982666, + "step": 1175, + "token_acc": 0.24326734804760783 + }, + { + "epoch": 0.6895338610378188, + "grad_norm": 1.6374116490283572, + "learning_rate": 0.00020679953106682297, + "loss": 3.6066267490386963, + "step": 1176, + "token_acc": 0.24193709988715104 + }, + { + "epoch": 0.6901201993550279, + "grad_norm": 1.5307115289616988, + "learning_rate": 0.00020697538100820632, + "loss": 3.5727434158325195, + "step": 1177, + "token_acc": 0.24325560608848812 + }, + { + "epoch": 0.6907065376722369, + "grad_norm": 1.9058690211782219, + "learning_rate": 0.00020715123094958968, + "loss": 3.6209356784820557, + "step": 1178, + "token_acc": 0.24030822129729526 + }, + { + "epoch": 0.6912928759894459, + "grad_norm": 1.0561626946870013, + "learning_rate": 0.00020732708089097303, + "loss": 3.5838255882263184, + "step": 1179, + "token_acc": 0.2469547475887084 + }, + { + "epoch": 0.6918792143066549, + "grad_norm": 1.401383225761813, + "learning_rate": 0.00020750293083235638, + "loss": 3.564133644104004, + "step": 1180, + "token_acc": 0.2470333343493147 + }, + { + "epoch": 0.692465552623864, + "grad_norm": 1.5602268152648973, + "learning_rate": 0.00020767878077373973, + "loss": 3.618037223815918, + "step": 1181, + "token_acc": 0.24118230654455783 + }, + { + "epoch": 0.693051890941073, + "grad_norm": 1.6271596903446885, + "learning_rate": 0.00020785463071512305, + "loss": 3.5246148109436035, + "step": 1182, + "token_acc": 0.25002085761454623 + }, + { + "epoch": 0.693638229258282, + "grad_norm": 1.3045394781437147, + "learning_rate": 0.0002080304806565064, + "loss": 3.544750213623047, + "step": 1183, + "token_acc": 0.2496232633279483 + }, + { + "epoch": 0.6942245675754911, + "grad_norm": 1.6538527007391917, + "learning_rate": 0.00020820633059788978, + "loss": 3.5446419715881348, + "step": 1184, + "token_acc": 0.2500187660967813 + }, + { + "epoch": 0.6948109058927001, + "grad_norm": 1.3739265379760683, + "learning_rate": 0.00020838218053927313, + "loss": 3.572628974914551, + "step": 1185, + "token_acc": 0.24697953610956327 + }, + { + "epoch": 0.6953972442099091, + "grad_norm": 1.5753787047227474, + "learning_rate": 0.00020855803048065648, + "loss": 3.561522960662842, + "step": 1186, + "token_acc": 0.24755836342687462 + }, + { + "epoch": 0.6959835825271181, + "grad_norm": 1.2660102525575596, + "learning_rate": 0.00020873388042203983, + "loss": 3.59983491897583, + "step": 1187, + "token_acc": 0.24312022202342226 + }, + { + "epoch": 0.6965699208443272, + "grad_norm": 1.4852802655773794, + "learning_rate": 0.00020890973036342318, + "loss": 3.5600967407226562, + "step": 1188, + "token_acc": 0.24581967234460328 + }, + { + "epoch": 0.6971562591615362, + "grad_norm": 1.3948117157455504, + "learning_rate": 0.00020908558030480654, + "loss": 3.570460319519043, + "step": 1189, + "token_acc": 0.2457847772348982 + }, + { + "epoch": 0.6977425974787452, + "grad_norm": 1.3912107301734813, + "learning_rate": 0.00020926143024618989, + "loss": 3.6175618171691895, + "step": 1190, + "token_acc": 0.24230702399251486 + }, + { + "epoch": 0.6983289357959542, + "grad_norm": 1.5759277073923086, + "learning_rate": 0.00020943728018757326, + "loss": 3.5924034118652344, + "step": 1191, + "token_acc": 0.24593419385763687 + }, + { + "epoch": 0.6989152741131633, + "grad_norm": 1.39277861785629, + "learning_rate": 0.00020961313012895662, + "loss": 3.5540080070495605, + "step": 1192, + "token_acc": 0.24777259334493126 + }, + { + "epoch": 0.6995016124303723, + "grad_norm": 1.385638697451805, + "learning_rate": 0.00020978898007033997, + "loss": 3.5938775539398193, + "step": 1193, + "token_acc": 0.2435192618929286 + }, + { + "epoch": 0.7000879507475813, + "grad_norm": 1.8584368658135042, + "learning_rate": 0.00020996483001172332, + "loss": 3.549201488494873, + "step": 1194, + "token_acc": 0.24996201185230207 + }, + { + "epoch": 0.7006742890647903, + "grad_norm": 1.4064310655448726, + "learning_rate": 0.00021014067995310667, + "loss": 3.604276657104492, + "step": 1195, + "token_acc": 0.24198638540074036 + }, + { + "epoch": 0.7012606273819995, + "grad_norm": 1.7018986780837322, + "learning_rate": 0.00021031652989449002, + "loss": 3.57584547996521, + "step": 1196, + "token_acc": 0.2464382163438025 + }, + { + "epoch": 0.7018469656992085, + "grad_norm": 1.5146780956860744, + "learning_rate": 0.00021049237983587337, + "loss": 3.5720572471618652, + "step": 1197, + "token_acc": 0.2452082644585072 + }, + { + "epoch": 0.7024333040164175, + "grad_norm": 1.5270076813463245, + "learning_rate": 0.00021066822977725675, + "loss": 3.633413791656494, + "step": 1198, + "token_acc": 0.23917427063087535 + }, + { + "epoch": 0.7030196423336265, + "grad_norm": 1.8373426947599365, + "learning_rate": 0.0002108440797186401, + "loss": 3.605698347091675, + "step": 1199, + "token_acc": 0.24348149891261342 + }, + { + "epoch": 0.7036059806508356, + "grad_norm": 1.4501453703784004, + "learning_rate": 0.00021101992966002342, + "loss": 3.598822593688965, + "step": 1200, + "token_acc": 0.24447639640430693 + }, + { + "epoch": 0.7041923189680446, + "grad_norm": 1.2155396450137024, + "learning_rate": 0.00021119577960140677, + "loss": 3.5956811904907227, + "step": 1201, + "token_acc": 0.2441128386132825 + }, + { + "epoch": 0.7047786572852536, + "grad_norm": 1.2907905813875884, + "learning_rate": 0.00021137162954279012, + "loss": 3.5836386680603027, + "step": 1202, + "token_acc": 0.24527807333973478 + }, + { + "epoch": 0.7053649956024626, + "grad_norm": 1.4178030166108693, + "learning_rate": 0.00021154747948417347, + "loss": 3.615964651107788, + "step": 1203, + "token_acc": 0.24103439992186035 + }, + { + "epoch": 0.7059513339196717, + "grad_norm": 1.6241951295929389, + "learning_rate": 0.00021172332942555683, + "loss": 3.535749912261963, + "step": 1204, + "token_acc": 0.24910567304039508 + }, + { + "epoch": 0.7065376722368807, + "grad_norm": 1.3848919579031367, + "learning_rate": 0.00021189917936694018, + "loss": 3.566925525665283, + "step": 1205, + "token_acc": 0.24736881976317418 + }, + { + "epoch": 0.7071240105540897, + "grad_norm": 1.4890576918850404, + "learning_rate": 0.00021207502930832353, + "loss": 3.581563949584961, + "step": 1206, + "token_acc": 0.24462583288565312 + }, + { + "epoch": 0.7077103488712987, + "grad_norm": 1.7218651703673185, + "learning_rate": 0.0002122508792497069, + "loss": 3.6127965450286865, + "step": 1207, + "token_acc": 0.24181569023322472 + }, + { + "epoch": 0.7082966871885078, + "grad_norm": 1.1565119321273731, + "learning_rate": 0.00021242672919109026, + "loss": 3.5695741176605225, + "step": 1208, + "token_acc": 0.2475268265348339 + }, + { + "epoch": 0.7088830255057168, + "grad_norm": 1.7283381721414417, + "learning_rate": 0.0002126025791324736, + "loss": 3.5483202934265137, + "step": 1209, + "token_acc": 0.24790857560055568 + }, + { + "epoch": 0.7094693638229258, + "grad_norm": 1.542720787689113, + "learning_rate": 0.00021277842907385696, + "loss": 3.587327480316162, + "step": 1210, + "token_acc": 0.24341776459203965 + }, + { + "epoch": 0.7100557021401348, + "grad_norm": 1.5495727319473576, + "learning_rate": 0.0002129542790152403, + "loss": 3.558802366256714, + "step": 1211, + "token_acc": 0.24773354150403332 + }, + { + "epoch": 0.7106420404573439, + "grad_norm": 1.2971421551206894, + "learning_rate": 0.00021313012895662366, + "loss": 3.569840908050537, + "step": 1212, + "token_acc": 0.24753208967059892 + }, + { + "epoch": 0.7112283787745529, + "grad_norm": 1.719753069320933, + "learning_rate": 0.000213305978898007, + "loss": 3.549684524536133, + "step": 1213, + "token_acc": 0.24809236004551197 + }, + { + "epoch": 0.7118147170917619, + "grad_norm": 1.3880068430694603, + "learning_rate": 0.0002134818288393904, + "loss": 3.5489330291748047, + "step": 1214, + "token_acc": 0.2470236733033285 + }, + { + "epoch": 0.712401055408971, + "grad_norm": 1.5015538155788861, + "learning_rate": 0.00021365767878077374, + "loss": 3.543576717376709, + "step": 1215, + "token_acc": 0.24917295392648137 + }, + { + "epoch": 0.71298739372618, + "grad_norm": 1.5967968755855506, + "learning_rate": 0.0002138335287221571, + "loss": 3.53983736038208, + "step": 1216, + "token_acc": 0.2496313938981513 + }, + { + "epoch": 0.713573732043389, + "grad_norm": 1.3867352012821892, + "learning_rate": 0.00021400937866354044, + "loss": 3.5764036178588867, + "step": 1217, + "token_acc": 0.24566393184290522 + }, + { + "epoch": 0.714160070360598, + "grad_norm": 1.738247725706473, + "learning_rate": 0.00021418522860492377, + "loss": 3.579580307006836, + "step": 1218, + "token_acc": 0.24417444338892128 + }, + { + "epoch": 0.7147464086778071, + "grad_norm": 1.171318680757039, + "learning_rate": 0.00021436107854630712, + "loss": 3.5330920219421387, + "step": 1219, + "token_acc": 0.24978086902561053 + }, + { + "epoch": 0.7153327469950161, + "grad_norm": 1.72557106420474, + "learning_rate": 0.00021453692848769047, + "loss": 3.580744743347168, + "step": 1220, + "token_acc": 0.24503237830745797 + }, + { + "epoch": 0.7159190853122251, + "grad_norm": 1.065274071322266, + "learning_rate": 0.00021471277842907382, + "loss": 3.554166555404663, + "step": 1221, + "token_acc": 0.24794841439762222 + }, + { + "epoch": 0.7165054236294341, + "grad_norm": 1.7524479882390889, + "learning_rate": 0.0002148886283704572, + "loss": 3.5375688076019287, + "step": 1222, + "token_acc": 0.25004202960986016 + }, + { + "epoch": 0.7170917619466433, + "grad_norm": 1.148305853613259, + "learning_rate": 0.00021506447831184055, + "loss": 3.5806210041046143, + "step": 1223, + "token_acc": 0.2460832130269096 + }, + { + "epoch": 0.7176781002638523, + "grad_norm": 1.3143303464959835, + "learning_rate": 0.0002152403282532239, + "loss": 3.5024471282958984, + "step": 1224, + "token_acc": 0.2549667489229997 + }, + { + "epoch": 0.7182644385810613, + "grad_norm": 1.7154443943940636, + "learning_rate": 0.00021541617819460725, + "loss": 3.604999542236328, + "step": 1225, + "token_acc": 0.2437934262396667 + }, + { + "epoch": 0.7188507768982703, + "grad_norm": 1.3277525488232813, + "learning_rate": 0.0002155920281359906, + "loss": 3.598742961883545, + "step": 1226, + "token_acc": 0.24198334135041394 + }, + { + "epoch": 0.7194371152154794, + "grad_norm": 1.6416711338903025, + "learning_rate": 0.00021576787807737395, + "loss": 3.569950580596924, + "step": 1227, + "token_acc": 0.24519790527360977 + }, + { + "epoch": 0.7200234535326884, + "grad_norm": 1.2723851660667214, + "learning_rate": 0.0002159437280187573, + "loss": 3.6088290214538574, + "step": 1228, + "token_acc": 0.2403587070732291 + }, + { + "epoch": 0.7206097918498974, + "grad_norm": 1.6001371153841353, + "learning_rate": 0.00021611957796014068, + "loss": 3.5068674087524414, + "step": 1229, + "token_acc": 0.2534285654463416 + }, + { + "epoch": 0.7211961301671064, + "grad_norm": 1.220262358170032, + "learning_rate": 0.00021629542790152403, + "loss": 3.573232650756836, + "step": 1230, + "token_acc": 0.2471954032973013 + }, + { + "epoch": 0.7217824684843155, + "grad_norm": 1.518819589412541, + "learning_rate": 0.00021647127784290738, + "loss": 3.580660343170166, + "step": 1231, + "token_acc": 0.2448288050159823 + }, + { + "epoch": 0.7223688068015245, + "grad_norm": 1.1318360269614194, + "learning_rate": 0.00021664712778429073, + "loss": 3.5644583702087402, + "step": 1232, + "token_acc": 0.24850441054324857 + }, + { + "epoch": 0.7229551451187335, + "grad_norm": 1.395018834103563, + "learning_rate": 0.00021682297772567408, + "loss": 3.598156213760376, + "step": 1233, + "token_acc": 0.2420621761658031 + }, + { + "epoch": 0.7235414834359425, + "grad_norm": 1.5448312665379487, + "learning_rate": 0.00021699882766705743, + "loss": 3.524854898452759, + "step": 1234, + "token_acc": 0.2517297511730025 + }, + { + "epoch": 0.7241278217531516, + "grad_norm": 1.3364464261702895, + "learning_rate": 0.00021717467760844078, + "loss": 3.5365190505981445, + "step": 1235, + "token_acc": 0.2520886573619029 + }, + { + "epoch": 0.7247141600703606, + "grad_norm": 1.5887730415281391, + "learning_rate": 0.0002173505275498241, + "loss": 3.5950169563293457, + "step": 1236, + "token_acc": 0.2435176786077198 + }, + { + "epoch": 0.7253004983875696, + "grad_norm": 1.3903154856538764, + "learning_rate": 0.00021752637749120746, + "loss": 3.5647411346435547, + "step": 1237, + "token_acc": 0.24810043553549876 + }, + { + "epoch": 0.7258868367047786, + "grad_norm": 1.775432142168979, + "learning_rate": 0.00021770222743259084, + "loss": 3.5696730613708496, + "step": 1238, + "token_acc": 0.24437369492923616 + }, + { + "epoch": 0.7264731750219877, + "grad_norm": 1.157195309336081, + "learning_rate": 0.0002178780773739742, + "loss": 3.5941247940063477, + "step": 1239, + "token_acc": 0.24282366954790882 + }, + { + "epoch": 0.7270595133391967, + "grad_norm": 1.884945186646098, + "learning_rate": 0.00021805392731535754, + "loss": 3.533627510070801, + "step": 1240, + "token_acc": 0.2510889445695497 + }, + { + "epoch": 0.7276458516564057, + "grad_norm": 1.169212926292052, + "learning_rate": 0.0002182297772567409, + "loss": 3.5517265796661377, + "step": 1241, + "token_acc": 0.2504540121819906 + }, + { + "epoch": 0.7282321899736148, + "grad_norm": 1.6188861606613274, + "learning_rate": 0.00021840562719812424, + "loss": 3.5319066047668457, + "step": 1242, + "token_acc": 0.2509352471782027 + }, + { + "epoch": 0.7288185282908238, + "grad_norm": 1.3036897567680428, + "learning_rate": 0.0002185814771395076, + "loss": 3.621346950531006, + "step": 1243, + "token_acc": 0.24057557222096146 + }, + { + "epoch": 0.7294048666080328, + "grad_norm": 1.3072549451955513, + "learning_rate": 0.00021875732708089094, + "loss": 3.517343044281006, + "step": 1244, + "token_acc": 0.25246771634567106 + }, + { + "epoch": 0.7299912049252418, + "grad_norm": 1.492085170051368, + "learning_rate": 0.00021893317702227432, + "loss": 3.6194374561309814, + "step": 1245, + "token_acc": 0.2387442709596701 + }, + { + "epoch": 0.7305775432424509, + "grad_norm": 1.4440577619767259, + "learning_rate": 0.00021910902696365767, + "loss": 3.5678908824920654, + "step": 1246, + "token_acc": 0.24672867937343465 + }, + { + "epoch": 0.73116388155966, + "grad_norm": 1.4249551650165415, + "learning_rate": 0.00021928487690504102, + "loss": 3.564143657684326, + "step": 1247, + "token_acc": 0.24619896642899347 + }, + { + "epoch": 0.731750219876869, + "grad_norm": 1.4700480170173016, + "learning_rate": 0.00021946072684642437, + "loss": 3.542144298553467, + "step": 1248, + "token_acc": 0.2491643685752169 + }, + { + "epoch": 0.732336558194078, + "grad_norm": 1.4625998782950218, + "learning_rate": 0.00021963657678780772, + "loss": 3.5754032135009766, + "step": 1249, + "token_acc": 0.24517069183060547 + }, + { + "epoch": 0.7329228965112871, + "grad_norm": 1.261337351964851, + "learning_rate": 0.00021981242672919107, + "loss": 3.5500237941741943, + "step": 1250, + "token_acc": 0.24797190974705768 + }, + { + "epoch": 0.7335092348284961, + "grad_norm": 1.4197345876805896, + "learning_rate": 0.00021998827667057443, + "loss": 3.545901298522949, + "step": 1251, + "token_acc": 0.25066345983728244 + }, + { + "epoch": 0.7340955731457051, + "grad_norm": 1.1981523835627474, + "learning_rate": 0.0002201641266119578, + "loss": 3.526506185531616, + "step": 1252, + "token_acc": 0.25086497567701355 + }, + { + "epoch": 0.7346819114629141, + "grad_norm": 1.5868929873386077, + "learning_rate": 0.0002203399765533411, + "loss": 3.557638168334961, + "step": 1253, + "token_acc": 0.24662984533986684 + }, + { + "epoch": 0.7352682497801232, + "grad_norm": 1.203787460106692, + "learning_rate": 0.00022051582649472448, + "loss": 3.622738838195801, + "step": 1254, + "token_acc": 0.23892821607727707 + }, + { + "epoch": 0.7358545880973322, + "grad_norm": 1.5349738914300173, + "learning_rate": 0.00022069167643610783, + "loss": 3.570497512817383, + "step": 1255, + "token_acc": 0.24549297209207577 + }, + { + "epoch": 0.7364409264145412, + "grad_norm": 1.2023519482228637, + "learning_rate": 0.00022086752637749118, + "loss": 3.5299220085144043, + "step": 1256, + "token_acc": 0.24937345313117448 + }, + { + "epoch": 0.7370272647317502, + "grad_norm": 1.4428375285935824, + "learning_rate": 0.00022104337631887453, + "loss": 3.625183343887329, + "step": 1257, + "token_acc": 0.23908045381815168 + }, + { + "epoch": 0.7376136030489593, + "grad_norm": 1.492136020848117, + "learning_rate": 0.00022121922626025788, + "loss": 3.583981513977051, + "step": 1258, + "token_acc": 0.2430647087898509 + }, + { + "epoch": 0.7381999413661683, + "grad_norm": 1.2650077279455343, + "learning_rate": 0.00022139507620164123, + "loss": 3.5299758911132812, + "step": 1259, + "token_acc": 0.24919815537854642 + }, + { + "epoch": 0.7387862796833773, + "grad_norm": 1.3894114980479488, + "learning_rate": 0.00022157092614302458, + "loss": 3.5406646728515625, + "step": 1260, + "token_acc": 0.2473780787615509 + }, + { + "epoch": 0.7393726180005863, + "grad_norm": 1.5554310690509106, + "learning_rate": 0.00022174677608440796, + "loss": 3.5819411277770996, + "step": 1261, + "token_acc": 0.2432205219966681 + }, + { + "epoch": 0.7399589563177954, + "grad_norm": 1.078788867409666, + "learning_rate": 0.0002219226260257913, + "loss": 3.516218900680542, + "step": 1262, + "token_acc": 0.25198958201319577 + }, + { + "epoch": 0.7405452946350044, + "grad_norm": 1.4517875795545918, + "learning_rate": 0.00022209847596717466, + "loss": 3.539255142211914, + "step": 1263, + "token_acc": 0.24697214860528 + }, + { + "epoch": 0.7411316329522134, + "grad_norm": 0.9449188512997851, + "learning_rate": 0.00022227432590855801, + "loss": 3.493272542953491, + "step": 1264, + "token_acc": 0.25410398605315265 + }, + { + "epoch": 0.7417179712694224, + "grad_norm": 1.6967023908024002, + "learning_rate": 0.00022245017584994137, + "loss": 3.605579376220703, + "step": 1265, + "token_acc": 0.24067534189118514 + }, + { + "epoch": 0.7423043095866315, + "grad_norm": 1.376175745966463, + "learning_rate": 0.00022262602579132472, + "loss": 3.514482021331787, + "step": 1266, + "token_acc": 0.25100421732727846 + }, + { + "epoch": 0.7428906479038405, + "grad_norm": 1.4282552361013525, + "learning_rate": 0.00022280187573270807, + "loss": 3.4941563606262207, + "step": 1267, + "token_acc": 0.25457826957735685 + }, + { + "epoch": 0.7434769862210495, + "grad_norm": 1.2853485429608942, + "learning_rate": 0.00022297772567409145, + "loss": 3.5218427181243896, + "step": 1268, + "token_acc": 0.2522233605440694 + }, + { + "epoch": 0.7440633245382586, + "grad_norm": 1.2983254477011472, + "learning_rate": 0.0002231535756154748, + "loss": 3.5729188919067383, + "step": 1269, + "token_acc": 0.24637513486462417 + }, + { + "epoch": 0.7446496628554676, + "grad_norm": 1.3241285215527823, + "learning_rate": 0.00022332942555685815, + "loss": 3.5643608570098877, + "step": 1270, + "token_acc": 0.24493266380488915 + }, + { + "epoch": 0.7452360011726766, + "grad_norm": 1.7079998418405293, + "learning_rate": 0.00022350527549824147, + "loss": 3.5267105102539062, + "step": 1271, + "token_acc": 0.2508754537373613 + }, + { + "epoch": 0.7458223394898856, + "grad_norm": 1.0785260467019164, + "learning_rate": 0.00022368112543962482, + "loss": 3.516601085662842, + "step": 1272, + "token_acc": 0.25197126247112284 + }, + { + "epoch": 0.7464086778070947, + "grad_norm": 1.5906117477639026, + "learning_rate": 0.00022385697538100817, + "loss": 3.5439701080322266, + "step": 1273, + "token_acc": 0.24772254168470084 + }, + { + "epoch": 0.7469950161243037, + "grad_norm": 1.3551127023366434, + "learning_rate": 0.00022403282532239152, + "loss": 3.4956958293914795, + "step": 1274, + "token_acc": 0.2531258331291185 + }, + { + "epoch": 0.7475813544415127, + "grad_norm": 1.7966361203245316, + "learning_rate": 0.00022420867526377487, + "loss": 3.5369644165039062, + "step": 1275, + "token_acc": 0.2475377049095056 + }, + { + "epoch": 0.7481676927587217, + "grad_norm": 1.1403500735637413, + "learning_rate": 0.00022438452520515825, + "loss": 3.5866994857788086, + "step": 1276, + "token_acc": 0.2411107143982274 + }, + { + "epoch": 0.7487540310759309, + "grad_norm": 1.5401020454499297, + "learning_rate": 0.0002245603751465416, + "loss": 3.5151519775390625, + "step": 1277, + "token_acc": 0.25217215120593206 + }, + { + "epoch": 0.7493403693931399, + "grad_norm": 1.3311392424656527, + "learning_rate": 0.00022473622508792495, + "loss": 3.5742123126983643, + "step": 1278, + "token_acc": 0.2442783841016452 + }, + { + "epoch": 0.7499267077103489, + "grad_norm": 1.159911663442692, + "learning_rate": 0.0002249120750293083, + "loss": 3.5540332794189453, + "step": 1279, + "token_acc": 0.24720883844592179 + }, + { + "epoch": 0.7505130460275579, + "grad_norm": 1.3012220176281875, + "learning_rate": 0.00022508792497069166, + "loss": 3.5348312854766846, + "step": 1280, + "token_acc": 0.24802389364569077 + }, + { + "epoch": 0.751099384344767, + "grad_norm": 1.3022062175616274, + "learning_rate": 0.000225263774912075, + "loss": 3.5225284099578857, + "step": 1281, + "token_acc": 0.2503072616380958 + }, + { + "epoch": 0.751685722661976, + "grad_norm": 1.505998046236882, + "learning_rate": 0.00022543962485345836, + "loss": 3.4947967529296875, + "step": 1282, + "token_acc": 0.2542147387019762 + }, + { + "epoch": 0.752272060979185, + "grad_norm": 1.232094591628959, + "learning_rate": 0.0002256154747948417, + "loss": 3.543626546859741, + "step": 1283, + "token_acc": 0.24764154703884605 + }, + { + "epoch": 0.752858399296394, + "grad_norm": 1.407284196813894, + "learning_rate": 0.00022579132473622509, + "loss": 3.586953639984131, + "step": 1284, + "token_acc": 0.24211429649676508 + }, + { + "epoch": 0.7534447376136031, + "grad_norm": 1.2439733052125364, + "learning_rate": 0.00022596717467760844, + "loss": 3.5615909099578857, + "step": 1285, + "token_acc": 0.24607328480976437 + }, + { + "epoch": 0.7540310759308121, + "grad_norm": 1.2459773478657614, + "learning_rate": 0.0002261430246189918, + "loss": 3.5112464427948, + "step": 1286, + "token_acc": 0.2525803560639626 + }, + { + "epoch": 0.7546174142480211, + "grad_norm": 1.3251173409506307, + "learning_rate": 0.00022631887456037514, + "loss": 3.5989558696746826, + "step": 1287, + "token_acc": 0.2408991245533842 + }, + { + "epoch": 0.7552037525652301, + "grad_norm": 1.519826396769631, + "learning_rate": 0.0002264947245017585, + "loss": 3.538268566131592, + "step": 1288, + "token_acc": 0.249512409602184 + }, + { + "epoch": 0.7557900908824392, + "grad_norm": 1.3617912994699326, + "learning_rate": 0.00022667057444314181, + "loss": 3.6270439624786377, + "step": 1289, + "token_acc": 0.23806774588078 + }, + { + "epoch": 0.7563764291996482, + "grad_norm": 1.4984868102122595, + "learning_rate": 0.00022684642438452516, + "loss": 3.518787384033203, + "step": 1290, + "token_acc": 0.2492241644275672 + }, + { + "epoch": 0.7569627675168572, + "grad_norm": 1.2724412606452153, + "learning_rate": 0.00022702227432590852, + "loss": 3.547226905822754, + "step": 1291, + "token_acc": 0.2482909813492878 + }, + { + "epoch": 0.7575491058340662, + "grad_norm": 1.6547817954334374, + "learning_rate": 0.0002271981242672919, + "loss": 3.5538692474365234, + "step": 1292, + "token_acc": 0.24455592292783035 + }, + { + "epoch": 0.7581354441512753, + "grad_norm": 1.0969874831633897, + "learning_rate": 0.00022737397420867524, + "loss": 3.552861452102661, + "step": 1293, + "token_acc": 0.24882529681936705 + }, + { + "epoch": 0.7587217824684843, + "grad_norm": 1.5525991567533708, + "learning_rate": 0.0002275498241500586, + "loss": 3.5503973960876465, + "step": 1294, + "token_acc": 0.24610411590368234 + }, + { + "epoch": 0.7593081207856933, + "grad_norm": 1.111227001399496, + "learning_rate": 0.00022772567409144195, + "loss": 3.5445427894592285, + "step": 1295, + "token_acc": 0.24689116348563217 + }, + { + "epoch": 0.7598944591029023, + "grad_norm": 1.401945661859676, + "learning_rate": 0.0002279015240328253, + "loss": 3.498046398162842, + "step": 1296, + "token_acc": 0.25129064787773225 + }, + { + "epoch": 0.7604807974201114, + "grad_norm": 1.3553859273140412, + "learning_rate": 0.00022807737397420865, + "loss": 3.550243377685547, + "step": 1297, + "token_acc": 0.2473780663402885 + }, + { + "epoch": 0.7610671357373204, + "grad_norm": 1.4746703457145391, + "learning_rate": 0.000228253223915592, + "loss": 3.568816661834717, + "step": 1298, + "token_acc": 0.24337994489785308 + }, + { + "epoch": 0.7616534740545294, + "grad_norm": 1.5822841647471388, + "learning_rate": 0.00022842907385697538, + "loss": 3.5240468978881836, + "step": 1299, + "token_acc": 0.2501253146413563 + }, + { + "epoch": 0.7622398123717385, + "grad_norm": 1.3058461990043648, + "learning_rate": 0.00022860492379835873, + "loss": 3.5019290447235107, + "step": 1300, + "token_acc": 0.2524095486566734 + }, + { + "epoch": 0.7628261506889475, + "grad_norm": 1.444597586491725, + "learning_rate": 0.00022878077373974208, + "loss": 3.551189422607422, + "step": 1301, + "token_acc": 0.24687636278815045 + }, + { + "epoch": 0.7634124890061565, + "grad_norm": 0.999517254996965, + "learning_rate": 0.00022895662368112543, + "loss": 3.5050086975097656, + "step": 1302, + "token_acc": 0.2526176091602818 + }, + { + "epoch": 0.7639988273233655, + "grad_norm": 1.6290560761411603, + "learning_rate": 0.00022913247362250878, + "loss": 3.491086483001709, + "step": 1303, + "token_acc": 0.2532722361715415 + }, + { + "epoch": 0.7645851656405747, + "grad_norm": 1.06139807828867, + "learning_rate": 0.00022930832356389213, + "loss": 3.552790880203247, + "step": 1304, + "token_acc": 0.24734570364114017 + }, + { + "epoch": 0.7651715039577837, + "grad_norm": 1.6256699714520118, + "learning_rate": 0.00022948417350527548, + "loss": 3.5898325443267822, + "step": 1305, + "token_acc": 0.24198415788826494 + }, + { + "epoch": 0.7657578422749927, + "grad_norm": 1.2347533772466504, + "learning_rate": 0.00022966002344665886, + "loss": 3.5725343227386475, + "step": 1306, + "token_acc": 0.2442365366564663 + }, + { + "epoch": 0.7663441805922017, + "grad_norm": 1.2867475843845748, + "learning_rate": 0.00022983587338804216, + "loss": 3.5599560737609863, + "step": 1307, + "token_acc": 0.24574363502424326 + }, + { + "epoch": 0.7669305189094108, + "grad_norm": 1.469264263974974, + "learning_rate": 0.00023001172332942553, + "loss": 3.493713855743408, + "step": 1308, + "token_acc": 0.2529612903395894 + }, + { + "epoch": 0.7675168572266198, + "grad_norm": 1.698079236558699, + "learning_rate": 0.00023018757327080889, + "loss": 3.517455577850342, + "step": 1309, + "token_acc": 0.24846891548628752 + }, + { + "epoch": 0.7681031955438288, + "grad_norm": 1.15350549639968, + "learning_rate": 0.00023036342321219224, + "loss": 3.480907917022705, + "step": 1310, + "token_acc": 0.257596941618342 + }, + { + "epoch": 0.7686895338610378, + "grad_norm": 1.7379349550715335, + "learning_rate": 0.0002305392731535756, + "loss": 3.5841002464294434, + "step": 1311, + "token_acc": 0.2439927620783476 + }, + { + "epoch": 0.7692758721782469, + "grad_norm": 1.1002585500093949, + "learning_rate": 0.00023071512309495894, + "loss": 3.546715259552002, + "step": 1312, + "token_acc": 0.24598529893721616 + }, + { + "epoch": 0.7698622104954559, + "grad_norm": 1.562290812434698, + "learning_rate": 0.0002308909730363423, + "loss": 3.565049409866333, + "step": 1313, + "token_acc": 0.24550648273441925 + }, + { + "epoch": 0.7704485488126649, + "grad_norm": 1.1663479522914595, + "learning_rate": 0.00023106682297772564, + "loss": 3.5516159534454346, + "step": 1314, + "token_acc": 0.24810693781708118 + }, + { + "epoch": 0.7710348871298739, + "grad_norm": 1.3549712160228458, + "learning_rate": 0.00023124267291910902, + "loss": 3.5216691493988037, + "step": 1315, + "token_acc": 0.24940417776531615 + }, + { + "epoch": 0.771621225447083, + "grad_norm": 1.2017379180206698, + "learning_rate": 0.00023141852286049237, + "loss": 3.535663604736328, + "step": 1316, + "token_acc": 0.2503336766911664 + }, + { + "epoch": 0.772207563764292, + "grad_norm": 1.2752756703528207, + "learning_rate": 0.00023159437280187572, + "loss": 3.4778430461883545, + "step": 1317, + "token_acc": 0.25549548416839546 + }, + { + "epoch": 0.772793902081501, + "grad_norm": 1.2451082557090631, + "learning_rate": 0.00023177022274325907, + "loss": 3.548982620239258, + "step": 1318, + "token_acc": 0.2455617512873189 + }, + { + "epoch": 0.77338024039871, + "grad_norm": 1.4158895523275319, + "learning_rate": 0.00023194607268464242, + "loss": 3.553767204284668, + "step": 1319, + "token_acc": 0.24403253697502258 + }, + { + "epoch": 0.7739665787159191, + "grad_norm": 1.1767115162226616, + "learning_rate": 0.00023212192262602577, + "loss": 3.521721363067627, + "step": 1320, + "token_acc": 0.25231452970398655 + }, + { + "epoch": 0.7745529170331281, + "grad_norm": 1.020067149395257, + "learning_rate": 0.00023229777256740912, + "loss": 3.490562915802002, + "step": 1321, + "token_acc": 0.2524878938498624 + }, + { + "epoch": 0.7751392553503371, + "grad_norm": 1.4522842208103106, + "learning_rate": 0.0002324736225087925, + "loss": 3.5425868034362793, + "step": 1322, + "token_acc": 0.2478709940366382 + }, + { + "epoch": 0.7757255936675461, + "grad_norm": 1.2347430944392428, + "learning_rate": 0.00023264947245017585, + "loss": 3.540163040161133, + "step": 1323, + "token_acc": 0.24787067330162482 + }, + { + "epoch": 0.7763119319847552, + "grad_norm": 1.4725162345597291, + "learning_rate": 0.00023282532239155918, + "loss": 3.52274227142334, + "step": 1324, + "token_acc": 0.2501067548118743 + }, + { + "epoch": 0.7768982703019642, + "grad_norm": 1.4164951525512393, + "learning_rate": 0.00023300117233294253, + "loss": 3.5144240856170654, + "step": 1325, + "token_acc": 0.2494186136449186 + }, + { + "epoch": 0.7774846086191732, + "grad_norm": 1.3358592704000456, + "learning_rate": 0.00023317702227432588, + "loss": 3.5409460067749023, + "step": 1326, + "token_acc": 0.24992991646042081 + }, + { + "epoch": 0.7780709469363823, + "grad_norm": 1.429921604376815, + "learning_rate": 0.00023335287221570923, + "loss": 3.551036834716797, + "step": 1327, + "token_acc": 0.24525578265401718 + }, + { + "epoch": 0.7786572852535913, + "grad_norm": 1.129976355717462, + "learning_rate": 0.00023352872215709258, + "loss": 3.491189479827881, + "step": 1328, + "token_acc": 0.25371180054285647 + }, + { + "epoch": 0.7792436235708003, + "grad_norm": 1.425989839314161, + "learning_rate": 0.00023370457209847593, + "loss": 3.498746871948242, + "step": 1329, + "token_acc": 0.25239680547217525 + }, + { + "epoch": 0.7798299618880093, + "grad_norm": 1.218087099427935, + "learning_rate": 0.00023388042203985928, + "loss": 3.551755905151367, + "step": 1330, + "token_acc": 0.2462798623966187 + }, + { + "epoch": 0.7804163002052185, + "grad_norm": 1.4064826474628556, + "learning_rate": 0.00023405627198124266, + "loss": 3.503357410430908, + "step": 1331, + "token_acc": 0.25205527006485345 + }, + { + "epoch": 0.7810026385224275, + "grad_norm": 1.2702129287944959, + "learning_rate": 0.000234232121922626, + "loss": 3.5402162075042725, + "step": 1332, + "token_acc": 0.2482242861177273 + }, + { + "epoch": 0.7815889768396365, + "grad_norm": 1.4589042288206073, + "learning_rate": 0.00023440797186400936, + "loss": 3.5561366081237793, + "step": 1333, + "token_acc": 0.2433288151198179 + }, + { + "epoch": 0.7821753151568455, + "grad_norm": 1.1382322207361673, + "learning_rate": 0.0002345838218053927, + "loss": 3.5850577354431152, + "step": 1334, + "token_acc": 0.24253468594996558 + }, + { + "epoch": 0.7827616534740546, + "grad_norm": 1.4318276791262674, + "learning_rate": 0.00023475967174677606, + "loss": 3.525158166885376, + "step": 1335, + "token_acc": 0.2482472920822298 + }, + { + "epoch": 0.7833479917912636, + "grad_norm": 1.2806354855902475, + "learning_rate": 0.00023493552168815941, + "loss": 3.493321418762207, + "step": 1336, + "token_acc": 0.2520449795999602 + }, + { + "epoch": 0.7839343301084726, + "grad_norm": 1.6088123510738195, + "learning_rate": 0.00023511137162954276, + "loss": 3.478659152984619, + "step": 1337, + "token_acc": 0.25558739102558986 + }, + { + "epoch": 0.7845206684256816, + "grad_norm": 1.176673622710618, + "learning_rate": 0.00023528722157092614, + "loss": 3.543164014816284, + "step": 1338, + "token_acc": 0.24751044947341028 + }, + { + "epoch": 0.7851070067428907, + "grad_norm": 1.736002962508232, + "learning_rate": 0.0002354630715123095, + "loss": 3.5470120906829834, + "step": 1339, + "token_acc": 0.24651919184366672 + }, + { + "epoch": 0.7856933450600997, + "grad_norm": 1.0430661484266757, + "learning_rate": 0.00023563892145369284, + "loss": 3.5405569076538086, + "step": 1340, + "token_acc": 0.24673323006027154 + }, + { + "epoch": 0.7862796833773087, + "grad_norm": 1.4860223772155738, + "learning_rate": 0.0002358147713950762, + "loss": 3.554518699645996, + "step": 1341, + "token_acc": 0.24535291146277385 + }, + { + "epoch": 0.7868660216945177, + "grad_norm": 1.1371472770558677, + "learning_rate": 0.00023599062133645952, + "loss": 3.539022207260132, + "step": 1342, + "token_acc": 0.24746841571493366 + }, + { + "epoch": 0.7874523600117268, + "grad_norm": 1.2986564156057583, + "learning_rate": 0.00023616647127784287, + "loss": 3.512289524078369, + "step": 1343, + "token_acc": 0.25061844192491545 + }, + { + "epoch": 0.7880386983289358, + "grad_norm": 1.4232989622410008, + "learning_rate": 0.00023634232121922622, + "loss": 3.569448709487915, + "step": 1344, + "token_acc": 0.24304652773993662 + }, + { + "epoch": 0.7886250366461448, + "grad_norm": 0.9015955375223305, + "learning_rate": 0.00023651817116060957, + "loss": 3.5098624229431152, + "step": 1345, + "token_acc": 0.25226367114779813 + }, + { + "epoch": 0.7892113749633538, + "grad_norm": 1.1018489389970407, + "learning_rate": 0.00023669402110199295, + "loss": 3.506894111633301, + "step": 1346, + "token_acc": 0.25030938985201284 + }, + { + "epoch": 0.7897977132805629, + "grad_norm": 1.3289606406901249, + "learning_rate": 0.0002368698710433763, + "loss": 3.5221610069274902, + "step": 1347, + "token_acc": 0.2521557298714594 + }, + { + "epoch": 0.7903840515977719, + "grad_norm": 1.1043491605511258, + "learning_rate": 0.00023704572098475965, + "loss": 3.551619529724121, + "step": 1348, + "token_acc": 0.24603639851847905 + }, + { + "epoch": 0.7909703899149809, + "grad_norm": 1.4888513969907502, + "learning_rate": 0.000237221570926143, + "loss": 3.5470314025878906, + "step": 1349, + "token_acc": 0.24807980075970826 + }, + { + "epoch": 0.7915567282321899, + "grad_norm": 1.0276340009004528, + "learning_rate": 0.00023739742086752635, + "loss": 3.478890895843506, + "step": 1350, + "token_acc": 0.25571374463647356 + }, + { + "epoch": 0.792143066549399, + "grad_norm": 1.3879383216328915, + "learning_rate": 0.0002375732708089097, + "loss": 3.57604718208313, + "step": 1351, + "token_acc": 0.24262518199658162 + }, + { + "epoch": 0.792729404866608, + "grad_norm": 1.340463944902041, + "learning_rate": 0.00023774912075029306, + "loss": 3.539947509765625, + "step": 1352, + "token_acc": 0.24951912190290865 + }, + { + "epoch": 0.793315743183817, + "grad_norm": 1.2984350731495375, + "learning_rate": 0.00023792497069167643, + "loss": 3.5628278255462646, + "step": 1353, + "token_acc": 0.24179955683097823 + }, + { + "epoch": 0.7939020815010261, + "grad_norm": 1.4053193273464923, + "learning_rate": 0.00023810082063305978, + "loss": 3.596524238586426, + "step": 1354, + "token_acc": 0.24081179590900115 + }, + { + "epoch": 0.7944884198182351, + "grad_norm": 1.1068320873107373, + "learning_rate": 0.00023827667057444313, + "loss": 3.514730215072632, + "step": 1355, + "token_acc": 0.25075754431115355 + }, + { + "epoch": 0.7950747581354441, + "grad_norm": 1.6625948931249201, + "learning_rate": 0.00023845252051582649, + "loss": 3.5009069442749023, + "step": 1356, + "token_acc": 0.2532228289834475 + }, + { + "epoch": 0.7956610964526531, + "grad_norm": 0.9555384726549012, + "learning_rate": 0.00023862837045720984, + "loss": 3.426203727722168, + "step": 1357, + "token_acc": 0.2634332709767469 + }, + { + "epoch": 0.7962474347698623, + "grad_norm": 1.4347928093908564, + "learning_rate": 0.0002388042203985932, + "loss": 3.5869579315185547, + "step": 1358, + "token_acc": 0.23961607866725945 + }, + { + "epoch": 0.7968337730870713, + "grad_norm": 1.5869928118072407, + "learning_rate": 0.00023898007033997654, + "loss": 3.531996250152588, + "step": 1359, + "token_acc": 0.24907132385747976 + }, + { + "epoch": 0.7974201114042803, + "grad_norm": 1.043270178257646, + "learning_rate": 0.00023915592028135986, + "loss": 3.5149025917053223, + "step": 1360, + "token_acc": 0.24939670495375799 + }, + { + "epoch": 0.7980064497214893, + "grad_norm": 1.4997250379643066, + "learning_rate": 0.0002393317702227432, + "loss": 3.499387741088867, + "step": 1361, + "token_acc": 0.25247028794354515 + }, + { + "epoch": 0.7985927880386984, + "grad_norm": 0.9161935273599092, + "learning_rate": 0.0002395076201641266, + "loss": 3.4430837631225586, + "step": 1362, + "token_acc": 0.2608140271196803 + }, + { + "epoch": 0.7991791263559074, + "grad_norm": 1.4636381710982649, + "learning_rate": 0.00023968347010550994, + "loss": 3.538482189178467, + "step": 1363, + "token_acc": 0.2481892192903008 + }, + { + "epoch": 0.7997654646731164, + "grad_norm": 1.222398277145788, + "learning_rate": 0.0002398593200468933, + "loss": 3.4975392818450928, + "step": 1364, + "token_acc": 0.2520004106177095 + }, + { + "epoch": 0.8003518029903254, + "grad_norm": 1.402610637836429, + "learning_rate": 0.00024003516998827664, + "loss": 3.52131986618042, + "step": 1365, + "token_acc": 0.24899112837790638 + }, + { + "epoch": 0.8009381413075345, + "grad_norm": 1.1913008491251893, + "learning_rate": 0.00024021101992966, + "loss": 3.5296170711517334, + "step": 1366, + "token_acc": 0.24790734861005034 + }, + { + "epoch": 0.8015244796247435, + "grad_norm": 1.2684558105859447, + "learning_rate": 0.00024038686987104335, + "loss": 3.518589973449707, + "step": 1367, + "token_acc": 0.24943038184146413 + }, + { + "epoch": 0.8021108179419525, + "grad_norm": 1.2108378798749466, + "learning_rate": 0.0002405627198124267, + "loss": 3.5225632190704346, + "step": 1368, + "token_acc": 0.24759843948226407 + }, + { + "epoch": 0.8026971562591615, + "grad_norm": 1.3267338901291388, + "learning_rate": 0.00024073856975381007, + "loss": 3.4838345050811768, + "step": 1369, + "token_acc": 0.25506533749819404 + }, + { + "epoch": 0.8032834945763706, + "grad_norm": 1.1158571791648582, + "learning_rate": 0.00024091441969519343, + "loss": 3.486650228500366, + "step": 1370, + "token_acc": 0.2515 + }, + { + "epoch": 0.8038698328935796, + "grad_norm": 1.4953364879371343, + "learning_rate": 0.00024109026963657678, + "loss": 3.5227785110473633, + "step": 1371, + "token_acc": 0.25013988470897475 + }, + { + "epoch": 0.8044561712107886, + "grad_norm": 1.1646909892992259, + "learning_rate": 0.00024126611957796013, + "loss": 3.5241241455078125, + "step": 1372, + "token_acc": 0.25021427500053567 + }, + { + "epoch": 0.8050425095279976, + "grad_norm": 1.1741380221992452, + "learning_rate": 0.00024144196951934348, + "loss": 3.509476661682129, + "step": 1373, + "token_acc": 0.2512670580540594 + }, + { + "epoch": 0.8056288478452067, + "grad_norm": 1.373415991157362, + "learning_rate": 0.00024161781946072683, + "loss": 3.523770809173584, + "step": 1374, + "token_acc": 0.24859482287172632 + }, + { + "epoch": 0.8062151861624157, + "grad_norm": 1.2750483087366524, + "learning_rate": 0.00024179366940211018, + "loss": 3.505979061126709, + "step": 1375, + "token_acc": 0.2518861855581306 + }, + { + "epoch": 0.8068015244796247, + "grad_norm": 1.381538930759667, + "learning_rate": 0.00024196951934349356, + "loss": 3.4995217323303223, + "step": 1376, + "token_acc": 0.2513723346657306 + }, + { + "epoch": 0.8073878627968337, + "grad_norm": 1.279395434103982, + "learning_rate": 0.0002421453692848769, + "loss": 3.5280673503875732, + "step": 1377, + "token_acc": 0.24813645966458883 + }, + { + "epoch": 0.8079742011140428, + "grad_norm": 1.2955129904578537, + "learning_rate": 0.00024232121922626023, + "loss": 3.5247902870178223, + "step": 1378, + "token_acc": 0.24923996416049057 + }, + { + "epoch": 0.8085605394312518, + "grad_norm": 1.2994083419952702, + "learning_rate": 0.00024249706916764358, + "loss": 3.4935412406921387, + "step": 1379, + "token_acc": 0.2509757844096378 + }, + { + "epoch": 0.8091468777484608, + "grad_norm": 1.2562584084945727, + "learning_rate": 0.00024267291910902693, + "loss": 3.541517734527588, + "step": 1380, + "token_acc": 0.24732372626236554 + }, + { + "epoch": 0.8097332160656698, + "grad_norm": 1.0297583497336373, + "learning_rate": 0.00024284876905041029, + "loss": 3.521821975708008, + "step": 1381, + "token_acc": 0.24713079980351002 + }, + { + "epoch": 0.810319554382879, + "grad_norm": 1.4345717094664914, + "learning_rate": 0.00024302461899179364, + "loss": 3.5253238677978516, + "step": 1382, + "token_acc": 0.24893450971573697 + }, + { + "epoch": 0.810905892700088, + "grad_norm": 1.2128476575553961, + "learning_rate": 0.000243200468933177, + "loss": 3.5326619148254395, + "step": 1383, + "token_acc": 0.2455768928268432 + }, + { + "epoch": 0.811492231017297, + "grad_norm": 1.5176449352451746, + "learning_rate": 0.00024337631887456034, + "loss": 3.498926877975464, + "step": 1384, + "token_acc": 0.2520472989373041 + }, + { + "epoch": 0.8120785693345061, + "grad_norm": 1.2248775542869774, + "learning_rate": 0.00024355216881594372, + "loss": 3.486328125, + "step": 1385, + "token_acc": 0.2532828461001748 + }, + { + "epoch": 0.8126649076517151, + "grad_norm": 1.260784729825427, + "learning_rate": 0.00024372801875732707, + "loss": 3.4956254959106445, + "step": 1386, + "token_acc": 0.2521185409166037 + }, + { + "epoch": 0.8132512459689241, + "grad_norm": 0.9384761279512973, + "learning_rate": 0.00024390386869871042, + "loss": 3.4776968955993652, + "step": 1387, + "token_acc": 0.25448890559810167 + }, + { + "epoch": 0.8138375842861331, + "grad_norm": 1.4143018745024274, + "learning_rate": 0.00024407971864009377, + "loss": 3.569000720977783, + "step": 1388, + "token_acc": 0.2423899957586822 + }, + { + "epoch": 0.8144239226033422, + "grad_norm": 1.1364348124033246, + "learning_rate": 0.0002442555685814771, + "loss": 3.503652334213257, + "step": 1389, + "token_acc": 0.2497558160772283 + }, + { + "epoch": 0.8150102609205512, + "grad_norm": 1.112221727500393, + "learning_rate": 0.0002444314185228605, + "loss": 3.47096586227417, + "step": 1390, + "token_acc": 0.254377988248935 + }, + { + "epoch": 0.8155965992377602, + "grad_norm": 1.4408744725241267, + "learning_rate": 0.0002446072684642438, + "loss": 3.4749674797058105, + "step": 1391, + "token_acc": 0.25344111187866664 + }, + { + "epoch": 0.8161829375549692, + "grad_norm": 0.999034055474413, + "learning_rate": 0.0002447831184056272, + "loss": 3.519740581512451, + "step": 1392, + "token_acc": 0.25079816690376244 + }, + { + "epoch": 0.8167692758721783, + "grad_norm": 1.4358392785129175, + "learning_rate": 0.0002449589683470105, + "loss": 3.4820048809051514, + "step": 1393, + "token_acc": 0.25344169676288947 + }, + { + "epoch": 0.8173556141893873, + "grad_norm": 1.2158700851637072, + "learning_rate": 0.0002451348182883939, + "loss": 3.487466812133789, + "step": 1394, + "token_acc": 0.25218459529117676 + }, + { + "epoch": 0.8179419525065963, + "grad_norm": 1.3575663748216873, + "learning_rate": 0.0002453106682297773, + "loss": 3.4985780715942383, + "step": 1395, + "token_acc": 0.25081551568878313 + }, + { + "epoch": 0.8185282908238053, + "grad_norm": 1.2461491284729482, + "learning_rate": 0.00024548651817116055, + "loss": 3.4759790897369385, + "step": 1396, + "token_acc": 0.2531578725022179 + }, + { + "epoch": 0.8191146291410144, + "grad_norm": 1.2489309189556415, + "learning_rate": 0.0002456623681125439, + "loss": 3.5269782543182373, + "step": 1397, + "token_acc": 0.25053125659894204 + }, + { + "epoch": 0.8197009674582234, + "grad_norm": 1.1061098127228195, + "learning_rate": 0.0002458382180539273, + "loss": 3.565739154815674, + "step": 1398, + "token_acc": 0.24388808520125893 + }, + { + "epoch": 0.8202873057754324, + "grad_norm": 1.2921953537227937, + "learning_rate": 0.00024601406799531063, + "loss": 3.5449059009552, + "step": 1399, + "token_acc": 0.24675700525281913 + }, + { + "epoch": 0.8208736440926414, + "grad_norm": 1.2131059564246882, + "learning_rate": 0.000246189917936694, + "loss": 3.487881898880005, + "step": 1400, + "token_acc": 0.2518350118713242 + }, + { + "epoch": 0.8214599824098505, + "grad_norm": 1.1314907200938653, + "learning_rate": 0.00024636576787807733, + "loss": 3.5301811695098877, + "step": 1401, + "token_acc": 0.24820517924634553 + }, + { + "epoch": 0.8220463207270595, + "grad_norm": 1.2883489472056389, + "learning_rate": 0.0002465416178194607, + "loss": 3.4905428886413574, + "step": 1402, + "token_acc": 0.25414038879133893 + }, + { + "epoch": 0.8226326590442685, + "grad_norm": 1.1617520850905985, + "learning_rate": 0.00024671746776084403, + "loss": 3.490755558013916, + "step": 1403, + "token_acc": 0.2537040115536069 + }, + { + "epoch": 0.8232189973614775, + "grad_norm": 1.3902940951835465, + "learning_rate": 0.0002468933177022274, + "loss": 3.459394931793213, + "step": 1404, + "token_acc": 0.2569570406355315 + }, + { + "epoch": 0.8238053356786866, + "grad_norm": 1.16037314307343, + "learning_rate": 0.0002470691676436108, + "loss": 3.4644532203674316, + "step": 1405, + "token_acc": 0.2531717787801028 + }, + { + "epoch": 0.8243916739958956, + "grad_norm": 1.1317272249494483, + "learning_rate": 0.0002472450175849941, + "loss": 3.4927358627319336, + "step": 1406, + "token_acc": 0.2506719120256865 + }, + { + "epoch": 0.8249780123131046, + "grad_norm": 1.0801859633177322, + "learning_rate": 0.0002474208675263775, + "loss": 3.5378341674804688, + "step": 1407, + "token_acc": 0.24590614226267948 + }, + { + "epoch": 0.8255643506303136, + "grad_norm": 1.0145950275544406, + "learning_rate": 0.0002475967174677608, + "loss": 3.432802200317383, + "step": 1408, + "token_acc": 0.260275911601341 + }, + { + "epoch": 0.8261506889475227, + "grad_norm": 1.3244002168746623, + "learning_rate": 0.0002477725674091442, + "loss": 3.5539357662200928, + "step": 1409, + "token_acc": 0.24332076884005668 + }, + { + "epoch": 0.8267370272647317, + "grad_norm": 1.3588962961468913, + "learning_rate": 0.0002479484173505275, + "loss": 3.4740781784057617, + "step": 1410, + "token_acc": 0.25389171162129387 + }, + { + "epoch": 0.8273233655819408, + "grad_norm": 1.2332645972401635, + "learning_rate": 0.0002481242672919109, + "loss": 3.5055789947509766, + "step": 1411, + "token_acc": 0.25093262276360867 + }, + { + "epoch": 0.8279097038991499, + "grad_norm": 1.2857659721300096, + "learning_rate": 0.00024830011723329427, + "loss": 3.4982221126556396, + "step": 1412, + "token_acc": 0.2508262315663979 + }, + { + "epoch": 0.8284960422163589, + "grad_norm": 1.2115742894605912, + "learning_rate": 0.0002484759671746776, + "loss": 3.4536831378936768, + "step": 1413, + "token_acc": 0.25567958773425087 + }, + { + "epoch": 0.8290823805335679, + "grad_norm": 1.0186740097083244, + "learning_rate": 0.0002486518171160609, + "loss": 3.525965690612793, + "step": 1414, + "token_acc": 0.247292948605744 + }, + { + "epoch": 0.8296687188507769, + "grad_norm": 1.0860146122015826, + "learning_rate": 0.0002488276670574443, + "loss": 3.5002598762512207, + "step": 1415, + "token_acc": 0.2515596551706045 + }, + { + "epoch": 0.830255057167986, + "grad_norm": 1.4454233866292083, + "learning_rate": 0.0002490035169988276, + "loss": 3.537914752960205, + "step": 1416, + "token_acc": 0.24730110892160273 + }, + { + "epoch": 0.830841395485195, + "grad_norm": 1.0400082681968426, + "learning_rate": 0.000249179366940211, + "loss": 3.445465087890625, + "step": 1417, + "token_acc": 0.2572701816602924 + }, + { + "epoch": 0.831427733802404, + "grad_norm": 1.4586003171229651, + "learning_rate": 0.0002493552168815943, + "loss": 3.524172782897949, + "step": 1418, + "token_acc": 0.2477750959022223 + }, + { + "epoch": 0.832014072119613, + "grad_norm": 0.9737932195192143, + "learning_rate": 0.0002495310668229777, + "loss": 3.504889965057373, + "step": 1419, + "token_acc": 0.24958140576413168 + }, + { + "epoch": 0.8326004104368221, + "grad_norm": 1.6420739394542294, + "learning_rate": 0.0002497069167643611, + "loss": 3.4851255416870117, + "step": 1420, + "token_acc": 0.25322547614171614 + }, + { + "epoch": 0.8331867487540311, + "grad_norm": 1.1223518955111516, + "learning_rate": 0.0002498827667057444, + "loss": 3.542442560195923, + "step": 1421, + "token_acc": 0.24554698040089198 + }, + { + "epoch": 0.8337730870712401, + "grad_norm": 1.3367038739023618, + "learning_rate": 0.0002500586166471278, + "loss": 3.515259265899658, + "step": 1422, + "token_acc": 0.24910493711883128 + }, + { + "epoch": 0.8343594253884491, + "grad_norm": 1.03374443830272, + "learning_rate": 0.0002502344665885111, + "loss": 3.444234848022461, + "step": 1423, + "token_acc": 0.2579337291317806 + }, + { + "epoch": 0.8349457637056582, + "grad_norm": 1.2791814816590374, + "learning_rate": 0.0002504103165298945, + "loss": 3.44527530670166, + "step": 1424, + "token_acc": 0.25652109548482604 + }, + { + "epoch": 0.8355321020228672, + "grad_norm": 1.3864977882348868, + "learning_rate": 0.0002505861664712778, + "loss": 3.455772638320923, + "step": 1425, + "token_acc": 0.25790059227569306 + }, + { + "epoch": 0.8361184403400762, + "grad_norm": 1.4144485065719277, + "learning_rate": 0.0002507620164126612, + "loss": 3.488882541656494, + "step": 1426, + "token_acc": 0.25109528330143516 + }, + { + "epoch": 0.8367047786572852, + "grad_norm": 1.1264031573258386, + "learning_rate": 0.00025093786635404456, + "loss": 3.5070533752441406, + "step": 1427, + "token_acc": 0.25087618485559127 + }, + { + "epoch": 0.8372911169744943, + "grad_norm": 1.152685641703167, + "learning_rate": 0.0002511137162954279, + "loss": 3.5346567630767822, + "step": 1428, + "token_acc": 0.24735420126290364 + }, + { + "epoch": 0.8378774552917033, + "grad_norm": 1.3353647284902894, + "learning_rate": 0.00025128956623681126, + "loss": 3.505019426345825, + "step": 1429, + "token_acc": 0.24829969020798873 + }, + { + "epoch": 0.8384637936089123, + "grad_norm": 1.0996475557570604, + "learning_rate": 0.0002514654161781946, + "loss": 3.502018451690674, + "step": 1430, + "token_acc": 0.25136771144445275 + }, + { + "epoch": 0.8390501319261213, + "grad_norm": 1.2944463810727704, + "learning_rate": 0.0002516412661195779, + "loss": 3.5040979385375977, + "step": 1431, + "token_acc": 0.25034546526041984 + }, + { + "epoch": 0.8396364702433304, + "grad_norm": 1.0922213874510516, + "learning_rate": 0.0002518171160609613, + "loss": 3.4808406829833984, + "step": 1432, + "token_acc": 0.25272973577262503 + }, + { + "epoch": 0.8402228085605394, + "grad_norm": 1.3836978652417502, + "learning_rate": 0.0002519929660023446, + "loss": 3.5306410789489746, + "step": 1433, + "token_acc": 0.24835992873415202 + }, + { + "epoch": 0.8408091468777484, + "grad_norm": 1.0227616473534067, + "learning_rate": 0.000252168815943728, + "loss": 3.477841854095459, + "step": 1434, + "token_acc": 0.2519114740119637 + }, + { + "epoch": 0.8413954851949574, + "grad_norm": 1.4117895464330559, + "learning_rate": 0.0002523446658851113, + "loss": 3.4867329597473145, + "step": 1435, + "token_acc": 0.25292114434123525 + }, + { + "epoch": 0.8419818235121665, + "grad_norm": 0.979133012776937, + "learning_rate": 0.0002525205158264947, + "loss": 3.526702880859375, + "step": 1436, + "token_acc": 0.24739271984843747 + }, + { + "epoch": 0.8425681618293756, + "grad_norm": 1.460079217104386, + "learning_rate": 0.00025269636576787807, + "loss": 3.5548555850982666, + "step": 1437, + "token_acc": 0.24606543045867876 + }, + { + "epoch": 0.8431545001465846, + "grad_norm": 0.8637701731042254, + "learning_rate": 0.0002528722157092614, + "loss": 3.5132594108581543, + "step": 1438, + "token_acc": 0.25090352816248 + }, + { + "epoch": 0.8437408384637937, + "grad_norm": 1.2024678017098835, + "learning_rate": 0.00025304806565064477, + "loss": 3.492680072784424, + "step": 1439, + "token_acc": 0.24954935197616512 + }, + { + "epoch": 0.8443271767810027, + "grad_norm": 1.0795016627432437, + "learning_rate": 0.0002532239155920281, + "loss": 3.4910407066345215, + "step": 1440, + "token_acc": 0.25258379139807996 + }, + { + "epoch": 0.8449135150982117, + "grad_norm": 1.2303250210827923, + "learning_rate": 0.0002533997655334115, + "loss": 3.5061259269714355, + "step": 1441, + "token_acc": 0.250391739739532 + }, + { + "epoch": 0.8454998534154207, + "grad_norm": 0.8931192989937837, + "learning_rate": 0.0002535756154747948, + "loss": 3.4786667823791504, + "step": 1442, + "token_acc": 0.25215940034550405 + }, + { + "epoch": 0.8460861917326298, + "grad_norm": 1.228573698795334, + "learning_rate": 0.0002537514654161782, + "loss": 3.5324888229370117, + "step": 1443, + "token_acc": 0.2465188989406279 + }, + { + "epoch": 0.8466725300498388, + "grad_norm": 1.1340169813949923, + "learning_rate": 0.00025392731535756155, + "loss": 3.5123674869537354, + "step": 1444, + "token_acc": 0.24927085358740034 + }, + { + "epoch": 0.8472588683670478, + "grad_norm": 1.5219406858487863, + "learning_rate": 0.0002541031652989449, + "loss": 3.4856228828430176, + "step": 1445, + "token_acc": 0.2519374781168183 + }, + { + "epoch": 0.8478452066842568, + "grad_norm": 1.2730331229444234, + "learning_rate": 0.00025427901524032826, + "loss": 3.509390354156494, + "step": 1446, + "token_acc": 0.24904399460755258 + }, + { + "epoch": 0.8484315450014659, + "grad_norm": 1.3087532698596622, + "learning_rate": 0.0002544548651817116, + "loss": 3.457827568054199, + "step": 1447, + "token_acc": 0.2574022526492273 + }, + { + "epoch": 0.8490178833186749, + "grad_norm": 1.0173605101250207, + "learning_rate": 0.00025463071512309496, + "loss": 3.519615650177002, + "step": 1448, + "token_acc": 0.2480974204944206 + }, + { + "epoch": 0.8496042216358839, + "grad_norm": 1.2744942717422465, + "learning_rate": 0.0002548065650644783, + "loss": 3.4946985244750977, + "step": 1449, + "token_acc": 0.2520526711775526 + }, + { + "epoch": 0.8501905599530929, + "grad_norm": 1.0618891422752543, + "learning_rate": 0.0002549824150058616, + "loss": 3.4787182807922363, + "step": 1450, + "token_acc": 0.2524193865900232 + }, + { + "epoch": 0.850776898270302, + "grad_norm": 0.9765908292231744, + "learning_rate": 0.000255158264947245, + "loss": 3.4466562271118164, + "step": 1451, + "token_acc": 0.2585026765760615 + }, + { + "epoch": 0.851363236587511, + "grad_norm": 0.9419277848629152, + "learning_rate": 0.00025533411488862836, + "loss": 3.4913783073425293, + "step": 1452, + "token_acc": 0.2518612762745918 + }, + { + "epoch": 0.85194957490472, + "grad_norm": 1.2032763069489223, + "learning_rate": 0.0002555099648300117, + "loss": 3.507528781890869, + "step": 1453, + "token_acc": 0.250432230255326 + }, + { + "epoch": 0.852535913221929, + "grad_norm": 1.1184302831915998, + "learning_rate": 0.00025568581477139506, + "loss": 3.502519369125366, + "step": 1454, + "token_acc": 0.24943050925985508 + }, + { + "epoch": 0.8531222515391381, + "grad_norm": 1.2133641109908768, + "learning_rate": 0.0002558616647127784, + "loss": 3.4678733348846436, + "step": 1455, + "token_acc": 0.2559625269397266 + }, + { + "epoch": 0.8537085898563471, + "grad_norm": 1.3048733888561916, + "learning_rate": 0.00025603751465416176, + "loss": 3.3941495418548584, + "step": 1456, + "token_acc": 0.2624058203529579 + }, + { + "epoch": 0.8542949281735561, + "grad_norm": 0.9244945988282646, + "learning_rate": 0.0002562133645955451, + "loss": 3.4439454078674316, + "step": 1457, + "token_acc": 0.25687500498236904 + }, + { + "epoch": 0.8548812664907651, + "grad_norm": 1.1368826972867665, + "learning_rate": 0.00025638921453692847, + "loss": 3.504081964492798, + "step": 1458, + "token_acc": 0.2516719909315746 + }, + { + "epoch": 0.8554676048079742, + "grad_norm": 1.1832458636905463, + "learning_rate": 0.00025656506447831184, + "loss": 3.424797773361206, + "step": 1459, + "token_acc": 0.26008319627513493 + }, + { + "epoch": 0.8560539431251832, + "grad_norm": 1.3148642348600923, + "learning_rate": 0.00025674091441969517, + "loss": 3.488696813583374, + "step": 1460, + "token_acc": 0.25248551812633613 + }, + { + "epoch": 0.8566402814423922, + "grad_norm": 0.9801609890583167, + "learning_rate": 0.00025691676436107855, + "loss": 3.450690746307373, + "step": 1461, + "token_acc": 0.25557135958326627 + }, + { + "epoch": 0.8572266197596012, + "grad_norm": 1.227028490260521, + "learning_rate": 0.00025709261430246187, + "loss": 3.500920057296753, + "step": 1462, + "token_acc": 0.2508067284586337 + }, + { + "epoch": 0.8578129580768104, + "grad_norm": 1.0108741979870475, + "learning_rate": 0.00025726846424384525, + "loss": 3.5010433197021484, + "step": 1463, + "token_acc": 0.2509909872319119 + }, + { + "epoch": 0.8583992963940194, + "grad_norm": 1.1379483325284288, + "learning_rate": 0.00025744431418522857, + "loss": 3.4791853427886963, + "step": 1464, + "token_acc": 0.25361647491251477 + }, + { + "epoch": 0.8589856347112284, + "grad_norm": 1.020050248471449, + "learning_rate": 0.00025762016412661195, + "loss": 3.4631359577178955, + "step": 1465, + "token_acc": 0.2546912619533825 + }, + { + "epoch": 0.8595719730284375, + "grad_norm": 1.3453424739355984, + "learning_rate": 0.00025779601406799533, + "loss": 3.4491984844207764, + "step": 1466, + "token_acc": 0.25708798020709844 + }, + { + "epoch": 0.8601583113456465, + "grad_norm": 1.1373548787033998, + "learning_rate": 0.00025797186400937865, + "loss": 3.50360107421875, + "step": 1467, + "token_acc": 0.2490274178007018 + }, + { + "epoch": 0.8607446496628555, + "grad_norm": 1.2029776923453677, + "learning_rate": 0.000258147713950762, + "loss": 3.505368232727051, + "step": 1468, + "token_acc": 0.2486163043707882 + }, + { + "epoch": 0.8613309879800645, + "grad_norm": 1.0136470358274647, + "learning_rate": 0.00025832356389214535, + "loss": 3.478726863861084, + "step": 1469, + "token_acc": 0.25389309378519237 + }, + { + "epoch": 0.8619173262972736, + "grad_norm": 1.3792194661646142, + "learning_rate": 0.0002584994138335287, + "loss": 3.445176601409912, + "step": 1470, + "token_acc": 0.2578973587173398 + }, + { + "epoch": 0.8625036646144826, + "grad_norm": 1.148264023600593, + "learning_rate": 0.00025867526377491205, + "loss": 3.522228240966797, + "step": 1471, + "token_acc": 0.2464372049549487 + }, + { + "epoch": 0.8630900029316916, + "grad_norm": 1.0318068798469455, + "learning_rate": 0.0002588511137162954, + "loss": 3.5284390449523926, + "step": 1472, + "token_acc": 0.24818001323626737 + }, + { + "epoch": 0.8636763412489006, + "grad_norm": 1.2969094563008188, + "learning_rate": 0.00025902696365767876, + "loss": 3.4596054553985596, + "step": 1473, + "token_acc": 0.25765682803296125 + }, + { + "epoch": 0.8642626795661097, + "grad_norm": 1.1230446180911924, + "learning_rate": 0.00025920281359906213, + "loss": 3.5157618522644043, + "step": 1474, + "token_acc": 0.2497594150625521 + }, + { + "epoch": 0.8648490178833187, + "grad_norm": 1.287610683595465, + "learning_rate": 0.00025937866354044546, + "loss": 3.4883811473846436, + "step": 1475, + "token_acc": 0.2500610084099765 + }, + { + "epoch": 0.8654353562005277, + "grad_norm": 1.127778010493277, + "learning_rate": 0.00025955451348182884, + "loss": 3.4494807720184326, + "step": 1476, + "token_acc": 0.2569076566417355 + }, + { + "epoch": 0.8660216945177367, + "grad_norm": 1.2590031047059904, + "learning_rate": 0.00025973036342321216, + "loss": 3.4717233180999756, + "step": 1477, + "token_acc": 0.25256045831364743 + }, + { + "epoch": 0.8666080328349458, + "grad_norm": 1.0091424360495187, + "learning_rate": 0.00025990621336459554, + "loss": 3.5164389610290527, + "step": 1478, + "token_acc": 0.24728033472803349 + }, + { + "epoch": 0.8671943711521548, + "grad_norm": 1.1052879637029678, + "learning_rate": 0.00026008206330597886, + "loss": 3.457829475402832, + "step": 1479, + "token_acc": 0.25569888995534784 + }, + { + "epoch": 0.8677807094693638, + "grad_norm": 1.1066971925333116, + "learning_rate": 0.00026025791324736224, + "loss": 3.4888057708740234, + "step": 1480, + "token_acc": 0.25156994341360706 + }, + { + "epoch": 0.8683670477865728, + "grad_norm": 1.390212179035056, + "learning_rate": 0.0002604337631887456, + "loss": 3.4768948554992676, + "step": 1481, + "token_acc": 0.2531208139395487 + }, + { + "epoch": 0.8689533861037819, + "grad_norm": 1.1820990083916247, + "learning_rate": 0.00026060961313012894, + "loss": 3.4105687141418457, + "step": 1482, + "token_acc": 0.2611815216069455 + }, + { + "epoch": 0.8695397244209909, + "grad_norm": 1.0701695103759878, + "learning_rate": 0.0002607854630715123, + "loss": 3.462088108062744, + "step": 1483, + "token_acc": 0.2538903665261109 + }, + { + "epoch": 0.8701260627381999, + "grad_norm": 1.1792212192386595, + "learning_rate": 0.00026096131301289564, + "loss": 3.4792885780334473, + "step": 1484, + "token_acc": 0.25245618275050435 + }, + { + "epoch": 0.8707124010554089, + "grad_norm": 0.9898559172407613, + "learning_rate": 0.00026113716295427897, + "loss": 3.492661952972412, + "step": 1485, + "token_acc": 0.25025937611398713 + }, + { + "epoch": 0.871298739372618, + "grad_norm": 1.212503342664103, + "learning_rate": 0.00026131301289566234, + "loss": 3.5061206817626953, + "step": 1486, + "token_acc": 0.24855186662942017 + }, + { + "epoch": 0.871885077689827, + "grad_norm": 1.2683829743754083, + "learning_rate": 0.00026148886283704567, + "loss": 3.4554123878479004, + "step": 1487, + "token_acc": 0.2552584960599458 + }, + { + "epoch": 0.872471416007036, + "grad_norm": 1.1643304855501575, + "learning_rate": 0.00026166471277842905, + "loss": 3.4975650310516357, + "step": 1488, + "token_acc": 0.2510860008635983 + }, + { + "epoch": 0.873057754324245, + "grad_norm": 1.0068136487743475, + "learning_rate": 0.00026184056271981237, + "loss": 3.483488082885742, + "step": 1489, + "token_acc": 0.25227338553748446 + }, + { + "epoch": 0.8736440926414542, + "grad_norm": 1.2632821978307662, + "learning_rate": 0.00026201641266119575, + "loss": 3.4896020889282227, + "step": 1490, + "token_acc": 0.25088503419267155 + }, + { + "epoch": 0.8742304309586632, + "grad_norm": 0.9838168543250088, + "learning_rate": 0.0002621922626025791, + "loss": 3.4653091430664062, + "step": 1491, + "token_acc": 0.2544072444359416 + }, + { + "epoch": 0.8748167692758722, + "grad_norm": 1.471163288906277, + "learning_rate": 0.00026236811254396245, + "loss": 3.4483559131622314, + "step": 1492, + "token_acc": 0.2559266484011058 + }, + { + "epoch": 0.8754031075930812, + "grad_norm": 1.101568292463219, + "learning_rate": 0.00026254396248534583, + "loss": 3.469982862472534, + "step": 1493, + "token_acc": 0.2517078089365656 + }, + { + "epoch": 0.8759894459102903, + "grad_norm": 1.2615179350877874, + "learning_rate": 0.00026271981242672915, + "loss": 3.4580917358398438, + "step": 1494, + "token_acc": 0.25682331239959877 + }, + { + "epoch": 0.8765757842274993, + "grad_norm": 1.06422370883166, + "learning_rate": 0.00026289566236811253, + "loss": 3.4955356121063232, + "step": 1495, + "token_acc": 0.25027668230636135 + }, + { + "epoch": 0.8771621225447083, + "grad_norm": 1.2603191881587898, + "learning_rate": 0.00026307151230949585, + "loss": 3.482748508453369, + "step": 1496, + "token_acc": 0.2543479033133891 + }, + { + "epoch": 0.8777484608619174, + "grad_norm": 0.9783108614081832, + "learning_rate": 0.00026324736225087923, + "loss": 3.5418219566345215, + "step": 1497, + "token_acc": 0.2436677761502971 + }, + { + "epoch": 0.8783347991791264, + "grad_norm": 1.2429591111490474, + "learning_rate": 0.0002634232121922626, + "loss": 3.4759297370910645, + "step": 1498, + "token_acc": 0.2520641356431994 + }, + { + "epoch": 0.8789211374963354, + "grad_norm": 1.0581699407986824, + "learning_rate": 0.00026359906213364593, + "loss": 3.494903326034546, + "step": 1499, + "token_acc": 0.2509341234861309 + }, + { + "epoch": 0.8795074758135444, + "grad_norm": 0.8789727902423904, + "learning_rate": 0.0002637749120750293, + "loss": 3.4828197956085205, + "step": 1500, + "token_acc": 0.25208624216644476 + }, + { + "epoch": 0.8800938141307535, + "grad_norm": 0.9957855287579016, + "learning_rate": 0.00026395076201641264, + "loss": 3.466064453125, + "step": 1501, + "token_acc": 0.2543825885223967 + }, + { + "epoch": 0.8806801524479625, + "grad_norm": 1.1784501014555773, + "learning_rate": 0.00026412661195779596, + "loss": 3.4562320709228516, + "step": 1502, + "token_acc": 0.25585390534911634 + }, + { + "epoch": 0.8812664907651715, + "grad_norm": 1.2146156037236144, + "learning_rate": 0.00026430246189917934, + "loss": 3.491581439971924, + "step": 1503, + "token_acc": 0.2509083900809457 + }, + { + "epoch": 0.8818528290823805, + "grad_norm": 1.2503935349588382, + "learning_rate": 0.00026447831184056266, + "loss": 3.500335216522217, + "step": 1504, + "token_acc": 0.2490078088268899 + }, + { + "epoch": 0.8824391673995896, + "grad_norm": 0.808137274840268, + "learning_rate": 0.00026465416178194604, + "loss": 3.4292941093444824, + "step": 1505, + "token_acc": 0.2593210806571452 + }, + { + "epoch": 0.8830255057167986, + "grad_norm": 1.1391797718055179, + "learning_rate": 0.0002648300117233294, + "loss": 3.427213191986084, + "step": 1506, + "token_acc": 0.2594526072285175 + }, + { + "epoch": 0.8836118440340076, + "grad_norm": 1.0590003334045317, + "learning_rate": 0.00026500586166471274, + "loss": 3.5022454261779785, + "step": 1507, + "token_acc": 0.24925489858827135 + }, + { + "epoch": 0.8841981823512166, + "grad_norm": 1.1006071145397718, + "learning_rate": 0.0002651817116060961, + "loss": 3.477682590484619, + "step": 1508, + "token_acc": 0.2546573643082078 + }, + { + "epoch": 0.8847845206684257, + "grad_norm": 1.3300564427698305, + "learning_rate": 0.00026535756154747944, + "loss": 3.443197250366211, + "step": 1509, + "token_acc": 0.2561571114761236 + }, + { + "epoch": 0.8853708589856347, + "grad_norm": 1.0665700378584235, + "learning_rate": 0.0002655334114888628, + "loss": 3.4111695289611816, + "step": 1510, + "token_acc": 0.263408075437938 + }, + { + "epoch": 0.8859571973028437, + "grad_norm": 1.0033152514507064, + "learning_rate": 0.00026570926143024614, + "loss": 3.500072956085205, + "step": 1511, + "token_acc": 0.2512841083393881 + }, + { + "epoch": 0.8865435356200527, + "grad_norm": 1.1281883883466841, + "learning_rate": 0.0002658851113716295, + "loss": 3.456676721572876, + "step": 1512, + "token_acc": 0.2554430868374071 + }, + { + "epoch": 0.8871298739372618, + "grad_norm": 1.2163668516851136, + "learning_rate": 0.0002660609613130129, + "loss": 3.4494264125823975, + "step": 1513, + "token_acc": 0.25702038132807364 + }, + { + "epoch": 0.8877162122544708, + "grad_norm": 0.9904118753122958, + "learning_rate": 0.0002662368112543962, + "loss": 3.5263233184814453, + "step": 1514, + "token_acc": 0.24631515893651815 + }, + { + "epoch": 0.8883025505716798, + "grad_norm": 1.1719530863935192, + "learning_rate": 0.0002664126611957796, + "loss": 3.51806902885437, + "step": 1515, + "token_acc": 0.2499573497832286 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.9593989649835573, + "learning_rate": 0.0002665885111371629, + "loss": 3.4662561416625977, + "step": 1516, + "token_acc": 0.2530683585629148 + }, + { + "epoch": 0.889475227206098, + "grad_norm": 1.197077472172848, + "learning_rate": 0.0002667643610785463, + "loss": 3.467747449874878, + "step": 1517, + "token_acc": 0.2523390872657972 + }, + { + "epoch": 0.890061565523307, + "grad_norm": 0.933390047550227, + "learning_rate": 0.00026694021101992963, + "loss": 3.4822099208831787, + "step": 1518, + "token_acc": 0.25373968487202025 + }, + { + "epoch": 0.890647903840516, + "grad_norm": 1.1438148656892082, + "learning_rate": 0.000267116060961313, + "loss": 3.454476833343506, + "step": 1519, + "token_acc": 0.2563312180513519 + }, + { + "epoch": 0.891234242157725, + "grad_norm": 1.1179665355696513, + "learning_rate": 0.00026729191090269633, + "loss": 3.4433393478393555, + "step": 1520, + "token_acc": 0.2554607718712013 + }, + { + "epoch": 0.8918205804749341, + "grad_norm": 0.944106697077151, + "learning_rate": 0.0002674677608440797, + "loss": 3.4730758666992188, + "step": 1521, + "token_acc": 0.2535616381458714 + }, + { + "epoch": 0.8924069187921431, + "grad_norm": 1.1242940117241675, + "learning_rate": 0.00026764361078546303, + "loss": 3.5014383792877197, + "step": 1522, + "token_acc": 0.2501024229977295 + }, + { + "epoch": 0.8929932571093521, + "grad_norm": 1.056202439618883, + "learning_rate": 0.0002678194607268464, + "loss": 3.487800121307373, + "step": 1523, + "token_acc": 0.2504834817899256 + }, + { + "epoch": 0.8935795954265612, + "grad_norm": 1.1450304903088528, + "learning_rate": 0.00026799531066822973, + "loss": 3.4891011714935303, + "step": 1524, + "token_acc": 0.2487708844090349 + }, + { + "epoch": 0.8941659337437702, + "grad_norm": 1.1963699191797805, + "learning_rate": 0.0002681711606096131, + "loss": 3.471309185028076, + "step": 1525, + "token_acc": 0.25321512030353027 + }, + { + "epoch": 0.8947522720609792, + "grad_norm": 1.2911891429700755, + "learning_rate": 0.00026834701055099643, + "loss": 3.4688570499420166, + "step": 1526, + "token_acc": 0.2543395530606078 + }, + { + "epoch": 0.8953386103781882, + "grad_norm": 1.3541671300085827, + "learning_rate": 0.0002685228604923798, + "loss": 3.4217777252197266, + "step": 1527, + "token_acc": 0.25732142290147436 + }, + { + "epoch": 0.8959249486953973, + "grad_norm": 1.0340294987781176, + "learning_rate": 0.0002686987104337632, + "loss": 3.526620864868164, + "step": 1528, + "token_acc": 0.24828612247415052 + }, + { + "epoch": 0.8965112870126063, + "grad_norm": 1.1843352082829566, + "learning_rate": 0.0002688745603751465, + "loss": 3.4540655612945557, + "step": 1529, + "token_acc": 0.2562146528039365 + }, + { + "epoch": 0.8970976253298153, + "grad_norm": 0.9312341910820744, + "learning_rate": 0.0002690504103165299, + "loss": 3.4109597206115723, + "step": 1530, + "token_acc": 0.26077811972492254 + }, + { + "epoch": 0.8976839636470243, + "grad_norm": 1.1389510210835876, + "learning_rate": 0.0002692262602579132, + "loss": 3.4806084632873535, + "step": 1531, + "token_acc": 0.25224521258902977 + }, + { + "epoch": 0.8982703019642334, + "grad_norm": 1.204245093576224, + "learning_rate": 0.0002694021101992966, + "loss": 3.506807327270508, + "step": 1532, + "token_acc": 0.24877111207612038 + }, + { + "epoch": 0.8988566402814424, + "grad_norm": 0.9354575749316849, + "learning_rate": 0.0002695779601406799, + "loss": 3.431272029876709, + "step": 1533, + "token_acc": 0.2572023602915654 + }, + { + "epoch": 0.8994429785986514, + "grad_norm": 1.0973157385520398, + "learning_rate": 0.0002697538100820633, + "loss": 3.4384636878967285, + "step": 1534, + "token_acc": 0.2583302555720576 + }, + { + "epoch": 0.9000293169158604, + "grad_norm": 1.2470823605175088, + "learning_rate": 0.0002699296600234467, + "loss": 3.445343494415283, + "step": 1535, + "token_acc": 0.25776050573665904 + }, + { + "epoch": 0.9006156552330695, + "grad_norm": 1.21117306060003, + "learning_rate": 0.00027010550996483, + "loss": 3.459315538406372, + "step": 1536, + "token_acc": 0.254273623326833 + }, + { + "epoch": 0.9012019935502785, + "grad_norm": 1.2707438527261994, + "learning_rate": 0.0002702813599062134, + "loss": 3.4351089000701904, + "step": 1537, + "token_acc": 0.2573327377810262 + }, + { + "epoch": 0.9017883318674875, + "grad_norm": 1.0563873238335013, + "learning_rate": 0.0002704572098475967, + "loss": 3.4426050186157227, + "step": 1538, + "token_acc": 0.25467843458135725 + }, + { + "epoch": 0.9023746701846965, + "grad_norm": 1.301847498796856, + "learning_rate": 0.00027063305978898, + "loss": 3.4981689453125, + "step": 1539, + "token_acc": 0.2497133292035165 + }, + { + "epoch": 0.9029610085019056, + "grad_norm": 0.9476036329108056, + "learning_rate": 0.0002708089097303634, + "loss": 3.442716121673584, + "step": 1540, + "token_acc": 0.2575194305277856 + }, + { + "epoch": 0.9035473468191146, + "grad_norm": 1.0257906465617297, + "learning_rate": 0.0002709847596717467, + "loss": 3.4262518882751465, + "step": 1541, + "token_acc": 0.25883133784487244 + }, + { + "epoch": 0.9041336851363236, + "grad_norm": 1.3430203391119773, + "learning_rate": 0.0002711606096131301, + "loss": 3.4443607330322266, + "step": 1542, + "token_acc": 0.2563439717168789 + }, + { + "epoch": 0.9047200234535326, + "grad_norm": 0.997431049613658, + "learning_rate": 0.0002713364595545134, + "loss": 3.441514015197754, + "step": 1543, + "token_acc": 0.25662779507196354 + }, + { + "epoch": 0.9053063617707418, + "grad_norm": 1.4781970651884355, + "learning_rate": 0.0002715123094958968, + "loss": 3.4904818534851074, + "step": 1544, + "token_acc": 0.25204524014495816 + }, + { + "epoch": 0.9058927000879508, + "grad_norm": 0.898564746248468, + "learning_rate": 0.0002716881594372802, + "loss": 3.405942916870117, + "step": 1545, + "token_acc": 0.2619123811122594 + }, + { + "epoch": 0.9064790384051598, + "grad_norm": 1.2525403228685923, + "learning_rate": 0.0002718640093786635, + "loss": 3.3839802742004395, + "step": 1546, + "token_acc": 0.26291074945578935 + }, + { + "epoch": 0.9070653767223688, + "grad_norm": 1.1600874729832094, + "learning_rate": 0.0002720398593200469, + "loss": 3.4191513061523438, + "step": 1547, + "token_acc": 0.260927032794369 + }, + { + "epoch": 0.9076517150395779, + "grad_norm": 1.1537609840819099, + "learning_rate": 0.0002722157092614302, + "loss": 3.491884231567383, + "step": 1548, + "token_acc": 0.2488000781037672 + }, + { + "epoch": 0.9082380533567869, + "grad_norm": 0.9037660294633072, + "learning_rate": 0.0002723915592028136, + "loss": 3.4326887130737305, + "step": 1549, + "token_acc": 0.25638336083622676 + }, + { + "epoch": 0.9088243916739959, + "grad_norm": 0.9686272469080168, + "learning_rate": 0.0002725674091441969, + "loss": 3.4177675247192383, + "step": 1550, + "token_acc": 0.26115206565009647 + }, + { + "epoch": 0.909410729991205, + "grad_norm": 0.9481275062618324, + "learning_rate": 0.0002727432590855803, + "loss": 3.45444393157959, + "step": 1551, + "token_acc": 0.25564734718585236 + }, + { + "epoch": 0.909997068308414, + "grad_norm": 1.129928778529221, + "learning_rate": 0.00027291910902696367, + "loss": 3.4867782592773438, + "step": 1552, + "token_acc": 0.250120702029099 + }, + { + "epoch": 0.910583406625623, + "grad_norm": 1.1799087427895225, + "learning_rate": 0.000273094958968347, + "loss": 3.4504122734069824, + "step": 1553, + "token_acc": 0.25701350966743186 + }, + { + "epoch": 0.911169744942832, + "grad_norm": 1.1453521579593329, + "learning_rate": 0.00027327080890973037, + "loss": 3.485167980194092, + "step": 1554, + "token_acc": 0.2528238922868001 + }, + { + "epoch": 0.9117560832600411, + "grad_norm": 1.0363219830339416, + "learning_rate": 0.0002734466588511137, + "loss": 3.503322124481201, + "step": 1555, + "token_acc": 0.24887578345660666 + }, + { + "epoch": 0.9123424215772501, + "grad_norm": 1.1860029539500596, + "learning_rate": 0.000273622508792497, + "loss": 3.515435218811035, + "step": 1556, + "token_acc": 0.24652886453957976 + }, + { + "epoch": 0.9129287598944591, + "grad_norm": 1.0213078858930382, + "learning_rate": 0.0002737983587338804, + "loss": 3.53326153755188, + "step": 1557, + "token_acc": 0.24519392484369182 + }, + { + "epoch": 0.9135150982116681, + "grad_norm": 1.0297035281757607, + "learning_rate": 0.0002739742086752637, + "loss": 3.4802563190460205, + "step": 1558, + "token_acc": 0.2525334820568045 + }, + { + "epoch": 0.9141014365288772, + "grad_norm": 1.1029739860383216, + "learning_rate": 0.0002741500586166471, + "loss": 3.4929981231689453, + "step": 1559, + "token_acc": 0.24966556494526077 + }, + { + "epoch": 0.9146877748460862, + "grad_norm": 0.9430169525700198, + "learning_rate": 0.0002743259085580305, + "loss": 3.467041492462158, + "step": 1560, + "token_acc": 0.2545945973305529 + }, + { + "epoch": 0.9152741131632952, + "grad_norm": 1.2943235615356024, + "learning_rate": 0.0002745017584994138, + "loss": 3.38330340385437, + "step": 1561, + "token_acc": 0.2645694722901279 + }, + { + "epoch": 0.9158604514805042, + "grad_norm": 0.9078577818826634, + "learning_rate": 0.0002746776084407972, + "loss": 3.4435367584228516, + "step": 1562, + "token_acc": 0.25605095205164885 + }, + { + "epoch": 0.9164467897977133, + "grad_norm": 1.0582256931438696, + "learning_rate": 0.0002748534583821805, + "loss": 3.4161782264709473, + "step": 1563, + "token_acc": 0.261538744389201 + }, + { + "epoch": 0.9170331281149223, + "grad_norm": 0.9571577507235014, + "learning_rate": 0.0002750293083235639, + "loss": 3.3922221660614014, + "step": 1564, + "token_acc": 0.26461262143024716 + }, + { + "epoch": 0.9176194664321313, + "grad_norm": 1.1759342304557798, + "learning_rate": 0.0002752051582649472, + "loss": 3.4620676040649414, + "step": 1565, + "token_acc": 0.2534643031075604 + }, + { + "epoch": 0.9182058047493403, + "grad_norm": 1.1334047859869674, + "learning_rate": 0.0002753810082063306, + "loss": 3.427372694015503, + "step": 1566, + "token_acc": 0.2580983700096032 + }, + { + "epoch": 0.9187921430665494, + "grad_norm": 1.3161157829942292, + "learning_rate": 0.00027555685814771396, + "loss": 3.5011913776397705, + "step": 1567, + "token_acc": 0.24847892459419685 + }, + { + "epoch": 0.9193784813837584, + "grad_norm": 1.0831452504834898, + "learning_rate": 0.0002757327080890973, + "loss": 3.4381520748138428, + "step": 1568, + "token_acc": 0.25972817126945436 + }, + { + "epoch": 0.9199648197009674, + "grad_norm": 1.0470354314887476, + "learning_rate": 0.00027590855803048066, + "loss": 3.4123809337615967, + "step": 1569, + "token_acc": 0.2594358897434969 + }, + { + "epoch": 0.9205511580181764, + "grad_norm": 1.0322005775210283, + "learning_rate": 0.000276084407971864, + "loss": 3.4311747550964355, + "step": 1570, + "token_acc": 0.25962873549108667 + }, + { + "epoch": 0.9211374963353856, + "grad_norm": 1.3650104051929608, + "learning_rate": 0.00027626025791324736, + "loss": 3.44868803024292, + "step": 1571, + "token_acc": 0.25470813620129956 + }, + { + "epoch": 0.9217238346525946, + "grad_norm": 1.0078466222217648, + "learning_rate": 0.0002764361078546307, + "loss": 3.454195976257324, + "step": 1572, + "token_acc": 0.25561077530713333 + }, + { + "epoch": 0.9223101729698036, + "grad_norm": 1.1215704139131795, + "learning_rate": 0.000276611957796014, + "loss": 3.435854911804199, + "step": 1573, + "token_acc": 0.2560089361284383 + }, + { + "epoch": 0.9228965112870126, + "grad_norm": 1.1436980541290278, + "learning_rate": 0.0002767878077373974, + "loss": 3.4624428749084473, + "step": 1574, + "token_acc": 0.2537039373891008 + }, + { + "epoch": 0.9234828496042217, + "grad_norm": 1.213858884230976, + "learning_rate": 0.00027696365767878076, + "loss": 3.419650077819824, + "step": 1575, + "token_acc": 0.25872801152678004 + }, + { + "epoch": 0.9240691879214307, + "grad_norm": 0.9928902206662252, + "learning_rate": 0.0002771395076201641, + "loss": 3.447695016860962, + "step": 1576, + "token_acc": 0.25604969877128303 + }, + { + "epoch": 0.9246555262386397, + "grad_norm": 1.1170179466519534, + "learning_rate": 0.00027731535756154747, + "loss": 3.4231395721435547, + "step": 1577, + "token_acc": 0.2588927730986735 + }, + { + "epoch": 0.9252418645558487, + "grad_norm": 0.8570173868715074, + "learning_rate": 0.0002774912075029308, + "loss": 3.5291895866394043, + "step": 1578, + "token_acc": 0.24584225128814902 + }, + { + "epoch": 0.9258282028730578, + "grad_norm": 0.9676214841379986, + "learning_rate": 0.00027766705744431417, + "loss": 3.414877414703369, + "step": 1579, + "token_acc": 0.2584987040285545 + }, + { + "epoch": 0.9264145411902668, + "grad_norm": 1.1782797758324282, + "learning_rate": 0.0002778429073856975, + "loss": 3.4611570835113525, + "step": 1580, + "token_acc": 0.2526968036408508 + }, + { + "epoch": 0.9270008795074758, + "grad_norm": 0.8791078117414856, + "learning_rate": 0.00027801875732708087, + "loss": 3.416090965270996, + "step": 1581, + "token_acc": 0.25959511297139976 + }, + { + "epoch": 0.9275872178246849, + "grad_norm": 0.91160368854889, + "learning_rate": 0.00027819460726846425, + "loss": 3.4486334323883057, + "step": 1582, + "token_acc": 0.2559706543219903 + }, + { + "epoch": 0.9281735561418939, + "grad_norm": 1.04591211316074, + "learning_rate": 0.00027837045720984757, + "loss": 3.436034679412842, + "step": 1583, + "token_acc": 0.2574375156626383 + }, + { + "epoch": 0.9287598944591029, + "grad_norm": 0.9027223087739822, + "learning_rate": 0.00027854630715123095, + "loss": 3.4619922637939453, + "step": 1584, + "token_acc": 0.25507177510212525 + }, + { + "epoch": 0.9293462327763119, + "grad_norm": 1.0490646679277922, + "learning_rate": 0.00027872215709261427, + "loss": 3.4477930068969727, + "step": 1585, + "token_acc": 0.2542764857881137 + }, + { + "epoch": 0.929932571093521, + "grad_norm": 0.9890043199810226, + "learning_rate": 0.00027889800703399765, + "loss": 3.4564199447631836, + "step": 1586, + "token_acc": 0.2542775148070373 + }, + { + "epoch": 0.93051890941073, + "grad_norm": 1.114124375382977, + "learning_rate": 0.000279073856975381, + "loss": 3.4412851333618164, + "step": 1587, + "token_acc": 0.2556504217042669 + }, + { + "epoch": 0.931105247727939, + "grad_norm": 1.2382464201645553, + "learning_rate": 0.00027924970691676435, + "loss": 3.4975357055664062, + "step": 1588, + "token_acc": 0.24976999841018732 + }, + { + "epoch": 0.931691586045148, + "grad_norm": 1.0018365063275843, + "learning_rate": 0.00027942555685814773, + "loss": 3.4665846824645996, + "step": 1589, + "token_acc": 0.25362675646197996 + }, + { + "epoch": 0.9322779243623571, + "grad_norm": 1.0706134072461655, + "learning_rate": 0.00027960140679953105, + "loss": 3.4295220375061035, + "step": 1590, + "token_acc": 0.2591023108282688 + }, + { + "epoch": 0.9328642626795661, + "grad_norm": 1.1521618515120322, + "learning_rate": 0.0002797772567409144, + "loss": 3.4484362602233887, + "step": 1591, + "token_acc": 0.2554142581888247 + }, + { + "epoch": 0.9334506009967751, + "grad_norm": 1.0522691885661983, + "learning_rate": 0.00027995310668229776, + "loss": 3.455878496170044, + "step": 1592, + "token_acc": 0.25255059935216323 + }, + { + "epoch": 0.9340369393139841, + "grad_norm": 0.8390656539948494, + "learning_rate": 0.0002801289566236811, + "loss": 3.419095039367676, + "step": 1593, + "token_acc": 0.26043699919502916 + }, + { + "epoch": 0.9346232776311932, + "grad_norm": 1.0104283054265661, + "learning_rate": 0.00028030480656506446, + "loss": 3.393857002258301, + "step": 1594, + "token_acc": 0.2604841721112415 + }, + { + "epoch": 0.9352096159484022, + "grad_norm": 1.273394420912752, + "learning_rate": 0.0002804806565064478, + "loss": 3.4570064544677734, + "step": 1595, + "token_acc": 0.2550111272425326 + }, + { + "epoch": 0.9357959542656112, + "grad_norm": 0.8500555303463819, + "learning_rate": 0.00028065650644783116, + "loss": 3.3855323791503906, + "step": 1596, + "token_acc": 0.26343645660142206 + }, + { + "epoch": 0.9363822925828202, + "grad_norm": 0.9823160060984943, + "learning_rate": 0.0002808323563892145, + "loss": 3.4690091609954834, + "step": 1597, + "token_acc": 0.25358774294978353 + }, + { + "epoch": 0.9369686309000294, + "grad_norm": 1.0324244245829983, + "learning_rate": 0.00028100820633059786, + "loss": 3.4059247970581055, + "step": 1598, + "token_acc": 0.25973350998798966 + }, + { + "epoch": 0.9375549692172384, + "grad_norm": 1.1353441800518547, + "learning_rate": 0.00028118405627198124, + "loss": 3.423762321472168, + "step": 1599, + "token_acc": 0.25978963105344655 + }, + { + "epoch": 0.9381413075344474, + "grad_norm": 0.9101082160968709, + "learning_rate": 0.00028135990621336456, + "loss": 3.438351631164551, + "step": 1600, + "token_acc": 0.25616560754659295 + }, + { + "epoch": 0.9387276458516564, + "grad_norm": 0.8532083561356912, + "learning_rate": 0.00028153575615474794, + "loss": 3.45070743560791, + "step": 1601, + "token_acc": 0.25418973383457893 + }, + { + "epoch": 0.9393139841688655, + "grad_norm": 0.9842250500653741, + "learning_rate": 0.00028171160609613126, + "loss": 3.3930931091308594, + "step": 1602, + "token_acc": 0.26322528928287 + }, + { + "epoch": 0.9399003224860745, + "grad_norm": 1.1484976018054966, + "learning_rate": 0.00028188745603751464, + "loss": 3.463414192199707, + "step": 1603, + "token_acc": 0.2519882191284152 + }, + { + "epoch": 0.9404866608032835, + "grad_norm": 1.0558181128485224, + "learning_rate": 0.00028206330597889797, + "loss": 3.4456119537353516, + "step": 1604, + "token_acc": 0.2534280556385314 + }, + { + "epoch": 0.9410729991204925, + "grad_norm": 1.395649076515183, + "learning_rate": 0.00028223915592028134, + "loss": 3.5010666847229004, + "step": 1605, + "token_acc": 0.2490527628416582 + }, + { + "epoch": 0.9416593374377016, + "grad_norm": 1.1699134005984961, + "learning_rate": 0.0002824150058616647, + "loss": 3.485076904296875, + "step": 1606, + "token_acc": 0.2505359893426585 + }, + { + "epoch": 0.9422456757549106, + "grad_norm": 1.0050844548368318, + "learning_rate": 0.00028259085580304805, + "loss": 3.414064884185791, + "step": 1607, + "token_acc": 0.2585769322525138 + }, + { + "epoch": 0.9428320140721196, + "grad_norm": 1.0017520470621288, + "learning_rate": 0.0002827667057444314, + "loss": 3.426929473876953, + "step": 1608, + "token_acc": 0.25771782568771745 + }, + { + "epoch": 0.9434183523893287, + "grad_norm": 1.0464793362631202, + "learning_rate": 0.00028294255568581475, + "loss": 3.459712505340576, + "step": 1609, + "token_acc": 0.2544558884700207 + }, + { + "epoch": 0.9440046907065377, + "grad_norm": 0.8884348039241726, + "learning_rate": 0.00028311840562719807, + "loss": 3.4620704650878906, + "step": 1610, + "token_acc": 0.25350297036689257 + }, + { + "epoch": 0.9445910290237467, + "grad_norm": 1.151615454096963, + "learning_rate": 0.00028329425556858145, + "loss": 3.430426597595215, + "step": 1611, + "token_acc": 0.25567336001852525 + }, + { + "epoch": 0.9451773673409557, + "grad_norm": 1.1225200597210718, + "learning_rate": 0.0002834701055099648, + "loss": 3.4575791358947754, + "step": 1612, + "token_acc": 0.25634345896767924 + }, + { + "epoch": 0.9457637056581648, + "grad_norm": 1.056428813670255, + "learning_rate": 0.00028364595545134815, + "loss": 3.4641103744506836, + "step": 1613, + "token_acc": 0.25412191202893847 + }, + { + "epoch": 0.9463500439753738, + "grad_norm": 1.2067914765995427, + "learning_rate": 0.00028382180539273153, + "loss": 3.397404193878174, + "step": 1614, + "token_acc": 0.2611007607514361 + }, + { + "epoch": 0.9469363822925828, + "grad_norm": 0.8468931035795969, + "learning_rate": 0.00028399765533411485, + "loss": 3.431407928466797, + "step": 1615, + "token_acc": 0.2580003772280029 + }, + { + "epoch": 0.9475227206097918, + "grad_norm": 0.6781329738531857, + "learning_rate": 0.00028417350527549823, + "loss": 3.418238401412964, + "step": 1616, + "token_acc": 0.25974239546751826 + }, + { + "epoch": 0.9481090589270009, + "grad_norm": 0.8887112611673056, + "learning_rate": 0.00028434935521688156, + "loss": 3.4343841075897217, + "step": 1617, + "token_acc": 0.25546847490903657 + }, + { + "epoch": 0.9486953972442099, + "grad_norm": 0.9606695431904696, + "learning_rate": 0.00028452520515826493, + "loss": 3.4573464393615723, + "step": 1618, + "token_acc": 0.2544670120243435 + }, + { + "epoch": 0.9492817355614189, + "grad_norm": 0.9466212276822745, + "learning_rate": 0.00028470105509964826, + "loss": 3.428793430328369, + "step": 1619, + "token_acc": 0.2570824747714905 + }, + { + "epoch": 0.9498680738786279, + "grad_norm": 1.247266489564097, + "learning_rate": 0.00028487690504103163, + "loss": 3.483790636062622, + "step": 1620, + "token_acc": 0.2524136429156599 + }, + { + "epoch": 0.950454412195837, + "grad_norm": 0.9881453608434886, + "learning_rate": 0.000285052754982415, + "loss": 3.4680538177490234, + "step": 1621, + "token_acc": 0.25247324958089284 + }, + { + "epoch": 0.951040750513046, + "grad_norm": 1.2373340402330901, + "learning_rate": 0.00028522860492379834, + "loss": 3.401545763015747, + "step": 1622, + "token_acc": 0.25966873933282625 + }, + { + "epoch": 0.951627088830255, + "grad_norm": 1.0211834418326686, + "learning_rate": 0.0002854044548651817, + "loss": 3.4372189044952393, + "step": 1623, + "token_acc": 0.25602884016106214 + }, + { + "epoch": 0.952213427147464, + "grad_norm": 1.0192162955522874, + "learning_rate": 0.00028558030480656504, + "loss": 3.450834274291992, + "step": 1624, + "token_acc": 0.25508543961123 + }, + { + "epoch": 0.9527997654646732, + "grad_norm": 1.1325298124281291, + "learning_rate": 0.0002857561547479484, + "loss": 3.4558000564575195, + "step": 1625, + "token_acc": 0.25267409347121744 + }, + { + "epoch": 0.9533861037818822, + "grad_norm": 1.099952880945093, + "learning_rate": 0.00028593200468933174, + "loss": 3.42177152633667, + "step": 1626, + "token_acc": 0.258832141675274 + }, + { + "epoch": 0.9539724420990912, + "grad_norm": 0.8946557370056859, + "learning_rate": 0.00028610785463071506, + "loss": 3.3207759857177734, + "step": 1627, + "token_acc": 0.2713037641456994 + }, + { + "epoch": 0.9545587804163002, + "grad_norm": 0.8901828893814432, + "learning_rate": 0.00028628370457209844, + "loss": 3.427022695541382, + "step": 1628, + "token_acc": 0.2590191883936757 + }, + { + "epoch": 0.9551451187335093, + "grad_norm": 1.1268292660227681, + "learning_rate": 0.00028645955451348177, + "loss": 3.4051380157470703, + "step": 1629, + "token_acc": 0.2599669184799224 + }, + { + "epoch": 0.9557314570507183, + "grad_norm": 1.0518598816999978, + "learning_rate": 0.00028663540445486514, + "loss": 3.4638655185699463, + "step": 1630, + "token_acc": 0.2531246365164797 + }, + { + "epoch": 0.9563177953679273, + "grad_norm": 1.0118229097954303, + "learning_rate": 0.0002868112543962485, + "loss": 3.407078266143799, + "step": 1631, + "token_acc": 0.2619734458121537 + }, + { + "epoch": 0.9569041336851363, + "grad_norm": 1.0145122324754978, + "learning_rate": 0.00028698710433763185, + "loss": 3.438203811645508, + "step": 1632, + "token_acc": 0.25449237586341716 + }, + { + "epoch": 0.9574904720023454, + "grad_norm": 1.2528340922136278, + "learning_rate": 0.0002871629542790152, + "loss": 3.444272518157959, + "step": 1633, + "token_acc": 0.25588460659506107 + }, + { + "epoch": 0.9580768103195544, + "grad_norm": 0.9964204749519147, + "learning_rate": 0.00028733880422039855, + "loss": 3.477262258529663, + "step": 1634, + "token_acc": 0.2513322627498312 + }, + { + "epoch": 0.9586631486367634, + "grad_norm": 1.1931086643729485, + "learning_rate": 0.0002875146541617819, + "loss": 3.4402215480804443, + "step": 1635, + "token_acc": 0.25753290746181273 + }, + { + "epoch": 0.9592494869539725, + "grad_norm": 0.8850779833023668, + "learning_rate": 0.00028769050410316525, + "loss": 3.4351909160614014, + "step": 1636, + "token_acc": 0.2557032609064188 + }, + { + "epoch": 0.9598358252711815, + "grad_norm": 0.8319326232433024, + "learning_rate": 0.0002878663540445486, + "loss": 3.412260055541992, + "step": 1637, + "token_acc": 0.26149460866382074 + }, + { + "epoch": 0.9604221635883905, + "grad_norm": 0.9458665141032183, + "learning_rate": 0.000288042203985932, + "loss": 3.4018025398254395, + "step": 1638, + "token_acc": 0.259475120000833 + }, + { + "epoch": 0.9610085019055995, + "grad_norm": 1.0007726259310146, + "learning_rate": 0.00028821805392731533, + "loss": 3.47475528717041, + "step": 1639, + "token_acc": 0.25281027579492144 + }, + { + "epoch": 0.9615948402228086, + "grad_norm": 1.0352169379895728, + "learning_rate": 0.0002883939038686987, + "loss": 3.4306507110595703, + "step": 1640, + "token_acc": 0.2567856643223238 + }, + { + "epoch": 0.9621811785400176, + "grad_norm": 1.0827222011867332, + "learning_rate": 0.00028856975381008203, + "loss": 3.4324893951416016, + "step": 1641, + "token_acc": 0.2558575349720682 + }, + { + "epoch": 0.9627675168572266, + "grad_norm": 1.1095957533359941, + "learning_rate": 0.0002887456037514654, + "loss": 3.485936164855957, + "step": 1642, + "token_acc": 0.24930181553582134 + }, + { + "epoch": 0.9633538551744356, + "grad_norm": 1.1520469133687614, + "learning_rate": 0.00028892145369284873, + "loss": 3.506608009338379, + "step": 1643, + "token_acc": 0.24794244891212808 + }, + { + "epoch": 0.9639401934916447, + "grad_norm": 0.8331536357057102, + "learning_rate": 0.00028909730363423206, + "loss": 3.452690601348877, + "step": 1644, + "token_acc": 0.2532282272813269 + }, + { + "epoch": 0.9645265318088537, + "grad_norm": 1.1806714446698217, + "learning_rate": 0.00028927315357561543, + "loss": 3.4091579914093018, + "step": 1645, + "token_acc": 0.25954991374608205 + }, + { + "epoch": 0.9651128701260627, + "grad_norm": 1.1003504159123314, + "learning_rate": 0.0002894490035169988, + "loss": 3.449368715286255, + "step": 1646, + "token_acc": 0.2527903231542209 + }, + { + "epoch": 0.9656992084432717, + "grad_norm": 0.9521412616048038, + "learning_rate": 0.00028962485345838214, + "loss": 3.372286558151245, + "step": 1647, + "token_acc": 0.2640550807217474 + }, + { + "epoch": 0.9662855467604808, + "grad_norm": 0.9085826341092487, + "learning_rate": 0.0002898007033997655, + "loss": 3.4189882278442383, + "step": 1648, + "token_acc": 0.259935453412688 + }, + { + "epoch": 0.9668718850776898, + "grad_norm": 0.8411084758914664, + "learning_rate": 0.00028997655334114884, + "loss": 3.4541051387786865, + "step": 1649, + "token_acc": 0.2536862395053104 + }, + { + "epoch": 0.9674582233948988, + "grad_norm": 0.9785933359551604, + "learning_rate": 0.0002901524032825322, + "loss": 3.3913700580596924, + "step": 1650, + "token_acc": 0.26226255371691387 + }, + { + "epoch": 0.9680445617121078, + "grad_norm": 1.1455793192318853, + "learning_rate": 0.00029032825322391554, + "loss": 3.462675094604492, + "step": 1651, + "token_acc": 0.2526551920618869 + }, + { + "epoch": 0.968630900029317, + "grad_norm": 0.7993899416606041, + "learning_rate": 0.0002905041031652989, + "loss": 3.3909566402435303, + "step": 1652, + "token_acc": 0.2631858142732802 + }, + { + "epoch": 0.969217238346526, + "grad_norm": 0.875426702268605, + "learning_rate": 0.0002906799531066823, + "loss": 3.4527156352996826, + "step": 1653, + "token_acc": 0.2534200314845774 + }, + { + "epoch": 0.969803576663735, + "grad_norm": 1.024280615064505, + "learning_rate": 0.0002908558030480656, + "loss": 3.4014289379119873, + "step": 1654, + "token_acc": 0.2612895972625327 + }, + { + "epoch": 0.970389914980944, + "grad_norm": 0.9195103456149949, + "learning_rate": 0.000291031652989449, + "loss": 3.4070944786071777, + "step": 1655, + "token_acc": 0.2599360876147802 + }, + { + "epoch": 0.9709762532981531, + "grad_norm": 1.2107313775202413, + "learning_rate": 0.0002912075029308323, + "loss": 3.4211158752441406, + "step": 1656, + "token_acc": 0.25745008537943737 + }, + { + "epoch": 0.9715625916153621, + "grad_norm": 0.9604788197685252, + "learning_rate": 0.0002913833528722157, + "loss": 3.3969945907592773, + "step": 1657, + "token_acc": 0.2608897465317618 + }, + { + "epoch": 0.9721489299325711, + "grad_norm": 0.9546786830863327, + "learning_rate": 0.000291559202813599, + "loss": 3.439542770385742, + "step": 1658, + "token_acc": 0.2548316751753569 + }, + { + "epoch": 0.9727352682497801, + "grad_norm": 1.1081209240364918, + "learning_rate": 0.0002917350527549824, + "loss": 3.4130752086639404, + "step": 1659, + "token_acc": 0.2589106143792215 + }, + { + "epoch": 0.9733216065669892, + "grad_norm": 0.9994860287529672, + "learning_rate": 0.0002919109026963658, + "loss": 3.4299957752227783, + "step": 1660, + "token_acc": 0.25488947319741556 + }, + { + "epoch": 0.9739079448841982, + "grad_norm": 0.8782120536730222, + "learning_rate": 0.0002920867526377491, + "loss": 3.405499219894409, + "step": 1661, + "token_acc": 0.2595201857750302 + }, + { + "epoch": 0.9744942832014072, + "grad_norm": 0.9933984036179534, + "learning_rate": 0.0002922626025791324, + "loss": 3.397000312805176, + "step": 1662, + "token_acc": 0.25998143486553654 + }, + { + "epoch": 0.9750806215186162, + "grad_norm": 1.1258193437404802, + "learning_rate": 0.0002924384525205158, + "loss": 3.431743860244751, + "step": 1663, + "token_acc": 0.25656518648453763 + }, + { + "epoch": 0.9756669598358253, + "grad_norm": 1.1338615165102073, + "learning_rate": 0.00029261430246189913, + "loss": 3.479358434677124, + "step": 1664, + "token_acc": 0.25233220663895994 + }, + { + "epoch": 0.9762532981530343, + "grad_norm": 1.0556484863970759, + "learning_rate": 0.0002927901524032825, + "loss": 3.420722007751465, + "step": 1665, + "token_acc": 0.25751524015376054 + }, + { + "epoch": 0.9768396364702433, + "grad_norm": 0.8963897877390183, + "learning_rate": 0.00029296600234466583, + "loss": 3.443148136138916, + "step": 1666, + "token_acc": 0.25492007068033706 + }, + { + "epoch": 0.9774259747874524, + "grad_norm": 0.9650260165038219, + "learning_rate": 0.0002931418522860492, + "loss": 3.41599702835083, + "step": 1667, + "token_acc": 0.25878221038451804 + }, + { + "epoch": 0.9780123131046614, + "grad_norm": 0.917491397482388, + "learning_rate": 0.0002933177022274326, + "loss": 3.41109037399292, + "step": 1668, + "token_acc": 0.2582422657423119 + }, + { + "epoch": 0.9785986514218704, + "grad_norm": 1.0246960695630682, + "learning_rate": 0.0002934935521688159, + "loss": 3.4457051753997803, + "step": 1669, + "token_acc": 0.2550188870884565 + }, + { + "epoch": 0.9791849897390794, + "grad_norm": 1.1251563619561569, + "learning_rate": 0.0002936694021101993, + "loss": 3.450312614440918, + "step": 1670, + "token_acc": 0.25354109093134003 + }, + { + "epoch": 0.9797713280562885, + "grad_norm": 0.8059031273350623, + "learning_rate": 0.0002938452520515826, + "loss": 3.352806568145752, + "step": 1671, + "token_acc": 0.2665452773984363 + }, + { + "epoch": 0.9803576663734975, + "grad_norm": 0.9693959222114189, + "learning_rate": 0.000294021101992966, + "loss": 3.464014768600464, + "step": 1672, + "token_acc": 0.2524639658217273 + }, + { + "epoch": 0.9809440046907065, + "grad_norm": 1.0537377134456585, + "learning_rate": 0.0002941969519343493, + "loss": 3.4342567920684814, + "step": 1673, + "token_acc": 0.2546251981106909 + }, + { + "epoch": 0.9815303430079155, + "grad_norm": 0.8635266937934581, + "learning_rate": 0.0002943728018757327, + "loss": 3.4285411834716797, + "step": 1674, + "token_acc": 0.2589354350273712 + }, + { + "epoch": 0.9821166813251246, + "grad_norm": 0.9324102609728363, + "learning_rate": 0.00029454865181711607, + "loss": 3.4739909172058105, + "step": 1675, + "token_acc": 0.2521764907791681 + }, + { + "epoch": 0.9827030196423336, + "grad_norm": 0.8812169433582375, + "learning_rate": 0.0002947245017584994, + "loss": 3.38845157623291, + "step": 1676, + "token_acc": 0.2618837295643379 + }, + { + "epoch": 0.9832893579595426, + "grad_norm": 0.9133348923682535, + "learning_rate": 0.00029490035169988277, + "loss": 3.4450371265411377, + "step": 1677, + "token_acc": 0.25328833461679595 + }, + { + "epoch": 0.9838756962767516, + "grad_norm": 0.9242693428256067, + "learning_rate": 0.0002950762016412661, + "loss": 3.441045045852661, + "step": 1678, + "token_acc": 0.25596900947650514 + }, + { + "epoch": 0.9844620345939608, + "grad_norm": 0.8511730515132478, + "learning_rate": 0.00029525205158264947, + "loss": 3.4234378337860107, + "step": 1679, + "token_acc": 0.2579017299923499 + }, + { + "epoch": 0.9850483729111698, + "grad_norm": 0.7143281472029915, + "learning_rate": 0.0002954279015240328, + "loss": 3.4041244983673096, + "step": 1680, + "token_acc": 0.2593249099357803 + }, + { + "epoch": 0.9856347112283788, + "grad_norm": 0.712568877758948, + "learning_rate": 0.0002956037514654161, + "loss": 3.4403133392333984, + "step": 1681, + "token_acc": 0.25675481053611565 + }, + { + "epoch": 0.9862210495455878, + "grad_norm": 0.862445178621824, + "learning_rate": 0.0002957796014067995, + "loss": 3.3996949195861816, + "step": 1682, + "token_acc": 0.2594488686638857 + }, + { + "epoch": 0.9868073878627969, + "grad_norm": 0.9530327210573329, + "learning_rate": 0.0002959554513481828, + "loss": 3.453869104385376, + "step": 1683, + "token_acc": 0.25481199071028715 + }, + { + "epoch": 0.9873937261800059, + "grad_norm": 1.065627650859129, + "learning_rate": 0.0002961313012895662, + "loss": 3.4256539344787598, + "step": 1684, + "token_acc": 0.2574449525113286 + }, + { + "epoch": 0.9879800644972149, + "grad_norm": 1.0511687294908194, + "learning_rate": 0.0002963071512309496, + "loss": 3.4245340824127197, + "step": 1685, + "token_acc": 0.2563528258738974 + }, + { + "epoch": 0.9885664028144239, + "grad_norm": 0.8970258445831942, + "learning_rate": 0.0002964830011723329, + "loss": 3.4051733016967773, + "step": 1686, + "token_acc": 0.2606317553235485 + }, + { + "epoch": 0.989152741131633, + "grad_norm": 1.1716050343225166, + "learning_rate": 0.0002966588511137163, + "loss": 3.3879857063293457, + "step": 1687, + "token_acc": 0.2617099571079236 + }, + { + "epoch": 0.989739079448842, + "grad_norm": 0.9696488751938213, + "learning_rate": 0.0002968347010550996, + "loss": 3.4367923736572266, + "step": 1688, + "token_acc": 0.25598060517837457 + }, + { + "epoch": 0.990325417766051, + "grad_norm": 0.9268558263660351, + "learning_rate": 0.000297010550996483, + "loss": 3.436521053314209, + "step": 1689, + "token_acc": 0.25622815233019824 + }, + { + "epoch": 0.99091175608326, + "grad_norm": 0.7939532103834785, + "learning_rate": 0.0002971864009378663, + "loss": 3.399738073348999, + "step": 1690, + "token_acc": 0.26121320531228526 + }, + { + "epoch": 0.9914980944004691, + "grad_norm": 0.7943310120893513, + "learning_rate": 0.0002973622508792497, + "loss": 3.4149017333984375, + "step": 1691, + "token_acc": 0.25951125415413 + }, + { + "epoch": 0.9920844327176781, + "grad_norm": 0.8748408720329571, + "learning_rate": 0.00029753810082063306, + "loss": 3.45902156829834, + "step": 1692, + "token_acc": 0.2526114892472697 + }, + { + "epoch": 0.9926707710348871, + "grad_norm": 1.1103174216671499, + "learning_rate": 0.0002977139507620164, + "loss": 3.4256043434143066, + "step": 1693, + "token_acc": 0.2564982095258609 + }, + { + "epoch": 0.9932571093520962, + "grad_norm": 1.2286517604869127, + "learning_rate": 0.00029788980070339976, + "loss": 3.4395804405212402, + "step": 1694, + "token_acc": 0.25493543139971436 + }, + { + "epoch": 0.9938434476693052, + "grad_norm": 1.0210072953526748, + "learning_rate": 0.0002980656506447831, + "loss": 3.4143528938293457, + "step": 1695, + "token_acc": 0.2597158684178792 + }, + { + "epoch": 0.9944297859865142, + "grad_norm": 0.9802514881528891, + "learning_rate": 0.00029824150058616646, + "loss": 3.375446319580078, + "step": 1696, + "token_acc": 0.2642880572615679 + }, + { + "epoch": 0.9950161243037232, + "grad_norm": 0.9943322618687697, + "learning_rate": 0.0002984173505275498, + "loss": 3.421415328979492, + "step": 1697, + "token_acc": 0.25697515562022133 + }, + { + "epoch": 0.9956024626209323, + "grad_norm": 1.0951892710373659, + "learning_rate": 0.0002985932004689331, + "loss": 3.4430785179138184, + "step": 1698, + "token_acc": 0.2549121346846752 + }, + { + "epoch": 0.9961888009381413, + "grad_norm": 0.94806515266285, + "learning_rate": 0.0002987690504103165, + "loss": 3.436896800994873, + "step": 1699, + "token_acc": 0.2558370881853824 + }, + { + "epoch": 0.9967751392553503, + "grad_norm": 0.9653025378040253, + "learning_rate": 0.00029894490035169987, + "loss": 3.3707313537597656, + "step": 1700, + "token_acc": 0.2640488333377375 + }, + { + "epoch": 0.9973614775725593, + "grad_norm": 1.155369457124877, + "learning_rate": 0.0002991207502930832, + "loss": 3.3923587799072266, + "step": 1701, + "token_acc": 0.26086674199036375 + }, + { + "epoch": 0.9979478158897684, + "grad_norm": 1.2166474726195073, + "learning_rate": 0.00029929660023446657, + "loss": 3.393880844116211, + "step": 1702, + "token_acc": 0.2593271936556664 + }, + { + "epoch": 0.9985341542069774, + "grad_norm": 0.7563388412989023, + "learning_rate": 0.0002994724501758499, + "loss": 3.429440975189209, + "step": 1703, + "token_acc": 0.2594440978720455 + }, + { + "epoch": 0.9991204925241864, + "grad_norm": 0.756823285765525, + "learning_rate": 0.00029964830011723327, + "loss": 3.382394313812256, + "step": 1704, + "token_acc": 0.26312419476509824 + }, + { + "epoch": 0.9997068308413954, + "grad_norm": 0.8732197525299397, + "learning_rate": 0.0002998241500586166, + "loss": 3.435934066772461, + "step": 1705, + "token_acc": 0.2552384780278671 + }, + { + "epoch": 1.0, + "grad_norm": 0.9633250349032103, + "learning_rate": 0.0003, + "loss": 3.3410556316375732, + "step": 1706, + "token_acc": 0.27049430741970304 + }, + { + "epoch": 1.0, + "eval_loss": 3.3927879333496094, + "eval_runtime": 8.222, + "eval_samples_per_second": 31.136, + "eval_steps_per_second": 3.892, + "eval_token_acc": 0.26011028975955314, + "step": 1706 + }, + { + "epoch": 1.0005863383172091, + "grad_norm": 1.0278859598089938, + "learning_rate": 0.000299999999295476, + "loss": 3.4180030822753906, + "step": 1707, + "token_acc": 0.25876622434251667 + }, + { + "epoch": 1.001172676634418, + "grad_norm": 1.0864264429910573, + "learning_rate": 0.00029999999718190423, + "loss": 3.3736886978149414, + "step": 1708, + "token_acc": 0.2637199628222097 + }, + { + "epoch": 1.0017590149516271, + "grad_norm": 0.9769255217125332, + "learning_rate": 0.0002999999936592846, + "loss": 3.409825325012207, + "step": 1709, + "token_acc": 0.25942727451812764 + }, + { + "epoch": 1.0023453532688362, + "grad_norm": 0.8333681661466221, + "learning_rate": 0.00029999998872761715, + "loss": 3.37786602973938, + "step": 1710, + "token_acc": 0.2628836363454688 + }, + { + "epoch": 1.0029316915860451, + "grad_norm": 0.8084953482684752, + "learning_rate": 0.0002999999823869019, + "loss": 3.392746925354004, + "step": 1711, + "token_acc": 0.26052477321027856 + }, + { + "epoch": 1.0035180299032542, + "grad_norm": 0.9261746325531222, + "learning_rate": 0.00029999997463713897, + "loss": 3.4910969734191895, + "step": 1712, + "token_acc": 0.25134285714285715 + }, + { + "epoch": 1.0041043682204631, + "grad_norm": 1.1063628391454878, + "learning_rate": 0.0002999999654783284, + "loss": 3.391463279724121, + "step": 1713, + "token_acc": 0.2600584345177497 + }, + { + "epoch": 1.0046907065376722, + "grad_norm": 0.9810377414032089, + "learning_rate": 0.00029999995491047026, + "loss": 3.3883185386657715, + "step": 1714, + "token_acc": 0.25857074704600147 + }, + { + "epoch": 1.0052770448548813, + "grad_norm": 0.8932328364487947, + "learning_rate": 0.00029999994293356474, + "loss": 3.3523049354553223, + "step": 1715, + "token_acc": 0.2667162173751465 + }, + { + "epoch": 1.0058633831720902, + "grad_norm": 0.739044666395893, + "learning_rate": 0.0002999999295476118, + "loss": 3.3763883113861084, + "step": 1716, + "token_acc": 0.2626059459045688 + }, + { + "epoch": 1.0064497214892993, + "grad_norm": 0.8456230589026097, + "learning_rate": 0.00029999991475261167, + "loss": 3.399453639984131, + "step": 1717, + "token_acc": 0.2597213931399532 + }, + { + "epoch": 1.0070360598065085, + "grad_norm": 0.8751833313184247, + "learning_rate": 0.0002999998985485645, + "loss": 3.3983278274536133, + "step": 1718, + "token_acc": 0.2583237978996618 + }, + { + "epoch": 1.0076223981237173, + "grad_norm": 0.9920140389801315, + "learning_rate": 0.0002999998809354704, + "loss": 3.4007740020751953, + "step": 1719, + "token_acc": 0.25923446534313105 + }, + { + "epoch": 1.0082087364409265, + "grad_norm": 1.2084044953976938, + "learning_rate": 0.00029999986191332957, + "loss": 3.424487590789795, + "step": 1720, + "token_acc": 0.25755801703237646 + }, + { + "epoch": 1.0087950747581353, + "grad_norm": 0.6820958420544017, + "learning_rate": 0.0002999998414821421, + "loss": 3.4095396995544434, + "step": 1721, + "token_acc": 0.26009093340465367 + }, + { + "epoch": 1.0093814130753445, + "grad_norm": 0.7412080973084054, + "learning_rate": 0.0002999998196419083, + "loss": 3.4043984413146973, + "step": 1722, + "token_acc": 0.2596113732532137 + }, + { + "epoch": 1.0099677513925536, + "grad_norm": 1.0373002169063243, + "learning_rate": 0.0002999997963926283, + "loss": 3.354407787322998, + "step": 1723, + "token_acc": 0.26333875781941 + }, + { + "epoch": 1.0105540897097625, + "grad_norm": 0.9080489688837124, + "learning_rate": 0.00029999977173430234, + "loss": 3.428590774536133, + "step": 1724, + "token_acc": 0.2568598981812835 + }, + { + "epoch": 1.0111404280269716, + "grad_norm": 0.9875429010714675, + "learning_rate": 0.00029999974566693067, + "loss": 3.4000887870788574, + "step": 1725, + "token_acc": 0.2605814519190159 + }, + { + "epoch": 1.0117267663441807, + "grad_norm": 1.0907716369815048, + "learning_rate": 0.0002999997181905135, + "loss": 3.415714740753174, + "step": 1726, + "token_acc": 0.25532852890809754 + }, + { + "epoch": 1.0123131046613896, + "grad_norm": 0.965178027918728, + "learning_rate": 0.0002999996893050511, + "loss": 3.4131994247436523, + "step": 1727, + "token_acc": 0.2566456520606471 + }, + { + "epoch": 1.0128994429785987, + "grad_norm": 0.9547230227619361, + "learning_rate": 0.0002999996590105438, + "loss": 3.4405550956726074, + "step": 1728, + "token_acc": 0.25368073740722774 + }, + { + "epoch": 1.0134857812958076, + "grad_norm": 0.8847965964508416, + "learning_rate": 0.00029999962730699174, + "loss": 3.389859676361084, + "step": 1729, + "token_acc": 0.2604225765926406 + }, + { + "epoch": 1.0140721196130167, + "grad_norm": 0.8294793922589502, + "learning_rate": 0.00029999959419439536, + "loss": 3.4114487171173096, + "step": 1730, + "token_acc": 0.25742371223732746 + }, + { + "epoch": 1.0146584579302258, + "grad_norm": 1.0027198034549007, + "learning_rate": 0.00029999955967275493, + "loss": 3.35640025138855, + "step": 1731, + "token_acc": 0.26197248549718116 + }, + { + "epoch": 1.0152447962474347, + "grad_norm": 0.9592782797974249, + "learning_rate": 0.0002999995237420707, + "loss": 3.3589940071105957, + "step": 1732, + "token_acc": 0.2638201270316613 + }, + { + "epoch": 1.0158311345646438, + "grad_norm": 0.943559877890224, + "learning_rate": 0.00029999948640234317, + "loss": 3.314626455307007, + "step": 1733, + "token_acc": 0.2709107539526983 + }, + { + "epoch": 1.016417472881853, + "grad_norm": 0.8642867669201103, + "learning_rate": 0.00029999944765357253, + "loss": 3.400038242340088, + "step": 1734, + "token_acc": 0.25767984943731037 + }, + { + "epoch": 1.0170038111990618, + "grad_norm": 0.8459321748467306, + "learning_rate": 0.0002999994074957592, + "loss": 3.4074056148529053, + "step": 1735, + "token_acc": 0.2597568777991043 + }, + { + "epoch": 1.017590149516271, + "grad_norm": 0.894745378019444, + "learning_rate": 0.00029999936592890356, + "loss": 3.372298240661621, + "step": 1736, + "token_acc": 0.2629354410082604 + }, + { + "epoch": 1.01817648783348, + "grad_norm": 0.9224572091580456, + "learning_rate": 0.00029999932295300604, + "loss": 3.4247336387634277, + "step": 1737, + "token_acc": 0.25723349397265083 + }, + { + "epoch": 1.018762826150689, + "grad_norm": 0.7911471956349633, + "learning_rate": 0.00029999927856806695, + "loss": 3.3763225078582764, + "step": 1738, + "token_acc": 0.2620417021538588 + }, + { + "epoch": 1.019349164467898, + "grad_norm": 0.7174391704005636, + "learning_rate": 0.00029999923277408686, + "loss": 3.3414993286132812, + "step": 1739, + "token_acc": 0.2666880440478332 + }, + { + "epoch": 1.019935502785107, + "grad_norm": 0.8386471678889612, + "learning_rate": 0.00029999918557106607, + "loss": 3.394024133682251, + "step": 1740, + "token_acc": 0.2607623532406376 + }, + { + "epoch": 1.020521841102316, + "grad_norm": 1.0113645296909612, + "learning_rate": 0.000299999136959005, + "loss": 3.3740687370300293, + "step": 1741, + "token_acc": 0.26362324544142723 + }, + { + "epoch": 1.0211081794195251, + "grad_norm": 1.128621336111224, + "learning_rate": 0.0002999990869379042, + "loss": 3.3917953968048096, + "step": 1742, + "token_acc": 0.26263016121899246 + }, + { + "epoch": 1.021694517736734, + "grad_norm": 0.710250082745488, + "learning_rate": 0.0002999990355077641, + "loss": 3.450207233428955, + "step": 1743, + "token_acc": 0.2519529839669444 + }, + { + "epoch": 1.0222808560539431, + "grad_norm": 0.8119155944613001, + "learning_rate": 0.00029999898266858517, + "loss": 3.3751885890960693, + "step": 1744, + "token_acc": 0.2633274532274056 + }, + { + "epoch": 1.0228671943711523, + "grad_norm": 0.8324607460854886, + "learning_rate": 0.00029999892842036803, + "loss": 3.4380359649658203, + "step": 1745, + "token_acc": 0.2558347268818098 + }, + { + "epoch": 1.0234535326883611, + "grad_norm": 0.7664113526347227, + "learning_rate": 0.00029999887276311297, + "loss": 3.3703644275665283, + "step": 1746, + "token_acc": 0.2644394860979511 + }, + { + "epoch": 1.0240398710055703, + "grad_norm": 0.8877303154156282, + "learning_rate": 0.00029999881569682063, + "loss": 3.363328695297241, + "step": 1747, + "token_acc": 0.26397908341443105 + }, + { + "epoch": 1.0246262093227791, + "grad_norm": 0.7224506263196261, + "learning_rate": 0.0002999987572214916, + "loss": 3.36671781539917, + "step": 1748, + "token_acc": 0.26416642265677964 + }, + { + "epoch": 1.0252125476399883, + "grad_norm": 0.6699390918986655, + "learning_rate": 0.00029999869733712635, + "loss": 3.3922994136810303, + "step": 1749, + "token_acc": 0.2604790495493773 + }, + { + "epoch": 1.0257988859571974, + "grad_norm": 0.8220225165178263, + "learning_rate": 0.00029999863604372544, + "loss": 3.3872084617614746, + "step": 1750, + "token_acc": 0.2595554774573166 + }, + { + "epoch": 1.0263852242744063, + "grad_norm": 0.8485810972963475, + "learning_rate": 0.0002999985733412895, + "loss": 3.3624041080474854, + "step": 1751, + "token_acc": 0.2649622390525435 + }, + { + "epoch": 1.0269715625916154, + "grad_norm": 0.9047595067154075, + "learning_rate": 0.00029999850922981906, + "loss": 3.362987756729126, + "step": 1752, + "token_acc": 0.26315309662853864 + }, + { + "epoch": 1.0275579009088245, + "grad_norm": 1.1178257155612394, + "learning_rate": 0.0002999984437093148, + "loss": 3.3803837299346924, + "step": 1753, + "token_acc": 0.26111043660492295 + }, + { + "epoch": 1.0281442392260334, + "grad_norm": 0.9210282713304254, + "learning_rate": 0.00029999837677977724, + "loss": 3.372103214263916, + "step": 1754, + "token_acc": 0.2640985176492804 + }, + { + "epoch": 1.0287305775432425, + "grad_norm": 0.7476643013868809, + "learning_rate": 0.0002999983084412071, + "loss": 3.4022512435913086, + "step": 1755, + "token_acc": 0.2576047129090288 + }, + { + "epoch": 1.0293169158604514, + "grad_norm": 0.81344564858752, + "learning_rate": 0.0002999982386936049, + "loss": 3.372739791870117, + "step": 1756, + "token_acc": 0.2634982415879752 + }, + { + "epoch": 1.0299032541776605, + "grad_norm": 0.7536285278639427, + "learning_rate": 0.00029999816753697143, + "loss": 3.3786659240722656, + "step": 1757, + "token_acc": 0.26132991417888957 + }, + { + "epoch": 1.0304895924948696, + "grad_norm": 0.7842789862506705, + "learning_rate": 0.0002999980949713073, + "loss": 3.348092555999756, + "step": 1758, + "token_acc": 0.2655454906348118 + }, + { + "epoch": 1.0310759308120785, + "grad_norm": 0.8396611512827058, + "learning_rate": 0.0002999980209966132, + "loss": 3.3995118141174316, + "step": 1759, + "token_acc": 0.2603121619861858 + }, + { + "epoch": 1.0316622691292876, + "grad_norm": 1.1018203486439937, + "learning_rate": 0.0002999979456128898, + "loss": 3.3909754753112793, + "step": 1760, + "token_acc": 0.2593186258946939 + }, + { + "epoch": 1.0322486074464967, + "grad_norm": 1.0576369226958502, + "learning_rate": 0.0002999978688201378, + "loss": 3.3943753242492676, + "step": 1761, + "token_acc": 0.260843463121654 + }, + { + "epoch": 1.0328349457637056, + "grad_norm": 1.019445181119772, + "learning_rate": 0.000299997790618358, + "loss": 3.3849689960479736, + "step": 1762, + "token_acc": 0.261232816955356 + }, + { + "epoch": 1.0334212840809147, + "grad_norm": 0.8755984761795024, + "learning_rate": 0.00029999771100755105, + "loss": 3.3875365257263184, + "step": 1763, + "token_acc": 0.25996019017955674 + }, + { + "epoch": 1.0340076223981236, + "grad_norm": 0.8994273455788394, + "learning_rate": 0.00029999762998771774, + "loss": 3.393388271331787, + "step": 1764, + "token_acc": 0.2599351092190374 + }, + { + "epoch": 1.0345939607153327, + "grad_norm": 0.9717786494864452, + "learning_rate": 0.0002999975475588588, + "loss": 3.350372314453125, + "step": 1765, + "token_acc": 0.2650019770248897 + }, + { + "epoch": 1.0351802990325418, + "grad_norm": 1.0468605375312847, + "learning_rate": 0.00029999746372097507, + "loss": 3.3480684757232666, + "step": 1766, + "token_acc": 0.26507298053939743 + }, + { + "epoch": 1.0357666373497507, + "grad_norm": 0.8851988192954556, + "learning_rate": 0.0002999973784740672, + "loss": 3.3836312294006348, + "step": 1767, + "token_acc": 0.26240938520281115 + }, + { + "epoch": 1.0363529756669598, + "grad_norm": 0.9331771450498074, + "learning_rate": 0.00029999729181813615, + "loss": 3.4141969680786133, + "step": 1768, + "token_acc": 0.2595553457020279 + }, + { + "epoch": 1.036939313984169, + "grad_norm": 0.77731891668261, + "learning_rate": 0.00029999720375318266, + "loss": 3.3435940742492676, + "step": 1769, + "token_acc": 0.26451718240192285 + }, + { + "epoch": 1.0375256523013778, + "grad_norm": 0.8986994009828604, + "learning_rate": 0.0002999971142792076, + "loss": 3.3709797859191895, + "step": 1770, + "token_acc": 0.262259951557326 + }, + { + "epoch": 1.038111990618587, + "grad_norm": 0.8776321376775235, + "learning_rate": 0.0002999970233962117, + "loss": 3.3955564498901367, + "step": 1771, + "token_acc": 0.26160457333350695 + }, + { + "epoch": 1.038698328935796, + "grad_norm": 0.8175033022470682, + "learning_rate": 0.00029999693110419593, + "loss": 3.419679641723633, + "step": 1772, + "token_acc": 0.25593657649161256 + }, + { + "epoch": 1.039284667253005, + "grad_norm": 0.8277918762478872, + "learning_rate": 0.00029999683740316116, + "loss": 3.4046270847320557, + "step": 1773, + "token_acc": 0.25829507278095404 + }, + { + "epoch": 1.039871005570214, + "grad_norm": 0.7603283305041085, + "learning_rate": 0.0002999967422931082, + "loss": 3.3523402214050293, + "step": 1774, + "token_acc": 0.2653741866942922 + }, + { + "epoch": 1.040457343887423, + "grad_norm": 0.7606691782279854, + "learning_rate": 0.00029999664577403794, + "loss": 3.380520820617676, + "step": 1775, + "token_acc": 0.2620260087961379 + }, + { + "epoch": 1.041043682204632, + "grad_norm": 0.7226846040608634, + "learning_rate": 0.00029999654784595135, + "loss": 3.411355495452881, + "step": 1776, + "token_acc": 0.2576562091014471 + }, + { + "epoch": 1.0416300205218412, + "grad_norm": 0.689014870912869, + "learning_rate": 0.0002999964485088493, + "loss": 3.382324695587158, + "step": 1777, + "token_acc": 0.25940353790152626 + }, + { + "epoch": 1.04221635883905, + "grad_norm": 0.7761575920904261, + "learning_rate": 0.0002999963477627327, + "loss": 3.3927111625671387, + "step": 1778, + "token_acc": 0.26143789183304567 + }, + { + "epoch": 1.0428026971562592, + "grad_norm": 0.7044803022677012, + "learning_rate": 0.0002999962456076026, + "loss": 3.3545103073120117, + "step": 1779, + "token_acc": 0.263890076684056 + }, + { + "epoch": 1.0433890354734683, + "grad_norm": 0.7699425146010567, + "learning_rate": 0.00029999614204345986, + "loss": 3.3955256938934326, + "step": 1780, + "token_acc": 0.2587263670070707 + }, + { + "epoch": 1.0439753737906772, + "grad_norm": 0.9232243393083946, + "learning_rate": 0.00029999603707030545, + "loss": 3.3742294311523438, + "step": 1781, + "token_acc": 0.2616142697768505 + }, + { + "epoch": 1.0445617121078863, + "grad_norm": 0.9426066110575911, + "learning_rate": 0.00029999593068814044, + "loss": 3.305436611175537, + "step": 1782, + "token_acc": 0.26920904756133096 + }, + { + "epoch": 1.0451480504250952, + "grad_norm": 0.8670073963620424, + "learning_rate": 0.0002999958228969658, + "loss": 3.376132011413574, + "step": 1783, + "token_acc": 0.2624668927582907 + }, + { + "epoch": 1.0457343887423043, + "grad_norm": 1.0426661507890371, + "learning_rate": 0.0002999957136967825, + "loss": 3.334969997406006, + "step": 1784, + "token_acc": 0.2681092555750532 + }, + { + "epoch": 1.0463207270595134, + "grad_norm": 0.9671634793867665, + "learning_rate": 0.0002999956030875916, + "loss": 3.3868446350097656, + "step": 1785, + "token_acc": 0.26027436522101643 + }, + { + "epoch": 1.0469070653767223, + "grad_norm": 0.8985632541483063, + "learning_rate": 0.00029999549106939414, + "loss": 3.3959574699401855, + "step": 1786, + "token_acc": 0.2607896363587497 + }, + { + "epoch": 1.0474934036939314, + "grad_norm": 0.8612457539310248, + "learning_rate": 0.0002999953776421911, + "loss": 3.416934013366699, + "step": 1787, + "token_acc": 0.25827750621775314 + }, + { + "epoch": 1.0480797420111405, + "grad_norm": 0.7170934421803308, + "learning_rate": 0.00029999526280598374, + "loss": 3.3741862773895264, + "step": 1788, + "token_acc": 0.2614633512952022 + }, + { + "epoch": 1.0486660803283494, + "grad_norm": 0.8041434643072711, + "learning_rate": 0.00029999514656077285, + "loss": 3.404782772064209, + "step": 1789, + "token_acc": 0.2589871424822881 + }, + { + "epoch": 1.0492524186455585, + "grad_norm": 0.7943564409743228, + "learning_rate": 0.00029999502890655977, + "loss": 3.348447322845459, + "step": 1790, + "token_acc": 0.26582131135510745 + }, + { + "epoch": 1.0498387569627674, + "grad_norm": 0.6660538951113505, + "learning_rate": 0.0002999949098433455, + "loss": 3.4021592140197754, + "step": 1791, + "token_acc": 0.25776594328318464 + }, + { + "epoch": 1.0504250952799765, + "grad_norm": 0.7349703975298816, + "learning_rate": 0.0002999947893711312, + "loss": 3.380229949951172, + "step": 1792, + "token_acc": 0.26187537359199486 + }, + { + "epoch": 1.0510114335971856, + "grad_norm": 0.7399682433070527, + "learning_rate": 0.0002999946674899179, + "loss": 3.330737829208374, + "step": 1793, + "token_acc": 0.2670677534825549 + }, + { + "epoch": 1.0515977719143945, + "grad_norm": 0.7233746919487639, + "learning_rate": 0.0002999945441997069, + "loss": 3.3915553092956543, + "step": 1794, + "token_acc": 0.2597853714336429 + }, + { + "epoch": 1.0521841102316036, + "grad_norm": 0.6939501488533247, + "learning_rate": 0.0002999944195004992, + "loss": 3.3740787506103516, + "step": 1795, + "token_acc": 0.2618271831453743 + }, + { + "epoch": 1.0527704485488127, + "grad_norm": 0.6525540005774914, + "learning_rate": 0.0002999942933922961, + "loss": 3.3524160385131836, + "step": 1796, + "token_acc": 0.264246341495198 + }, + { + "epoch": 1.0533567868660216, + "grad_norm": 0.7278215743329536, + "learning_rate": 0.00029999416587509875, + "loss": 3.349514961242676, + "step": 1797, + "token_acc": 0.26369993675947173 + }, + { + "epoch": 1.0539431251832307, + "grad_norm": 0.74157067501199, + "learning_rate": 0.0002999940369489083, + "loss": 3.3952043056488037, + "step": 1798, + "token_acc": 0.2597982444648238 + }, + { + "epoch": 1.0545294635004399, + "grad_norm": 0.6652350999524858, + "learning_rate": 0.000299993906613726, + "loss": 3.3528342247009277, + "step": 1799, + "token_acc": 0.2653861775033546 + }, + { + "epoch": 1.0551158018176487, + "grad_norm": 0.766098439126724, + "learning_rate": 0.00029999377486955304, + "loss": 3.345491647720337, + "step": 1800, + "token_acc": 0.2650788052054133 + }, + { + "epoch": 1.0557021401348579, + "grad_norm": 0.9866872815918452, + "learning_rate": 0.00029999364171639077, + "loss": 3.407106876373291, + "step": 1801, + "token_acc": 0.2580140856371538 + }, + { + "epoch": 1.0562884784520667, + "grad_norm": 1.1768840730452808, + "learning_rate": 0.0002999935071542403, + "loss": 3.4136111736297607, + "step": 1802, + "token_acc": 0.25747453552966904 + }, + { + "epoch": 1.0568748167692759, + "grad_norm": 0.7892104606692729, + "learning_rate": 0.0002999933711831029, + "loss": 3.4035239219665527, + "step": 1803, + "token_acc": 0.2581690174726764 + }, + { + "epoch": 1.057461155086485, + "grad_norm": 0.7375141856192511, + "learning_rate": 0.00029999323380298, + "loss": 3.389493227005005, + "step": 1804, + "token_acc": 0.26071410652181376 + }, + { + "epoch": 1.0580474934036939, + "grad_norm": 0.9593468572012652, + "learning_rate": 0.0002999930950138727, + "loss": 3.3721256256103516, + "step": 1805, + "token_acc": 0.2624717719774853 + }, + { + "epoch": 1.058633831720903, + "grad_norm": 1.0766239421207, + "learning_rate": 0.0002999929548157824, + "loss": 3.3696908950805664, + "step": 1806, + "token_acc": 0.26196856726142603 + }, + { + "epoch": 1.059220170038112, + "grad_norm": 0.9456005406147554, + "learning_rate": 0.00029999281320871045, + "loss": 3.359165668487549, + "step": 1807, + "token_acc": 0.2623298447226212 + }, + { + "epoch": 1.059806508355321, + "grad_norm": 0.8439669142285969, + "learning_rate": 0.0002999926701926581, + "loss": 3.3571033477783203, + "step": 1808, + "token_acc": 0.26622653285334696 + }, + { + "epoch": 1.06039284667253, + "grad_norm": 0.8205434561019966, + "learning_rate": 0.0002999925257676267, + "loss": 3.384303092956543, + "step": 1809, + "token_acc": 0.25844957316782885 + }, + { + "epoch": 1.060979184989739, + "grad_norm": 0.8971858413606124, + "learning_rate": 0.00029999237993361767, + "loss": 3.3800411224365234, + "step": 1810, + "token_acc": 0.260637640565144 + }, + { + "epoch": 1.061565523306948, + "grad_norm": 0.7823903778009983, + "learning_rate": 0.0002999922326906324, + "loss": 3.379568099975586, + "step": 1811, + "token_acc": 0.2601063569776506 + }, + { + "epoch": 1.0621518616241572, + "grad_norm": 0.9159004083745563, + "learning_rate": 0.0002999920840386722, + "loss": 3.3650574684143066, + "step": 1812, + "token_acc": 0.2624144561590261 + }, + { + "epoch": 1.062738199941366, + "grad_norm": 1.030104968460944, + "learning_rate": 0.00029999193397773846, + "loss": 3.3583874702453613, + "step": 1813, + "token_acc": 0.2638046572030603 + }, + { + "epoch": 1.0633245382585752, + "grad_norm": 0.9624137254531098, + "learning_rate": 0.0002999917825078326, + "loss": 3.400360107421875, + "step": 1814, + "token_acc": 0.26012860373840685 + }, + { + "epoch": 1.0639108765757843, + "grad_norm": 0.7893157710454682, + "learning_rate": 0.00029999162962895606, + "loss": 3.3766489028930664, + "step": 1815, + "token_acc": 0.2627018532277808 + }, + { + "epoch": 1.0644972148929932, + "grad_norm": 0.6949594233793593, + "learning_rate": 0.0002999914753411103, + "loss": 3.3800716400146484, + "step": 1816, + "token_acc": 0.26135095090219 + }, + { + "epoch": 1.0650835532102023, + "grad_norm": 0.745899088250051, + "learning_rate": 0.00029999131964429676, + "loss": 3.3419995307922363, + "step": 1817, + "token_acc": 0.26539760067210405 + }, + { + "epoch": 1.0656698915274112, + "grad_norm": 0.9529758359560807, + "learning_rate": 0.0002999911625385169, + "loss": 3.3772125244140625, + "step": 1818, + "token_acc": 0.2623152772573517 + }, + { + "epoch": 1.0662562298446203, + "grad_norm": 0.857926572563159, + "learning_rate": 0.00029999100402377214, + "loss": 3.392021656036377, + "step": 1819, + "token_acc": 0.25674741018237357 + }, + { + "epoch": 1.0668425681618294, + "grad_norm": 0.9550570270986178, + "learning_rate": 0.000299990844100064, + "loss": 3.392603874206543, + "step": 1820, + "token_acc": 0.2594738689607106 + }, + { + "epoch": 1.0674289064790383, + "grad_norm": 0.9535639342697362, + "learning_rate": 0.0002999906827673941, + "loss": 3.389920711517334, + "step": 1821, + "token_acc": 0.2597747740643187 + }, + { + "epoch": 1.0680152447962474, + "grad_norm": 1.0020147431310158, + "learning_rate": 0.00029999052002576375, + "loss": 3.3925065994262695, + "step": 1822, + "token_acc": 0.2601016069554256 + }, + { + "epoch": 1.0686015831134565, + "grad_norm": 0.8487964557424253, + "learning_rate": 0.0002999903558751746, + "loss": 3.354167938232422, + "step": 1823, + "token_acc": 0.26329381444657307 + }, + { + "epoch": 1.0691879214306654, + "grad_norm": 0.7935267537680214, + "learning_rate": 0.00029999019031562814, + "loss": 3.3833789825439453, + "step": 1824, + "token_acc": 0.2610409315544568 + }, + { + "epoch": 1.0697742597478745, + "grad_norm": 0.8960275105104439, + "learning_rate": 0.000299990023347126, + "loss": 3.3536205291748047, + "step": 1825, + "token_acc": 0.26386308371728845 + }, + { + "epoch": 1.0703605980650837, + "grad_norm": 0.9312218639291532, + "learning_rate": 0.0002999898549696697, + "loss": 3.3346824645996094, + "step": 1826, + "token_acc": 0.2663930145599632 + }, + { + "epoch": 1.0709469363822925, + "grad_norm": 0.8228525371779517, + "learning_rate": 0.00029998968518326084, + "loss": 3.3215909004211426, + "step": 1827, + "token_acc": 0.2687290038815346 + }, + { + "epoch": 1.0715332746995017, + "grad_norm": 0.7293975570159389, + "learning_rate": 0.000299989513987901, + "loss": 3.334415912628174, + "step": 1828, + "token_acc": 0.2639500933935404 + }, + { + "epoch": 1.0721196130167105, + "grad_norm": 0.845495366430657, + "learning_rate": 0.00029998934138359177, + "loss": 3.384962797164917, + "step": 1829, + "token_acc": 0.25753564664448714 + }, + { + "epoch": 1.0727059513339197, + "grad_norm": 0.9174501715512632, + "learning_rate": 0.0002999891673703348, + "loss": 3.4210264682769775, + "step": 1830, + "token_acc": 0.2531643301602578 + }, + { + "epoch": 1.0732922896511288, + "grad_norm": 0.851762399027193, + "learning_rate": 0.00029998899194813165, + "loss": 3.3407535552978516, + "step": 1831, + "token_acc": 0.2675511755328695 + }, + { + "epoch": 1.0738786279683377, + "grad_norm": 0.7976284371502028, + "learning_rate": 0.0002999888151169841, + "loss": 3.3013997077941895, + "step": 1832, + "token_acc": 0.27040328165224425 + }, + { + "epoch": 1.0744649662855468, + "grad_norm": 0.9812288980132592, + "learning_rate": 0.0002999886368768938, + "loss": 3.3562488555908203, + "step": 1833, + "token_acc": 0.2630545586935039 + }, + { + "epoch": 1.0750513046027559, + "grad_norm": 0.9791616359697202, + "learning_rate": 0.0002999884572278623, + "loss": 3.3331212997436523, + "step": 1834, + "token_acc": 0.2662169065005254 + }, + { + "epoch": 1.0756376429199648, + "grad_norm": 0.7719278901610888, + "learning_rate": 0.0002999882761698913, + "loss": 3.3892135620117188, + "step": 1835, + "token_acc": 0.2607259208156069 + }, + { + "epoch": 1.0762239812371739, + "grad_norm": 0.7422384611730193, + "learning_rate": 0.00029998809370298266, + "loss": 3.343909740447998, + "step": 1836, + "token_acc": 0.2646412836875938 + }, + { + "epoch": 1.0768103195543828, + "grad_norm": 0.8532424680345285, + "learning_rate": 0.00029998790982713793, + "loss": 3.401294708251953, + "step": 1837, + "token_acc": 0.2601695338928915 + }, + { + "epoch": 1.077396657871592, + "grad_norm": 0.7758794825761017, + "learning_rate": 0.00029998772454235893, + "loss": 3.353382110595703, + "step": 1838, + "token_acc": 0.2618015035145331 + }, + { + "epoch": 1.077982996188801, + "grad_norm": 0.6932813798239679, + "learning_rate": 0.0002999875378486474, + "loss": 3.373500347137451, + "step": 1839, + "token_acc": 0.2617260594030746 + }, + { + "epoch": 1.07856933450601, + "grad_norm": 0.7131069637005163, + "learning_rate": 0.000299987349746005, + "loss": 3.385409116744995, + "step": 1840, + "token_acc": 0.25893486904826496 + }, + { + "epoch": 1.079155672823219, + "grad_norm": 0.6980254047974559, + "learning_rate": 0.00029998716023443356, + "loss": 3.332803726196289, + "step": 1841, + "token_acc": 0.26615579121191013 + }, + { + "epoch": 1.0797420111404281, + "grad_norm": 0.6515846096456306, + "learning_rate": 0.00029998696931393486, + "loss": 3.340400218963623, + "step": 1842, + "token_acc": 0.26582544097814875 + }, + { + "epoch": 1.080328349457637, + "grad_norm": 0.7106931322015347, + "learning_rate": 0.00029998677698451077, + "loss": 3.389979124069214, + "step": 1843, + "token_acc": 0.2595262230234107 + }, + { + "epoch": 1.0809146877748461, + "grad_norm": 0.6975720815938058, + "learning_rate": 0.0002999865832461629, + "loss": 3.358290910720825, + "step": 1844, + "token_acc": 0.26243774346509835 + }, + { + "epoch": 1.081501026092055, + "grad_norm": 0.6874789787841085, + "learning_rate": 0.00029998638809889327, + "loss": 3.3440351486206055, + "step": 1845, + "token_acc": 0.26515496479568335 + }, + { + "epoch": 1.0820873644092641, + "grad_norm": 0.7484763116980394, + "learning_rate": 0.00029998619154270365, + "loss": 3.3031270503997803, + "step": 1846, + "token_acc": 0.27085472806441013 + }, + { + "epoch": 1.0826737027264732, + "grad_norm": 0.6716394854257814, + "learning_rate": 0.0002999859935775958, + "loss": 3.3274431228637695, + "step": 1847, + "token_acc": 0.2670102902325925 + }, + { + "epoch": 1.0832600410436821, + "grad_norm": 0.7246323707521469, + "learning_rate": 0.00029998579420357173, + "loss": 3.3061628341674805, + "step": 1848, + "token_acc": 0.2686076228303312 + }, + { + "epoch": 1.0838463793608912, + "grad_norm": 0.8409240959189997, + "learning_rate": 0.0002999855934206331, + "loss": 3.273674964904785, + "step": 1849, + "token_acc": 0.27425059355728765 + }, + { + "epoch": 1.0844327176781003, + "grad_norm": 0.7850085431693875, + "learning_rate": 0.00029998539122878207, + "loss": 3.364394187927246, + "step": 1850, + "token_acc": 0.26277502407017794 + }, + { + "epoch": 1.0850190559953092, + "grad_norm": 0.8227303629573909, + "learning_rate": 0.00029998518762802033, + "loss": 3.3496623039245605, + "step": 1851, + "token_acc": 0.2631613021569537 + }, + { + "epoch": 1.0856053943125183, + "grad_norm": 0.8260038363963389, + "learning_rate": 0.0002999849826183499, + "loss": 3.358795404434204, + "step": 1852, + "token_acc": 0.26455691762210704 + }, + { + "epoch": 1.0861917326297275, + "grad_norm": 0.697201027188689, + "learning_rate": 0.0002999847761997726, + "loss": 3.376037836074829, + "step": 1853, + "token_acc": 0.26037088869724184 + }, + { + "epoch": 1.0867780709469363, + "grad_norm": 0.6127033600818833, + "learning_rate": 0.0002999845683722905, + "loss": 3.396925449371338, + "step": 1854, + "token_acc": 0.2584458122608201 + }, + { + "epoch": 1.0873644092641455, + "grad_norm": 0.722678643055183, + "learning_rate": 0.00029998435913590547, + "loss": 3.305473804473877, + "step": 1855, + "token_acc": 0.2696824059852351 + }, + { + "epoch": 1.0879507475813543, + "grad_norm": 0.7044692180510782, + "learning_rate": 0.0002999841484906195, + "loss": 3.361895799636841, + "step": 1856, + "token_acc": 0.26089728895107256 + }, + { + "epoch": 1.0885370858985635, + "grad_norm": 0.6668830006063075, + "learning_rate": 0.0002999839364364345, + "loss": 3.415842056274414, + "step": 1857, + "token_acc": 0.2564124743679114 + }, + { + "epoch": 1.0891234242157726, + "grad_norm": 0.7317548277285817, + "learning_rate": 0.0002999837229733526, + "loss": 3.3588547706604004, + "step": 1858, + "token_acc": 0.2610472786462046 + }, + { + "epoch": 1.0897097625329815, + "grad_norm": 0.9015859051332197, + "learning_rate": 0.0002999835081013756, + "loss": 3.366499423980713, + "step": 1859, + "token_acc": 0.26132335417598257 + }, + { + "epoch": 1.0902961008501906, + "grad_norm": 1.0877575177463663, + "learning_rate": 0.0002999832918205058, + "loss": 3.3732504844665527, + "step": 1860, + "token_acc": 0.259486928952226 + }, + { + "epoch": 1.0908824391673997, + "grad_norm": 1.0735482612044867, + "learning_rate": 0.00029998307413074503, + "loss": 3.423321008682251, + "step": 1861, + "token_acc": 0.25715507538769505 + }, + { + "epoch": 1.0914687774846086, + "grad_norm": 0.868524363994832, + "learning_rate": 0.0002999828550320953, + "loss": 3.3437538146972656, + "step": 1862, + "token_acc": 0.2673338852475441 + }, + { + "epoch": 1.0920551158018177, + "grad_norm": 0.6891365587706741, + "learning_rate": 0.0002999826345245589, + "loss": 3.3648080825805664, + "step": 1863, + "token_acc": 0.26254205175843265 + }, + { + "epoch": 1.0926414541190266, + "grad_norm": 0.9074006485523566, + "learning_rate": 0.00029998241260813767, + "loss": 3.3081865310668945, + "step": 1864, + "token_acc": 0.27017784222366753 + }, + { + "epoch": 1.0932277924362357, + "grad_norm": 0.7593773774507495, + "learning_rate": 0.0002999821892828338, + "loss": 3.315218448638916, + "step": 1865, + "token_acc": 0.26843527895785935 + }, + { + "epoch": 1.0938141307534448, + "grad_norm": 0.6497742092191846, + "learning_rate": 0.00029998196454864934, + "loss": 3.3000540733337402, + "step": 1866, + "token_acc": 0.27061344441312496 + }, + { + "epoch": 1.0944004690706537, + "grad_norm": 0.7048221938275748, + "learning_rate": 0.0002999817384055864, + "loss": 3.3393657207489014, + "step": 1867, + "token_acc": 0.2679184850062057 + }, + { + "epoch": 1.0949868073878628, + "grad_norm": 0.7908784387797431, + "learning_rate": 0.00029998151085364714, + "loss": 3.378603458404541, + "step": 1868, + "token_acc": 0.259456069449902 + }, + { + "epoch": 1.095573145705072, + "grad_norm": 0.8068618658514728, + "learning_rate": 0.00029998128189283373, + "loss": 3.36336612701416, + "step": 1869, + "token_acc": 0.2633997413482546 + }, + { + "epoch": 1.0961594840222808, + "grad_norm": 0.8659161802934441, + "learning_rate": 0.00029998105152314827, + "loss": 3.376901865005493, + "step": 1870, + "token_acc": 0.2623447480247874 + }, + { + "epoch": 1.09674582233949, + "grad_norm": 0.9134912370764321, + "learning_rate": 0.00029998081974459294, + "loss": 3.3381829261779785, + "step": 1871, + "token_acc": 0.2662731509391431 + }, + { + "epoch": 1.0973321606566988, + "grad_norm": 0.7803111438405833, + "learning_rate": 0.0002999805865571699, + "loss": 3.3406195640563965, + "step": 1872, + "token_acc": 0.2671025408892982 + }, + { + "epoch": 1.097918498973908, + "grad_norm": 0.6447883447424895, + "learning_rate": 0.0002999803519608813, + "loss": 3.3208789825439453, + "step": 1873, + "token_acc": 0.2666537644785086 + }, + { + "epoch": 1.098504837291117, + "grad_norm": 0.5818048970487153, + "learning_rate": 0.0002999801159557295, + "loss": 3.3246748447418213, + "step": 1874, + "token_acc": 0.269779814782225 + }, + { + "epoch": 1.099091175608326, + "grad_norm": 0.7194365061619795, + "learning_rate": 0.00029997987854171656, + "loss": 3.3521764278411865, + "step": 1875, + "token_acc": 0.2627429729652276 + }, + { + "epoch": 1.099677513925535, + "grad_norm": 0.7418192473254891, + "learning_rate": 0.00029997963971884476, + "loss": 3.3831262588500977, + "step": 1876, + "token_acc": 0.26058105384542124 + }, + { + "epoch": 1.1002638522427441, + "grad_norm": 0.7360889287029435, + "learning_rate": 0.0002999793994871163, + "loss": 3.2953128814697266, + "step": 1877, + "token_acc": 0.2726489190900754 + }, + { + "epoch": 1.100850190559953, + "grad_norm": 0.6914228117661096, + "learning_rate": 0.0002999791578465335, + "loss": 3.3322625160217285, + "step": 1878, + "token_acc": 0.26731065516909897 + }, + { + "epoch": 1.1014365288771621, + "grad_norm": 0.6818145221951787, + "learning_rate": 0.00029997891479709865, + "loss": 3.431363105773926, + "step": 1879, + "token_acc": 0.2527140101432057 + }, + { + "epoch": 1.1020228671943713, + "grad_norm": 0.6380090435573896, + "learning_rate": 0.000299978670338814, + "loss": 3.3048906326293945, + "step": 1880, + "token_acc": 0.26954131041178025 + }, + { + "epoch": 1.1026092055115801, + "grad_norm": 0.6626461330029279, + "learning_rate": 0.0002999784244716818, + "loss": 3.4155962467193604, + "step": 1881, + "token_acc": 0.25694825620672185 + }, + { + "epoch": 1.1031955438287893, + "grad_norm": 0.8506625506583408, + "learning_rate": 0.0002999781771957044, + "loss": 3.3267247676849365, + "step": 1882, + "token_acc": 0.2661804153999438 + }, + { + "epoch": 1.1037818821459981, + "grad_norm": 1.0169888861684042, + "learning_rate": 0.0002999779285108841, + "loss": 3.3621134757995605, + "step": 1883, + "token_acc": 0.2641188721236499 + }, + { + "epoch": 1.1043682204632073, + "grad_norm": 0.9996126969486607, + "learning_rate": 0.0002999776784172234, + "loss": 3.4028024673461914, + "step": 1884, + "token_acc": 0.25747495324083963 + }, + { + "epoch": 1.1049545587804164, + "grad_norm": 0.8139645269537045, + "learning_rate": 0.0002999774269147244, + "loss": 3.353316307067871, + "step": 1885, + "token_acc": 0.26401771069917107 + }, + { + "epoch": 1.1055408970976253, + "grad_norm": 0.80233470952487, + "learning_rate": 0.00029997717400338954, + "loss": 3.3447303771972656, + "step": 1886, + "token_acc": 0.2648047630674455 + }, + { + "epoch": 1.1061272354148344, + "grad_norm": 0.9067542121117775, + "learning_rate": 0.00029997691968322126, + "loss": 3.34726619720459, + "step": 1887, + "token_acc": 0.26563167601421334 + }, + { + "epoch": 1.1067135737320435, + "grad_norm": 0.8790008003714386, + "learning_rate": 0.0002999766639542219, + "loss": 3.383406162261963, + "step": 1888, + "token_acc": 0.26137054282255673 + }, + { + "epoch": 1.1072999120492524, + "grad_norm": 0.7723925882335825, + "learning_rate": 0.0002999764068163939, + "loss": 3.375410556793213, + "step": 1889, + "token_acc": 0.2602062758529014 + }, + { + "epoch": 1.1078862503664615, + "grad_norm": 0.6862856940559612, + "learning_rate": 0.0002999761482697397, + "loss": 3.3254730701446533, + "step": 1890, + "token_acc": 0.2677047978495208 + }, + { + "epoch": 1.1084725886836704, + "grad_norm": 0.7005596297591745, + "learning_rate": 0.0002999758883142616, + "loss": 3.3733391761779785, + "step": 1891, + "token_acc": 0.26148496348237105 + }, + { + "epoch": 1.1090589270008795, + "grad_norm": 0.6574644069151541, + "learning_rate": 0.0002999756269499622, + "loss": 3.3428986072540283, + "step": 1892, + "token_acc": 0.263758845483692 + }, + { + "epoch": 1.1096452653180886, + "grad_norm": 0.7374671406319119, + "learning_rate": 0.0002999753641768438, + "loss": 3.34000301361084, + "step": 1893, + "token_acc": 0.26486557171252223 + }, + { + "epoch": 1.1102316036352975, + "grad_norm": 0.7089183877569187, + "learning_rate": 0.000299975099994909, + "loss": 3.343444585800171, + "step": 1894, + "token_acc": 0.2662241450838618 + }, + { + "epoch": 1.1108179419525066, + "grad_norm": 0.7150249875529383, + "learning_rate": 0.00029997483440416024, + "loss": 3.355675220489502, + "step": 1895, + "token_acc": 0.2632219663815144 + }, + { + "epoch": 1.1114042802697157, + "grad_norm": 0.68784403268337, + "learning_rate": 0.0002999745674046, + "loss": 3.355212688446045, + "step": 1896, + "token_acc": 0.26356509016888774 + }, + { + "epoch": 1.1119906185869246, + "grad_norm": 0.688190966174502, + "learning_rate": 0.00029997429899623077, + "loss": 3.3388872146606445, + "step": 1897, + "token_acc": 0.2651463904585778 + }, + { + "epoch": 1.1125769569041337, + "grad_norm": 0.7259598076374931, + "learning_rate": 0.0002999740291790551, + "loss": 3.3208112716674805, + "step": 1898, + "token_acc": 0.26923304342217014 + }, + { + "epoch": 1.1131632952213426, + "grad_norm": 0.8619283887543173, + "learning_rate": 0.0002999737579530755, + "loss": 3.371981143951416, + "step": 1899, + "token_acc": 0.26365867501918205 + }, + { + "epoch": 1.1137496335385517, + "grad_norm": 0.7331512846683491, + "learning_rate": 0.00029997348531829454, + "loss": 3.286670684814453, + "step": 1900, + "token_acc": 0.2714882102821801 + }, + { + "epoch": 1.1143359718557608, + "grad_norm": 0.6969807793380998, + "learning_rate": 0.0002999732112747148, + "loss": 3.376579761505127, + "step": 1901, + "token_acc": 0.2624837004382263 + }, + { + "epoch": 1.1149223101729697, + "grad_norm": 0.6956932096027781, + "learning_rate": 0.0002999729358223388, + "loss": 3.3629045486450195, + "step": 1902, + "token_acc": 0.2634608450689611 + }, + { + "epoch": 1.1155086484901788, + "grad_norm": 0.6236796923785131, + "learning_rate": 0.00029997265896116923, + "loss": 3.304075002670288, + "step": 1903, + "token_acc": 0.27021195106626383 + }, + { + "epoch": 1.116094986807388, + "grad_norm": 0.5722823413972503, + "learning_rate": 0.00029997238069120856, + "loss": 3.3786699771881104, + "step": 1904, + "token_acc": 0.2614218425268879 + }, + { + "epoch": 1.1166813251245968, + "grad_norm": 0.6006724897405271, + "learning_rate": 0.00029997210101245943, + "loss": 3.277489185333252, + "step": 1905, + "token_acc": 0.27338236577241487 + }, + { + "epoch": 1.117267663441806, + "grad_norm": 0.6142685318454425, + "learning_rate": 0.0002999718199249246, + "loss": 3.367166042327881, + "step": 1906, + "token_acc": 0.2626761168846522 + }, + { + "epoch": 1.117854001759015, + "grad_norm": 0.7853224904120498, + "learning_rate": 0.00029997153742860656, + "loss": 3.393280267715454, + "step": 1907, + "token_acc": 0.25774006826009216 + }, + { + "epoch": 1.118440340076224, + "grad_norm": 0.9933270488076913, + "learning_rate": 0.000299971253523508, + "loss": 3.3495571613311768, + "step": 1908, + "token_acc": 0.26349458047421176 + }, + { + "epoch": 1.119026678393433, + "grad_norm": 1.0631415138893656, + "learning_rate": 0.0002999709682096316, + "loss": 3.311051368713379, + "step": 1909, + "token_acc": 0.2663021597683087 + }, + { + "epoch": 1.119613016710642, + "grad_norm": 0.9524295589845513, + "learning_rate": 0.0002999706814869801, + "loss": 3.3175277709960938, + "step": 1910, + "token_acc": 0.267980606631419 + }, + { + "epoch": 1.120199355027851, + "grad_norm": 0.8239734835740735, + "learning_rate": 0.000299970393355556, + "loss": 3.3693792819976807, + "step": 1911, + "token_acc": 0.2606141410774551 + }, + { + "epoch": 1.1207856933450602, + "grad_norm": 0.7856677472781632, + "learning_rate": 0.0002999701038153623, + "loss": 3.362311840057373, + "step": 1912, + "token_acc": 0.2618071321468802 + }, + { + "epoch": 1.121372031662269, + "grad_norm": 0.706648113129583, + "learning_rate": 0.0002999698128664015, + "loss": 3.335555076599121, + "step": 1913, + "token_acc": 0.2662314007183171 + }, + { + "epoch": 1.1219583699794782, + "grad_norm": 0.7102652196766037, + "learning_rate": 0.0002999695205086764, + "loss": 3.388075113296509, + "step": 1914, + "token_acc": 0.2599739683509952 + }, + { + "epoch": 1.1225447082966873, + "grad_norm": 0.6667556280557873, + "learning_rate": 0.0002999692267421897, + "loss": 3.3488645553588867, + "step": 1915, + "token_acc": 0.26448339628374307 + }, + { + "epoch": 1.1231310466138962, + "grad_norm": 0.6601217240036369, + "learning_rate": 0.00029996893156694426, + "loss": 3.3449301719665527, + "step": 1916, + "token_acc": 0.2628081070982206 + }, + { + "epoch": 1.1237173849311053, + "grad_norm": 0.7229753657573753, + "learning_rate": 0.00029996863498294276, + "loss": 3.33357310295105, + "step": 1917, + "token_acc": 0.2662222444199685 + }, + { + "epoch": 1.1243037232483142, + "grad_norm": 0.8791007568872177, + "learning_rate": 0.00029996833699018805, + "loss": 3.35505747795105, + "step": 1918, + "token_acc": 0.26359758503775726 + }, + { + "epoch": 1.1248900615655233, + "grad_norm": 1.0999690846524, + "learning_rate": 0.0002999680375886829, + "loss": 3.3831076622009277, + "step": 1919, + "token_acc": 0.260181298035195 + }, + { + "epoch": 1.1254763998827324, + "grad_norm": 0.8369759326026645, + "learning_rate": 0.00029996773677843004, + "loss": 3.359495162963867, + "step": 1920, + "token_acc": 0.2637172948018531 + }, + { + "epoch": 1.1260627381999413, + "grad_norm": 0.7845576540450709, + "learning_rate": 0.0002999674345594325, + "loss": 3.3230931758880615, + "step": 1921, + "token_acc": 0.26525522740727936 + }, + { + "epoch": 1.1266490765171504, + "grad_norm": 0.8410633495346551, + "learning_rate": 0.00029996713093169284, + "loss": 3.359400749206543, + "step": 1922, + "token_acc": 0.26570629795092193 + }, + { + "epoch": 1.1272354148343595, + "grad_norm": 0.7303650681504213, + "learning_rate": 0.0002999668258952142, + "loss": 3.332303524017334, + "step": 1923, + "token_acc": 0.26684466823821207 + }, + { + "epoch": 1.1278217531515684, + "grad_norm": 0.6731782228706984, + "learning_rate": 0.0002999665194499992, + "loss": 3.3381526470184326, + "step": 1924, + "token_acc": 0.2665927327288147 + }, + { + "epoch": 1.1284080914687775, + "grad_norm": 0.875065253599284, + "learning_rate": 0.000299966211596051, + "loss": 3.337651491165161, + "step": 1925, + "token_acc": 0.2667549806915921 + }, + { + "epoch": 1.1289944297859864, + "grad_norm": 0.8093063015717047, + "learning_rate": 0.00029996590233337216, + "loss": 3.352814197540283, + "step": 1926, + "token_acc": 0.26216892438334005 + }, + { + "epoch": 1.1295807681031955, + "grad_norm": 0.7451980234972398, + "learning_rate": 0.0002999655916619658, + "loss": 3.338909149169922, + "step": 1927, + "token_acc": 0.26322852809931563 + }, + { + "epoch": 1.1301671064204046, + "grad_norm": 0.6049228277401555, + "learning_rate": 0.0002999652795818348, + "loss": 3.3408162593841553, + "step": 1928, + "token_acc": 0.2655704082368825 + }, + { + "epoch": 1.1307534447376135, + "grad_norm": 0.619707357719485, + "learning_rate": 0.00029996496609298215, + "loss": 3.289022445678711, + "step": 1929, + "token_acc": 0.27188357714557826 + }, + { + "epoch": 1.1313397830548226, + "grad_norm": 0.8673011790171524, + "learning_rate": 0.00029996465119541064, + "loss": 3.334179639816284, + "step": 1930, + "token_acc": 0.2659278985940694 + }, + { + "epoch": 1.1319261213720317, + "grad_norm": 0.7982743584041475, + "learning_rate": 0.0002999643348891233, + "loss": 3.3319764137268066, + "step": 1931, + "token_acc": 0.266465007929949 + }, + { + "epoch": 1.1325124596892406, + "grad_norm": 0.7837746878822562, + "learning_rate": 0.0002999640171741232, + "loss": 3.323746681213379, + "step": 1932, + "token_acc": 0.2686681787230121 + }, + { + "epoch": 1.1330987980064497, + "grad_norm": 0.7451614648824124, + "learning_rate": 0.00029996369805041314, + "loss": 3.3534750938415527, + "step": 1933, + "token_acc": 0.26392933798243023 + }, + { + "epoch": 1.1336851363236589, + "grad_norm": 0.6631188997441021, + "learning_rate": 0.00029996337751799624, + "loss": 3.380854368209839, + "step": 1934, + "token_acc": 0.2589157624066479 + }, + { + "epoch": 1.1342714746408677, + "grad_norm": 0.6273536900390022, + "learning_rate": 0.00029996305557687555, + "loss": 3.3189263343811035, + "step": 1935, + "token_acc": 0.2681923947343257 + }, + { + "epoch": 1.1348578129580769, + "grad_norm": 0.5243896912843286, + "learning_rate": 0.00029996273222705397, + "loss": 3.301912307739258, + "step": 1936, + "token_acc": 0.27202488927476914 + }, + { + "epoch": 1.1354441512752858, + "grad_norm": 0.552250075539328, + "learning_rate": 0.0002999624074685346, + "loss": 3.3436598777770996, + "step": 1937, + "token_acc": 0.2636355927262705 + }, + { + "epoch": 1.1360304895924949, + "grad_norm": 0.5929807875975288, + "learning_rate": 0.0002999620813013205, + "loss": 3.3347506523132324, + "step": 1938, + "token_acc": 0.26471321695760597 + }, + { + "epoch": 1.136616827909704, + "grad_norm": 0.7032005102990087, + "learning_rate": 0.0002999617537254147, + "loss": 3.374617099761963, + "step": 1939, + "token_acc": 0.2601524976615624 + }, + { + "epoch": 1.1372031662269129, + "grad_norm": 0.7285075832661516, + "learning_rate": 0.0002999614247408204, + "loss": 3.360698699951172, + "step": 1940, + "token_acc": 0.26319181077740567 + }, + { + "epoch": 1.137789504544122, + "grad_norm": 0.8826469909185993, + "learning_rate": 0.0002999610943475404, + "loss": 3.3209612369537354, + "step": 1941, + "token_acc": 0.26784537231409444 + }, + { + "epoch": 1.1383758428613309, + "grad_norm": 1.0492315463364479, + "learning_rate": 0.00029996076254557816, + "loss": 3.338071346282959, + "step": 1942, + "token_acc": 0.2661972017069852 + }, + { + "epoch": 1.13896218117854, + "grad_norm": 0.860027410585991, + "learning_rate": 0.0002999604293349366, + "loss": 3.377962350845337, + "step": 1943, + "token_acc": 0.2602863675511627 + }, + { + "epoch": 1.139548519495749, + "grad_norm": 0.650907376080584, + "learning_rate": 0.0002999600947156188, + "loss": 3.389468193054199, + "step": 1944, + "token_acc": 0.25905989249603856 + }, + { + "epoch": 1.140134857812958, + "grad_norm": 0.5969851658839853, + "learning_rate": 0.0002999597586876281, + "loss": 3.2962870597839355, + "step": 1945, + "token_acc": 0.2698709445534775 + }, + { + "epoch": 1.140721196130167, + "grad_norm": 0.7242920636922087, + "learning_rate": 0.00029995942125096746, + "loss": 3.3191795349121094, + "step": 1946, + "token_acc": 0.26826151214228594 + }, + { + "epoch": 1.1413075344473762, + "grad_norm": 0.8087653998720067, + "learning_rate": 0.0002999590824056401, + "loss": 3.3854565620422363, + "step": 1947, + "token_acc": 0.25963435618733316 + }, + { + "epoch": 1.141893872764585, + "grad_norm": 0.7869087307846857, + "learning_rate": 0.00029995874215164934, + "loss": 3.305237054824829, + "step": 1948, + "token_acc": 0.26998533896463617 + }, + { + "epoch": 1.1424802110817942, + "grad_norm": 0.6910460118517419, + "learning_rate": 0.0002999584004889982, + "loss": 3.338068962097168, + "step": 1949, + "token_acc": 0.2669910527853209 + }, + { + "epoch": 1.1430665493990033, + "grad_norm": 0.7108637470013623, + "learning_rate": 0.00029995805741769, + "loss": 3.3965020179748535, + "step": 1950, + "token_acc": 0.25785697121285334 + }, + { + "epoch": 1.1436528877162122, + "grad_norm": 0.7178269562933071, + "learning_rate": 0.0002999577129377279, + "loss": 3.3654632568359375, + "step": 1951, + "token_acc": 0.2610368533812058 + }, + { + "epoch": 1.1442392260334213, + "grad_norm": 0.65037033338626, + "learning_rate": 0.0002999573670491151, + "loss": 3.357334613800049, + "step": 1952, + "token_acc": 0.26266939619784835 + }, + { + "epoch": 1.1448255643506302, + "grad_norm": 0.7189769920989445, + "learning_rate": 0.000299957019751855, + "loss": 3.327446460723877, + "step": 1953, + "token_acc": 0.26656079129405996 + }, + { + "epoch": 1.1454119026678393, + "grad_norm": 0.8095041811195423, + "learning_rate": 0.00029995667104595067, + "loss": 3.321824073791504, + "step": 1954, + "token_acc": 0.26718440404792615 + }, + { + "epoch": 1.1459982409850484, + "grad_norm": 0.7130710663788038, + "learning_rate": 0.00029995632093140557, + "loss": 3.3693463802337646, + "step": 1955, + "token_acc": 0.2625461369695096 + }, + { + "epoch": 1.1465845793022573, + "grad_norm": 0.6500165133231406, + "learning_rate": 0.0002999559694082229, + "loss": 3.375372886657715, + "step": 1956, + "token_acc": 0.2590127074425589 + }, + { + "epoch": 1.1471709176194664, + "grad_norm": 0.7787068125894548, + "learning_rate": 0.0002999556164764059, + "loss": 3.3446969985961914, + "step": 1957, + "token_acc": 0.2645939922055181 + }, + { + "epoch": 1.1477572559366755, + "grad_norm": 0.8195009412002108, + "learning_rate": 0.00029995526213595804, + "loss": 3.3114867210388184, + "step": 1958, + "token_acc": 0.2697273415397499 + }, + { + "epoch": 1.1483435942538844, + "grad_norm": 0.8081389286150346, + "learning_rate": 0.0002999549063868825, + "loss": 3.3430967330932617, + "step": 1959, + "token_acc": 0.2663364055895911 + }, + { + "epoch": 1.1489299325710935, + "grad_norm": 0.7759123761988718, + "learning_rate": 0.00029995454922918265, + "loss": 3.3480381965637207, + "step": 1960, + "token_acc": 0.2663450319231223 + }, + { + "epoch": 1.1495162708883027, + "grad_norm": 0.719609283634722, + "learning_rate": 0.00029995419066286197, + "loss": 3.284679412841797, + "step": 1961, + "token_acc": 0.271282795366428 + }, + { + "epoch": 1.1501026092055116, + "grad_norm": 0.7015104305571958, + "learning_rate": 0.00029995383068792366, + "loss": 3.3615212440490723, + "step": 1962, + "token_acc": 0.2632653326234596 + }, + { + "epoch": 1.1506889475227207, + "grad_norm": 0.5689924705042977, + "learning_rate": 0.0002999534693043712, + "loss": 3.3382349014282227, + "step": 1963, + "token_acc": 0.2646267546566649 + }, + { + "epoch": 1.1512752858399296, + "grad_norm": 0.6262209952327991, + "learning_rate": 0.000299953106512208, + "loss": 3.286991834640503, + "step": 1964, + "token_acc": 0.27078639958660117 + }, + { + "epoch": 1.1518616241571387, + "grad_norm": 0.7388822545265264, + "learning_rate": 0.00029995274231143734, + "loss": 3.3689193725585938, + "step": 1965, + "token_acc": 0.2620067219652344 + }, + { + "epoch": 1.1524479624743478, + "grad_norm": 0.7812418842743305, + "learning_rate": 0.0002999523767020628, + "loss": 3.352614641189575, + "step": 1966, + "token_acc": 0.2621751615766729 + }, + { + "epoch": 1.1530343007915567, + "grad_norm": 0.6051669723795385, + "learning_rate": 0.0002999520096840877, + "loss": 3.3416333198547363, + "step": 1967, + "token_acc": 0.2651756764996726 + }, + { + "epoch": 1.1536206391087658, + "grad_norm": 0.6706426620703033, + "learning_rate": 0.0002999516412575156, + "loss": 3.3312907218933105, + "step": 1968, + "token_acc": 0.26512975722841137 + }, + { + "epoch": 1.1542069774259747, + "grad_norm": 0.765885205827473, + "learning_rate": 0.00029995127142234983, + "loss": 3.288945436477661, + "step": 1969, + "token_acc": 0.27127552373375763 + }, + { + "epoch": 1.1547933157431838, + "grad_norm": 0.6795166117387362, + "learning_rate": 0.00029995090017859394, + "loss": 3.293445587158203, + "step": 1970, + "token_acc": 0.27237392049574793 + }, + { + "epoch": 1.155379654060393, + "grad_norm": 0.7916104091506182, + "learning_rate": 0.0002999505275262514, + "loss": 3.379434585571289, + "step": 1971, + "token_acc": 0.2616566646034608 + }, + { + "epoch": 1.1559659923776018, + "grad_norm": 0.7188022054827424, + "learning_rate": 0.00029995015346532573, + "loss": 3.304378032684326, + "step": 1972, + "token_acc": 0.2698788996793357 + }, + { + "epoch": 1.156552330694811, + "grad_norm": 0.569945433372692, + "learning_rate": 0.0002999497779958204, + "loss": 3.291182041168213, + "step": 1973, + "token_acc": 0.2715777536506419 + }, + { + "epoch": 1.15713866901202, + "grad_norm": 0.6201960741217912, + "learning_rate": 0.000299949401117739, + "loss": 3.3668785095214844, + "step": 1974, + "token_acc": 0.2601957358099146 + }, + { + "epoch": 1.157725007329229, + "grad_norm": 0.7329999714887171, + "learning_rate": 0.00029994902283108504, + "loss": 3.3286333084106445, + "step": 1975, + "token_acc": 0.267235791596362 + }, + { + "epoch": 1.158311345646438, + "grad_norm": 0.7213961361869522, + "learning_rate": 0.000299948643135862, + "loss": 3.282303810119629, + "step": 1976, + "token_acc": 0.2734704559352425 + }, + { + "epoch": 1.1588976839636471, + "grad_norm": 0.6427228218191209, + "learning_rate": 0.0002999482620320736, + "loss": 3.368710517883301, + "step": 1977, + "token_acc": 0.26174031511292994 + }, + { + "epoch": 1.159484022280856, + "grad_norm": 0.5613460399318453, + "learning_rate": 0.00029994787951972333, + "loss": 3.356588363647461, + "step": 1978, + "token_acc": 0.26274101309296055 + }, + { + "epoch": 1.1600703605980651, + "grad_norm": 0.6097384180377884, + "learning_rate": 0.00029994749559881476, + "loss": 3.332831859588623, + "step": 1979, + "token_acc": 0.26737021804919686 + }, + { + "epoch": 1.160656698915274, + "grad_norm": 0.6316714066209003, + "learning_rate": 0.0002999471102693515, + "loss": 3.332819938659668, + "step": 1980, + "token_acc": 0.2647212923507352 + }, + { + "epoch": 1.1612430372324831, + "grad_norm": 0.6795550957237514, + "learning_rate": 0.00029994672353133726, + "loss": 3.3542909622192383, + "step": 1981, + "token_acc": 0.2649227253279347 + }, + { + "epoch": 1.1618293755496922, + "grad_norm": 0.6264201925215888, + "learning_rate": 0.00029994633538477555, + "loss": 3.322784900665283, + "step": 1982, + "token_acc": 0.2667106243600671 + }, + { + "epoch": 1.1624157138669011, + "grad_norm": 0.6633182100862598, + "learning_rate": 0.0002999459458296701, + "loss": 3.3186521530151367, + "step": 1983, + "token_acc": 0.2680578769918358 + }, + { + "epoch": 1.1630020521841102, + "grad_norm": 0.6881980978248731, + "learning_rate": 0.0002999455548660245, + "loss": 3.3235373497009277, + "step": 1984, + "token_acc": 0.26742215491840887 + }, + { + "epoch": 1.1635883905013193, + "grad_norm": 0.5706343264002469, + "learning_rate": 0.0002999451624938425, + "loss": 3.3294029235839844, + "step": 1985, + "token_acc": 0.265123171920702 + }, + { + "epoch": 1.1641747288185282, + "grad_norm": 0.6942383787856404, + "learning_rate": 0.00029994476871312783, + "loss": 3.34446382522583, + "step": 1986, + "token_acc": 0.2650353065399591 + }, + { + "epoch": 1.1647610671357373, + "grad_norm": 0.7099706142371945, + "learning_rate": 0.00029994437352388404, + "loss": 3.2746572494506836, + "step": 1987, + "token_acc": 0.27445367288568606 + }, + { + "epoch": 1.1653474054529465, + "grad_norm": 0.5898544531498877, + "learning_rate": 0.00029994397692611487, + "loss": 3.3302407264709473, + "step": 1988, + "token_acc": 0.2661184072943243 + }, + { + "epoch": 1.1659337437701554, + "grad_norm": 0.6300487622753115, + "learning_rate": 0.00029994357891982413, + "loss": 3.3049159049987793, + "step": 1989, + "token_acc": 0.26821039513661626 + }, + { + "epoch": 1.1665200820873645, + "grad_norm": 0.6516852825734706, + "learning_rate": 0.00029994317950501556, + "loss": 3.3591113090515137, + "step": 1990, + "token_acc": 0.26239879899201113 + }, + { + "epoch": 1.1671064204045734, + "grad_norm": 0.6863933468120201, + "learning_rate": 0.00029994277868169284, + "loss": 3.338371753692627, + "step": 1991, + "token_acc": 0.2648220212142904 + }, + { + "epoch": 1.1676927587217825, + "grad_norm": 0.8924121778566915, + "learning_rate": 0.00029994237644985977, + "loss": 3.317682981491089, + "step": 1992, + "token_acc": 0.2689219445659727 + }, + { + "epoch": 1.1682790970389916, + "grad_norm": 1.0840220933324198, + "learning_rate": 0.0002999419728095201, + "loss": 3.3796143531799316, + "step": 1993, + "token_acc": 0.2584881158708978 + }, + { + "epoch": 1.1688654353562005, + "grad_norm": 1.2412303166841476, + "learning_rate": 0.00029994156776067763, + "loss": 3.3809375762939453, + "step": 1994, + "token_acc": 0.25951028021167694 + }, + { + "epoch": 1.1694517736734096, + "grad_norm": 0.7827911872446766, + "learning_rate": 0.0002999411613033362, + "loss": 3.3278648853302, + "step": 1995, + "token_acc": 0.26477981639327036 + }, + { + "epoch": 1.1700381119906185, + "grad_norm": 0.7929071837072107, + "learning_rate": 0.0002999407534374996, + "loss": 3.3798394203186035, + "step": 1996, + "token_acc": 0.2608343818359324 + }, + { + "epoch": 1.1706244503078276, + "grad_norm": 0.7986184390573784, + "learning_rate": 0.00029994034416317165, + "loss": 3.3762965202331543, + "step": 1997, + "token_acc": 0.25908969564208567 + }, + { + "epoch": 1.1712107886250367, + "grad_norm": 0.7999578194088334, + "learning_rate": 0.00029993993348035626, + "loss": 3.3295092582702637, + "step": 1998, + "token_acc": 0.26638387307608513 + }, + { + "epoch": 1.1717971269422456, + "grad_norm": 0.7126091151644154, + "learning_rate": 0.00029993952138905724, + "loss": 3.3363890647888184, + "step": 1999, + "token_acc": 0.2659289195422305 + }, + { + "epoch": 1.1723834652594547, + "grad_norm": 0.893795821143371, + "learning_rate": 0.0002999391078892784, + "loss": 3.3406131267547607, + "step": 2000, + "token_acc": 0.26555549402883144 + }, + { + "epoch": 1.1729698035766638, + "grad_norm": 1.0080838403923484, + "learning_rate": 0.0002999386929810238, + "loss": 3.332104206085205, + "step": 2001, + "token_acc": 0.265471036794742 + }, + { + "epoch": 1.1735561418938727, + "grad_norm": 0.7842580658459681, + "learning_rate": 0.00029993827666429713, + "loss": 3.2980542182922363, + "step": 2002, + "token_acc": 0.27015937623092384 + }, + { + "epoch": 1.1741424802110818, + "grad_norm": 0.6727063001572678, + "learning_rate": 0.0002999378589391024, + "loss": 3.3635566234588623, + "step": 2003, + "token_acc": 0.2622830703961223 + }, + { + "epoch": 1.174728818528291, + "grad_norm": 0.8024219203341931, + "learning_rate": 0.00029993743980544354, + "loss": 3.331637144088745, + "step": 2004, + "token_acc": 0.26550438163009815 + }, + { + "epoch": 1.1753151568454998, + "grad_norm": 0.6713497173993388, + "learning_rate": 0.0002999370192633245, + "loss": 3.337149143218994, + "step": 2005, + "token_acc": 0.265036261659202 + }, + { + "epoch": 1.175901495162709, + "grad_norm": 0.6415603183059132, + "learning_rate": 0.0002999365973127492, + "loss": 3.307196855545044, + "step": 2006, + "token_acc": 0.2695347666004744 + }, + { + "epoch": 1.1764878334799178, + "grad_norm": 0.5961756599005775, + "learning_rate": 0.00029993617395372165, + "loss": 3.362119197845459, + "step": 2007, + "token_acc": 0.260503534487951 + }, + { + "epoch": 1.177074171797127, + "grad_norm": 0.682835441678395, + "learning_rate": 0.00029993574918624574, + "loss": 3.3564674854278564, + "step": 2008, + "token_acc": 0.26026551084234156 + }, + { + "epoch": 1.177660510114336, + "grad_norm": 0.6625098307529516, + "learning_rate": 0.0002999353230103255, + "loss": 3.310089111328125, + "step": 2009, + "token_acc": 0.2677890816436882 + }, + { + "epoch": 1.178246848431545, + "grad_norm": 0.6123330858898528, + "learning_rate": 0.0002999348954259649, + "loss": 3.3231968879699707, + "step": 2010, + "token_acc": 0.26636346379378045 + }, + { + "epoch": 1.178833186748754, + "grad_norm": 0.5702395878478161, + "learning_rate": 0.0002999344664331681, + "loss": 3.2643089294433594, + "step": 2011, + "token_acc": 0.27394198960915056 + }, + { + "epoch": 1.1794195250659631, + "grad_norm": 0.6425606904272094, + "learning_rate": 0.00029993403603193895, + "loss": 3.3612194061279297, + "step": 2012, + "token_acc": 0.2606042466011451 + }, + { + "epoch": 1.180005863383172, + "grad_norm": 0.6899898103543229, + "learning_rate": 0.0002999336042222816, + "loss": 3.321394920349121, + "step": 2013, + "token_acc": 0.26843857951443134 + }, + { + "epoch": 1.1805922017003811, + "grad_norm": 0.593362004041284, + "learning_rate": 0.00029993317100420006, + "loss": 3.314401149749756, + "step": 2014, + "token_acc": 0.2682603445690519 + }, + { + "epoch": 1.1811785400175903, + "grad_norm": 0.5580832500825603, + "learning_rate": 0.00029993273637769844, + "loss": 3.3018605709075928, + "step": 2015, + "token_acc": 0.2695492449314311 + }, + { + "epoch": 1.1817648783347992, + "grad_norm": 0.6903425112203811, + "learning_rate": 0.0002999323003427808, + "loss": 3.3319854736328125, + "step": 2016, + "token_acc": 0.2660228246601794 + }, + { + "epoch": 1.1823512166520083, + "grad_norm": 0.6967150806433087, + "learning_rate": 0.0002999318628994512, + "loss": 3.294027090072632, + "step": 2017, + "token_acc": 0.2699184743561589 + }, + { + "epoch": 1.1829375549692172, + "grad_norm": 0.756701148809816, + "learning_rate": 0.0002999314240477138, + "loss": 3.3299171924591064, + "step": 2018, + "token_acc": 0.2662063167072781 + }, + { + "epoch": 1.1835238932864263, + "grad_norm": 0.628809448696654, + "learning_rate": 0.00029993098378757274, + "loss": 3.2967135906219482, + "step": 2019, + "token_acc": 0.2696584922702615 + }, + { + "epoch": 1.1841102316036354, + "grad_norm": 0.6411136343546889, + "learning_rate": 0.0002999305421190321, + "loss": 3.3544015884399414, + "step": 2020, + "token_acc": 0.26283174762143213 + }, + { + "epoch": 1.1846965699208443, + "grad_norm": 0.6333796484586045, + "learning_rate": 0.00029993009904209604, + "loss": 3.3268680572509766, + "step": 2021, + "token_acc": 0.265913253455844 + }, + { + "epoch": 1.1852829082380534, + "grad_norm": 0.6107410027655137, + "learning_rate": 0.00029992965455676875, + "loss": 3.3241801261901855, + "step": 2022, + "token_acc": 0.26734860660873355 + }, + { + "epoch": 1.1858692465552623, + "grad_norm": 0.7356535562613199, + "learning_rate": 0.00029992920866305433, + "loss": 3.2969963550567627, + "step": 2023, + "token_acc": 0.27054739263723854 + }, + { + "epoch": 1.1864555848724714, + "grad_norm": 0.7695591936268781, + "learning_rate": 0.00029992876136095706, + "loss": 3.3530123233795166, + "step": 2024, + "token_acc": 0.26591298729045404 + }, + { + "epoch": 1.1870419231896805, + "grad_norm": 0.6838717378354494, + "learning_rate": 0.00029992831265048117, + "loss": 3.3304524421691895, + "step": 2025, + "token_acc": 0.26564195578705857 + }, + { + "epoch": 1.1876282615068894, + "grad_norm": 0.7090986012275077, + "learning_rate": 0.00029992786253163077, + "loss": 3.334564447402954, + "step": 2026, + "token_acc": 0.263806425270407 + }, + { + "epoch": 1.1882145998240985, + "grad_norm": 0.8212285587289102, + "learning_rate": 0.0002999274110044101, + "loss": 3.3476967811584473, + "step": 2027, + "token_acc": 0.2638440899530132 + }, + { + "epoch": 1.1888009381413076, + "grad_norm": 0.6842681880156068, + "learning_rate": 0.00029992695806882344, + "loss": 3.309312105178833, + "step": 2028, + "token_acc": 0.26729882987588954 + }, + { + "epoch": 1.1893872764585165, + "grad_norm": 0.7247761063798277, + "learning_rate": 0.00029992650372487507, + "loss": 3.32065486907959, + "step": 2029, + "token_acc": 0.2655774066670965 + }, + { + "epoch": 1.1899736147757256, + "grad_norm": 0.7610741636989733, + "learning_rate": 0.0002999260479725692, + "loss": 3.3356432914733887, + "step": 2030, + "token_acc": 0.26399924497307664 + }, + { + "epoch": 1.1905599530929347, + "grad_norm": 0.7515939632275938, + "learning_rate": 0.0002999255908119101, + "loss": 3.298417091369629, + "step": 2031, + "token_acc": 0.2671629509828957 + }, + { + "epoch": 1.1911462914101436, + "grad_norm": 0.6858455339466643, + "learning_rate": 0.0002999251322429022, + "loss": 3.303715229034424, + "step": 2032, + "token_acc": 0.269738735405361 + }, + { + "epoch": 1.1917326297273527, + "grad_norm": 0.6665509936380439, + "learning_rate": 0.0002999246722655497, + "loss": 3.3836960792541504, + "step": 2033, + "token_acc": 0.2594626330508719 + }, + { + "epoch": 1.1923189680445616, + "grad_norm": 0.8279750421855766, + "learning_rate": 0.00029992421087985684, + "loss": 3.2851297855377197, + "step": 2034, + "token_acc": 0.27180549127884934 + }, + { + "epoch": 1.1929053063617707, + "grad_norm": 0.8588560314510714, + "learning_rate": 0.0002999237480858281, + "loss": 3.3636527061462402, + "step": 2035, + "token_acc": 0.26119030348901995 + }, + { + "epoch": 1.1934916446789798, + "grad_norm": 0.6777087470019907, + "learning_rate": 0.0002999232838834678, + "loss": 3.3011584281921387, + "step": 2036, + "token_acc": 0.27084546902627493 + }, + { + "epoch": 1.1940779829961887, + "grad_norm": 0.6595819956758585, + "learning_rate": 0.0002999228182727802, + "loss": 3.333683490753174, + "step": 2037, + "token_acc": 0.2648266417208313 + }, + { + "epoch": 1.1946643213133978, + "grad_norm": 0.6792667286840062, + "learning_rate": 0.0002999223512537698, + "loss": 3.2768514156341553, + "step": 2038, + "token_acc": 0.27290284993735087 + }, + { + "epoch": 1.195250659630607, + "grad_norm": 0.7246748573378871, + "learning_rate": 0.00029992188282644094, + "loss": 3.319718599319458, + "step": 2039, + "token_acc": 0.26794886323207356 + }, + { + "epoch": 1.1958369979478158, + "grad_norm": 0.6551161235233398, + "learning_rate": 0.00029992141299079795, + "loss": 3.3104288578033447, + "step": 2040, + "token_acc": 0.26690501895865604 + }, + { + "epoch": 1.196423336265025, + "grad_norm": 0.6345854619264507, + "learning_rate": 0.00029992094174684534, + "loss": 3.337841033935547, + "step": 2041, + "token_acc": 0.26508669058902135 + }, + { + "epoch": 1.197009674582234, + "grad_norm": 0.605743758218586, + "learning_rate": 0.00029992046909458757, + "loss": 3.3117339611053467, + "step": 2042, + "token_acc": 0.26860436925895 + }, + { + "epoch": 1.197596012899443, + "grad_norm": 0.6815455246425074, + "learning_rate": 0.0002999199950340289, + "loss": 3.359574317932129, + "step": 2043, + "token_acc": 0.2614712308812819 + }, + { + "epoch": 1.198182351216652, + "grad_norm": 0.6085384835152693, + "learning_rate": 0.000299919519565174, + "loss": 3.288266181945801, + "step": 2044, + "token_acc": 0.2713576653036435 + }, + { + "epoch": 1.198768689533861, + "grad_norm": 0.6388341826321748, + "learning_rate": 0.00029991904268802716, + "loss": 3.3446741104125977, + "step": 2045, + "token_acc": 0.26320325363280656 + }, + { + "epoch": 1.19935502785107, + "grad_norm": 0.6225496120296988, + "learning_rate": 0.00029991856440259295, + "loss": 3.305783271789551, + "step": 2046, + "token_acc": 0.27007207788571536 + }, + { + "epoch": 1.1999413661682792, + "grad_norm": 0.6117408936972606, + "learning_rate": 0.00029991808470887586, + "loss": 3.259596586227417, + "step": 2047, + "token_acc": 0.27576484625261 + }, + { + "epoch": 1.200527704485488, + "grad_norm": 0.6212806005482442, + "learning_rate": 0.0002999176036068804, + "loss": 3.3930552005767822, + "step": 2048, + "token_acc": 0.2598445319987222 + }, + { + "epoch": 1.2011140428026972, + "grad_norm": 0.6205630689809001, + "learning_rate": 0.000299917121096611, + "loss": 3.3198723793029785, + "step": 2049, + "token_acc": 0.26617474176503886 + }, + { + "epoch": 1.201700381119906, + "grad_norm": 0.5045313452997845, + "learning_rate": 0.0002999166371780723, + "loss": 3.3389158248901367, + "step": 2050, + "token_acc": 0.2651433625332621 + }, + { + "epoch": 1.2022867194371152, + "grad_norm": 0.6515204260945489, + "learning_rate": 0.0002999161518512688, + "loss": 3.2973780632019043, + "step": 2051, + "token_acc": 0.2703869284946264 + }, + { + "epoch": 1.2028730577543243, + "grad_norm": 0.7038350642826086, + "learning_rate": 0.0002999156651162051, + "loss": 3.31166934967041, + "step": 2052, + "token_acc": 0.26836957652689625 + }, + { + "epoch": 1.2034593960715332, + "grad_norm": 0.6779333257048654, + "learning_rate": 0.0002999151769728857, + "loss": 3.2851827144622803, + "step": 2053, + "token_acc": 0.27303459285729387 + }, + { + "epoch": 1.2040457343887423, + "grad_norm": 0.6554007635654845, + "learning_rate": 0.00029991468742131527, + "loss": 3.3222532272338867, + "step": 2054, + "token_acc": 0.2669137360037849 + }, + { + "epoch": 1.2046320727059514, + "grad_norm": 0.6947073524264371, + "learning_rate": 0.00029991419646149836, + "loss": 3.258507490158081, + "step": 2055, + "token_acc": 0.2737537868355825 + }, + { + "epoch": 1.2052184110231603, + "grad_norm": 0.9867715450035401, + "learning_rate": 0.00029991370409343954, + "loss": 3.3458991050720215, + "step": 2056, + "token_acc": 0.2644667105110449 + }, + { + "epoch": 1.2058047493403694, + "grad_norm": 1.014434252127322, + "learning_rate": 0.0002999132103171435, + "loss": 3.3140206336975098, + "step": 2057, + "token_acc": 0.2679720580220517 + }, + { + "epoch": 1.2063910876575785, + "grad_norm": 0.8976812794199026, + "learning_rate": 0.0002999127151326149, + "loss": 3.374518871307373, + "step": 2058, + "token_acc": 0.26009924829384207 + }, + { + "epoch": 1.2069774259747874, + "grad_norm": 0.7563778389108587, + "learning_rate": 0.0002999122185398583, + "loss": 3.319183349609375, + "step": 2059, + "token_acc": 0.26805570631252384 + }, + { + "epoch": 1.2075637642919965, + "grad_norm": 0.6415493707011815, + "learning_rate": 0.00029991172053887844, + "loss": 3.3099513053894043, + "step": 2060, + "token_acc": 0.2680718605018229 + }, + { + "epoch": 1.2081501026092054, + "grad_norm": 0.730315208009389, + "learning_rate": 0.00029991122112968, + "loss": 3.347346305847168, + "step": 2061, + "token_acc": 0.26325385942183804 + }, + { + "epoch": 1.2087364409264145, + "grad_norm": 0.7558494702149601, + "learning_rate": 0.0002999107203122676, + "loss": 3.3253543376922607, + "step": 2062, + "token_acc": 0.26826446543544563 + }, + { + "epoch": 1.2093227792436236, + "grad_norm": 0.7042346754398479, + "learning_rate": 0.000299910218086646, + "loss": 3.3066747188568115, + "step": 2063, + "token_acc": 0.2686815020082981 + }, + { + "epoch": 1.2099091175608325, + "grad_norm": 0.6783900312066156, + "learning_rate": 0.00029990971445281994, + "loss": 3.24532413482666, + "step": 2064, + "token_acc": 0.27718448487988084 + }, + { + "epoch": 1.2104954558780416, + "grad_norm": 0.6581625381317505, + "learning_rate": 0.0002999092094107941, + "loss": 3.334491014480591, + "step": 2065, + "token_acc": 0.26478884441027545 + }, + { + "epoch": 1.2110817941952507, + "grad_norm": 0.6606948607208916, + "learning_rate": 0.0002999087029605732, + "loss": 3.343390941619873, + "step": 2066, + "token_acc": 0.26227076693719525 + }, + { + "epoch": 1.2116681325124596, + "grad_norm": 0.6933547540981225, + "learning_rate": 0.00029990819510216206, + "loss": 3.2889957427978516, + "step": 2067, + "token_acc": 0.2707499161853571 + }, + { + "epoch": 1.2122544708296688, + "grad_norm": 0.6648536542965798, + "learning_rate": 0.00029990768583556545, + "loss": 3.3179306983947754, + "step": 2068, + "token_acc": 0.2670377563584098 + }, + { + "epoch": 1.2128408091468779, + "grad_norm": 0.6622009355891474, + "learning_rate": 0.00029990717516078814, + "loss": 3.267537832260132, + "step": 2069, + "token_acc": 0.27337674834722997 + }, + { + "epoch": 1.2134271474640868, + "grad_norm": 0.5744462168261999, + "learning_rate": 0.00029990666307783495, + "loss": 3.283907890319824, + "step": 2070, + "token_acc": 0.2718030100029918 + }, + { + "epoch": 1.2140134857812959, + "grad_norm": 0.658076160083686, + "learning_rate": 0.0002999061495867106, + "loss": 3.3672618865966797, + "step": 2071, + "token_acc": 0.26128527523143646 + }, + { + "epoch": 1.2145998240985048, + "grad_norm": 0.5899163347167272, + "learning_rate": 0.00029990563468741997, + "loss": 3.342081069946289, + "step": 2072, + "token_acc": 0.26453657439598416 + }, + { + "epoch": 1.2151861624157139, + "grad_norm": 0.6603644157144087, + "learning_rate": 0.00029990511837996793, + "loss": 3.330611228942871, + "step": 2073, + "token_acc": 0.2653444757807728 + }, + { + "epoch": 1.215772500732923, + "grad_norm": 0.6987118690256372, + "learning_rate": 0.0002999046006643593, + "loss": 3.318565845489502, + "step": 2074, + "token_acc": 0.2684502602749571 + }, + { + "epoch": 1.2163588390501319, + "grad_norm": 0.7429090514141424, + "learning_rate": 0.00029990408154059896, + "loss": 3.3112151622772217, + "step": 2075, + "token_acc": 0.2694966418029221 + }, + { + "epoch": 1.216945177367341, + "grad_norm": 0.8263653609898882, + "learning_rate": 0.0002999035610086918, + "loss": 3.2740535736083984, + "step": 2076, + "token_acc": 0.27266720219740354 + }, + { + "epoch": 1.2175315156845499, + "grad_norm": 0.5784419288905609, + "learning_rate": 0.0002999030390686426, + "loss": 3.2838969230651855, + "step": 2077, + "token_acc": 0.2708504916925395 + }, + { + "epoch": 1.218117854001759, + "grad_norm": 0.5494118620339634, + "learning_rate": 0.0002999025157204564, + "loss": 3.299497127532959, + "step": 2078, + "token_acc": 0.26937650438698657 + }, + { + "epoch": 1.218704192318968, + "grad_norm": 0.7012737830317644, + "learning_rate": 0.00029990199096413805, + "loss": 3.2877395153045654, + "step": 2079, + "token_acc": 0.27112755504958824 + }, + { + "epoch": 1.219290530636177, + "grad_norm": 0.6027322829673991, + "learning_rate": 0.0002999014647996925, + "loss": 3.3021931648254395, + "step": 2080, + "token_acc": 0.26911808449310193 + }, + { + "epoch": 1.219876868953386, + "grad_norm": 0.6353499255322259, + "learning_rate": 0.0002999009372271247, + "loss": 3.288527011871338, + "step": 2081, + "token_acc": 0.27196617088599173 + }, + { + "epoch": 1.2204632072705952, + "grad_norm": 0.6948989016078879, + "learning_rate": 0.00029990040824643955, + "loss": 3.3416969776153564, + "step": 2082, + "token_acc": 0.2640719622441191 + }, + { + "epoch": 1.221049545587804, + "grad_norm": 0.5545287494977666, + "learning_rate": 0.00029989987785764206, + "loss": 3.3221025466918945, + "step": 2083, + "token_acc": 0.26650663705858285 + }, + { + "epoch": 1.2216358839050132, + "grad_norm": 0.5481319206238314, + "learning_rate": 0.0002998993460607372, + "loss": 3.2976815700531006, + "step": 2084, + "token_acc": 0.2691629015802952 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.660616497730608, + "learning_rate": 0.00029989881285573004, + "loss": 3.343980312347412, + "step": 2085, + "token_acc": 0.2617564609454103 + }, + { + "epoch": 1.2228085605394312, + "grad_norm": 0.6250957253394741, + "learning_rate": 0.00029989827824262544, + "loss": 3.300699472427368, + "step": 2086, + "token_acc": 0.2693041229655468 + }, + { + "epoch": 1.2233948988566403, + "grad_norm": 0.7360852306767541, + "learning_rate": 0.00029989774222142856, + "loss": 3.3323047161102295, + "step": 2087, + "token_acc": 0.2669919517928538 + }, + { + "epoch": 1.2239812371738492, + "grad_norm": 0.5814890530796598, + "learning_rate": 0.0002998972047921444, + "loss": 3.2863571643829346, + "step": 2088, + "token_acc": 0.2708321311853791 + }, + { + "epoch": 1.2245675754910583, + "grad_norm": 0.6940225721146759, + "learning_rate": 0.00029989666595477794, + "loss": 3.307682991027832, + "step": 2089, + "token_acc": 0.2691685352651767 + }, + { + "epoch": 1.2251539138082674, + "grad_norm": 0.680532291131526, + "learning_rate": 0.0002998961257093343, + "loss": 3.308253049850464, + "step": 2090, + "token_acc": 0.2669477857321235 + }, + { + "epoch": 1.2257402521254763, + "grad_norm": 0.6947177813801204, + "learning_rate": 0.0002998955840558186, + "loss": 3.3120388984680176, + "step": 2091, + "token_acc": 0.2693546326558857 + }, + { + "epoch": 1.2263265904426854, + "grad_norm": 0.5968647826683237, + "learning_rate": 0.0002998950409942358, + "loss": 3.290501117706299, + "step": 2092, + "token_acc": 0.26951688733753715 + }, + { + "epoch": 1.2269129287598945, + "grad_norm": 0.5968713433910895, + "learning_rate": 0.00029989449652459107, + "loss": 3.3237626552581787, + "step": 2093, + "token_acc": 0.26627131744870497 + }, + { + "epoch": 1.2274992670771034, + "grad_norm": 0.6173449887506648, + "learning_rate": 0.00029989395064688963, + "loss": 3.336862564086914, + "step": 2094, + "token_acc": 0.2635481364488165 + }, + { + "epoch": 1.2280856053943126, + "grad_norm": 0.5203168234892392, + "learning_rate": 0.00029989340336113645, + "loss": 3.3187806606292725, + "step": 2095, + "token_acc": 0.26681958605693373 + }, + { + "epoch": 1.2286719437115217, + "grad_norm": 0.5637549069752911, + "learning_rate": 0.0002998928546673367, + "loss": 3.275979995727539, + "step": 2096, + "token_acc": 0.27155464321759 + }, + { + "epoch": 1.2292582820287306, + "grad_norm": 0.5190385652435072, + "learning_rate": 0.00029989230456549555, + "loss": 3.337808132171631, + "step": 2097, + "token_acc": 0.26503801536820215 + }, + { + "epoch": 1.2298446203459397, + "grad_norm": 0.5540363970569976, + "learning_rate": 0.00029989175305561824, + "loss": 3.313361883163452, + "step": 2098, + "token_acc": 0.26775947617046136 + }, + { + "epoch": 1.2304309586631486, + "grad_norm": 0.6106092818476985, + "learning_rate": 0.0002998912001377099, + "loss": 3.3052308559417725, + "step": 2099, + "token_acc": 0.26819892095965714 + }, + { + "epoch": 1.2310172969803577, + "grad_norm": 0.5022731012521641, + "learning_rate": 0.00029989064581177567, + "loss": 3.3475341796875, + "step": 2100, + "token_acc": 0.2632253307149824 + }, + { + "epoch": 1.2316036352975668, + "grad_norm": 0.5254456969685513, + "learning_rate": 0.0002998900900778208, + "loss": 3.3345439434051514, + "step": 2101, + "token_acc": 0.2657941701368233 + }, + { + "epoch": 1.2321899736147757, + "grad_norm": 0.5776643481379248, + "learning_rate": 0.0002998895329358506, + "loss": 3.4040868282318115, + "step": 2102, + "token_acc": 0.25648854556837114 + }, + { + "epoch": 1.2327763119319848, + "grad_norm": 0.7126250083185364, + "learning_rate": 0.0002998889743858701, + "loss": 3.308574676513672, + "step": 2103, + "token_acc": 0.271233421251954 + }, + { + "epoch": 1.2333626502491937, + "grad_norm": 0.7493689898375907, + "learning_rate": 0.0002998884144278847, + "loss": 3.332040309906006, + "step": 2104, + "token_acc": 0.2659551196009431 + }, + { + "epoch": 1.2339489885664028, + "grad_norm": 0.6717319061364705, + "learning_rate": 0.00029988785306189964, + "loss": 3.3404903411865234, + "step": 2105, + "token_acc": 0.26252485571560213 + }, + { + "epoch": 1.234535326883612, + "grad_norm": 0.5721399807371714, + "learning_rate": 0.0002998872902879202, + "loss": 3.326077938079834, + "step": 2106, + "token_acc": 0.2647285406209432 + }, + { + "epoch": 1.2351216652008208, + "grad_norm": 0.7785647488256546, + "learning_rate": 0.0002998867261059516, + "loss": 3.348332405090332, + "step": 2107, + "token_acc": 0.26316347234966464 + }, + { + "epoch": 1.23570800351803, + "grad_norm": 0.7699710288909138, + "learning_rate": 0.00029988616051599917, + "loss": 3.365997552871704, + "step": 2108, + "token_acc": 0.26083881518406926 + }, + { + "epoch": 1.236294341835239, + "grad_norm": 0.5983932136814264, + "learning_rate": 0.0002998855935180683, + "loss": 3.294644832611084, + "step": 2109, + "token_acc": 0.26960612325314015 + }, + { + "epoch": 1.236880680152448, + "grad_norm": 0.5634442210006753, + "learning_rate": 0.00029988502511216425, + "loss": 3.3013739585876465, + "step": 2110, + "token_acc": 0.26949246456810777 + }, + { + "epoch": 1.237467018469657, + "grad_norm": 0.6218794532833704, + "learning_rate": 0.0002998844552982923, + "loss": 3.334766387939453, + "step": 2111, + "token_acc": 0.26451730324224537 + }, + { + "epoch": 1.2380533567868661, + "grad_norm": 0.6565069088306307, + "learning_rate": 0.0002998838840764579, + "loss": 3.270565986633301, + "step": 2112, + "token_acc": 0.2734178335559079 + }, + { + "epoch": 1.238639695104075, + "grad_norm": 0.70861216376591, + "learning_rate": 0.00029988331144666634, + "loss": 3.326977252960205, + "step": 2113, + "token_acc": 0.26604877996723814 + }, + { + "epoch": 1.2392260334212841, + "grad_norm": 0.6303386920349181, + "learning_rate": 0.0002998827374089231, + "loss": 3.306929588317871, + "step": 2114, + "token_acc": 0.26793842901214593 + }, + { + "epoch": 1.239812371738493, + "grad_norm": 0.7554077898506995, + "learning_rate": 0.0002998821619632335, + "loss": 3.3262505531311035, + "step": 2115, + "token_acc": 0.2663884511126521 + }, + { + "epoch": 1.2403987100557021, + "grad_norm": 0.7159698777971694, + "learning_rate": 0.0002998815851096029, + "loss": 3.327603816986084, + "step": 2116, + "token_acc": 0.26626987362413373 + }, + { + "epoch": 1.2409850483729112, + "grad_norm": 0.6951961866627198, + "learning_rate": 0.00029988100684803684, + "loss": 3.2882933616638184, + "step": 2117, + "token_acc": 0.2700128567755207 + }, + { + "epoch": 1.2415713866901201, + "grad_norm": 0.6730787396317341, + "learning_rate": 0.0002998804271785407, + "loss": 3.351698875427246, + "step": 2118, + "token_acc": 0.2627433005467766 + }, + { + "epoch": 1.2421577250073292, + "grad_norm": 0.5918778624078417, + "learning_rate": 0.00029987984610111985, + "loss": 3.31082820892334, + "step": 2119, + "token_acc": 0.2667638290120026 + }, + { + "epoch": 1.2427440633245384, + "grad_norm": 0.5570989689294167, + "learning_rate": 0.00029987926361577983, + "loss": 3.3346266746520996, + "step": 2120, + "token_acc": 0.26360399516591426 + }, + { + "epoch": 1.2433304016417472, + "grad_norm": 0.5726842910744052, + "learning_rate": 0.0002998786797225261, + "loss": 3.3188109397888184, + "step": 2121, + "token_acc": 0.26551651479008254 + }, + { + "epoch": 1.2439167399589564, + "grad_norm": 0.5654421812957456, + "learning_rate": 0.0002998780944213642, + "loss": 3.3154945373535156, + "step": 2122, + "token_acc": 0.26729529812825625 + }, + { + "epoch": 1.2445030782761655, + "grad_norm": 0.5035371758236616, + "learning_rate": 0.0002998775077122995, + "loss": 3.2638792991638184, + "step": 2123, + "token_acc": 0.2725763459344415 + }, + { + "epoch": 1.2450894165933744, + "grad_norm": 0.5344870894728856, + "learning_rate": 0.00029987691959533757, + "loss": 3.3320822715759277, + "step": 2124, + "token_acc": 0.2656573648055263 + }, + { + "epoch": 1.2456757549105835, + "grad_norm": 0.550705871090821, + "learning_rate": 0.00029987633007048394, + "loss": 3.335256576538086, + "step": 2125, + "token_acc": 0.26625558448750514 + }, + { + "epoch": 1.2462620932277924, + "grad_norm": 0.6969259635416923, + "learning_rate": 0.0002998757391377442, + "loss": 3.3727118968963623, + "step": 2126, + "token_acc": 0.25852920045160627 + }, + { + "epoch": 1.2468484315450015, + "grad_norm": 0.7668450938910254, + "learning_rate": 0.0002998751467971238, + "loss": 3.354109048843384, + "step": 2127, + "token_acc": 0.2606746758061422 + }, + { + "epoch": 1.2474347698622106, + "grad_norm": 0.828645404198967, + "learning_rate": 0.0002998745530486284, + "loss": 3.310776472091675, + "step": 2128, + "token_acc": 0.2677291944231717 + }, + { + "epoch": 1.2480211081794195, + "grad_norm": 0.8397830233194004, + "learning_rate": 0.00029987395789226343, + "loss": 3.284890651702881, + "step": 2129, + "token_acc": 0.27193671556716664 + }, + { + "epoch": 1.2486074464966286, + "grad_norm": 0.8319792205900353, + "learning_rate": 0.0002998733613280347, + "loss": 3.3155770301818848, + "step": 2130, + "token_acc": 0.2663592983399422 + }, + { + "epoch": 1.2491937848138375, + "grad_norm": 0.6358709918855738, + "learning_rate": 0.00029987276335594764, + "loss": 3.3107969760894775, + "step": 2131, + "token_acc": 0.26838361579233844 + }, + { + "epoch": 1.2497801231310466, + "grad_norm": 0.8187670448347658, + "learning_rate": 0.000299872163976008, + "loss": 3.327176570892334, + "step": 2132, + "token_acc": 0.265643496393001 + }, + { + "epoch": 1.2503664614482557, + "grad_norm": 0.8799754377740789, + "learning_rate": 0.00029987156318822127, + "loss": 3.3459300994873047, + "step": 2133, + "token_acc": 0.2619655418437908 + }, + { + "epoch": 1.2509527997654648, + "grad_norm": 0.8607173781795262, + "learning_rate": 0.00029987096099259316, + "loss": 3.3512959480285645, + "step": 2134, + "token_acc": 0.2622847739708617 + }, + { + "epoch": 1.2515391380826737, + "grad_norm": 0.7310525150665035, + "learning_rate": 0.0002998703573891293, + "loss": 3.311583995819092, + "step": 2135, + "token_acc": 0.2678241671033153 + }, + { + "epoch": 1.2521254763998828, + "grad_norm": 0.6104269828926102, + "learning_rate": 0.0002998697523778354, + "loss": 3.307077169418335, + "step": 2136, + "token_acc": 0.26972578460034025 + }, + { + "epoch": 1.2527118147170917, + "grad_norm": 0.6156106157703805, + "learning_rate": 0.00029986914595871714, + "loss": 3.2971267700195312, + "step": 2137, + "token_acc": 0.26959025959009825 + }, + { + "epoch": 1.2532981530343008, + "grad_norm": 0.7302402155802491, + "learning_rate": 0.00029986853813178025, + "loss": 3.3069005012512207, + "step": 2138, + "token_acc": 0.26683005227004186 + }, + { + "epoch": 1.25388449135151, + "grad_norm": 0.6563072697916883, + "learning_rate": 0.0002998679288970303, + "loss": 3.2868282794952393, + "step": 2139, + "token_acc": 0.2711505057068376 + }, + { + "epoch": 1.2544708296687188, + "grad_norm": 0.6429617948728832, + "learning_rate": 0.0002998673182544732, + "loss": 3.359733819961548, + "step": 2140, + "token_acc": 0.26204009305651266 + }, + { + "epoch": 1.255057167985928, + "grad_norm": 0.7273487865264391, + "learning_rate": 0.00029986670620411453, + "loss": 3.303636074066162, + "step": 2141, + "token_acc": 0.2696956868148483 + }, + { + "epoch": 1.2556435063031368, + "grad_norm": 0.6274279233808178, + "learning_rate": 0.00029986609274596014, + "loss": 3.3028361797332764, + "step": 2142, + "token_acc": 0.26970384555553767 + }, + { + "epoch": 1.256229844620346, + "grad_norm": 0.6306918888174531, + "learning_rate": 0.00029986547788001574, + "loss": 3.300806999206543, + "step": 2143, + "token_acc": 0.2689138383225023 + }, + { + "epoch": 1.256816182937555, + "grad_norm": 0.734844206472174, + "learning_rate": 0.00029986486160628713, + "loss": 3.381234645843506, + "step": 2144, + "token_acc": 0.26008237340921425 + }, + { + "epoch": 1.257402521254764, + "grad_norm": 0.5949168326818668, + "learning_rate": 0.0002998642439247801, + "loss": 3.3264591693878174, + "step": 2145, + "token_acc": 0.26503336622213736 + }, + { + "epoch": 1.257988859571973, + "grad_norm": 0.6565293655178087, + "learning_rate": 0.00029986362483550044, + "loss": 3.32952880859375, + "step": 2146, + "token_acc": 0.2647331726408004 + }, + { + "epoch": 1.258575197889182, + "grad_norm": 0.621292523357113, + "learning_rate": 0.00029986300433845395, + "loss": 3.257190704345703, + "step": 2147, + "token_acc": 0.27434129537484353 + }, + { + "epoch": 1.259161536206391, + "grad_norm": 0.5922717530220168, + "learning_rate": 0.00029986238243364646, + "loss": 3.345968008041382, + "step": 2148, + "token_acc": 0.26301378221506966 + }, + { + "epoch": 1.2597478745236002, + "grad_norm": 0.5224942640936433, + "learning_rate": 0.00029986175912108387, + "loss": 3.2582998275756836, + "step": 2149, + "token_acc": 0.2728868559638074 + }, + { + "epoch": 1.2603342128408093, + "grad_norm": 0.5731204151215652, + "learning_rate": 0.000299861134400772, + "loss": 3.289397716522217, + "step": 2150, + "token_acc": 0.2709508107414401 + }, + { + "epoch": 1.2609205511580182, + "grad_norm": 0.6082071246763562, + "learning_rate": 0.00029986050827271666, + "loss": 3.2928662300109863, + "step": 2151, + "token_acc": 0.26959458607957054 + }, + { + "epoch": 1.2615068894752273, + "grad_norm": 0.5060442865288772, + "learning_rate": 0.0002998598807369238, + "loss": 3.28161883354187, + "step": 2152, + "token_acc": 0.2724963848125584 + }, + { + "epoch": 1.2620932277924362, + "grad_norm": 0.5695834381928475, + "learning_rate": 0.0002998592517933993, + "loss": 3.289191246032715, + "step": 2153, + "token_acc": 0.270518978365867 + }, + { + "epoch": 1.2626795661096453, + "grad_norm": 0.5509504992003815, + "learning_rate": 0.0002998586214421491, + "loss": 3.3023438453674316, + "step": 2154, + "token_acc": 0.26857629581654063 + }, + { + "epoch": 1.2632659044268544, + "grad_norm": 0.5772255455075851, + "learning_rate": 0.0002998579896831791, + "loss": 3.2766575813293457, + "step": 2155, + "token_acc": 0.2724554690592426 + }, + { + "epoch": 1.2638522427440633, + "grad_norm": 0.5637676735137469, + "learning_rate": 0.00029985735651649514, + "loss": 3.2982006072998047, + "step": 2156, + "token_acc": 0.26910432723269 + }, + { + "epoch": 1.2644385810612724, + "grad_norm": 0.5368074090825047, + "learning_rate": 0.0002998567219421033, + "loss": 3.3654940128326416, + "step": 2157, + "token_acc": 0.25963998813782174 + }, + { + "epoch": 1.2650249193784813, + "grad_norm": 0.5540173743536623, + "learning_rate": 0.00029985608596000955, + "loss": 3.3347549438476562, + "step": 2158, + "token_acc": 0.2637975215842937 + }, + { + "epoch": 1.2656112576956904, + "grad_norm": 0.5834929108363435, + "learning_rate": 0.00029985544857021973, + "loss": 3.280578136444092, + "step": 2159, + "token_acc": 0.2717119415351722 + }, + { + "epoch": 1.2661975960128995, + "grad_norm": 0.5646854900649066, + "learning_rate": 0.00029985480977273997, + "loss": 3.2801413536071777, + "step": 2160, + "token_acc": 0.2709653277562486 + }, + { + "epoch": 1.2667839343301086, + "grad_norm": 0.5810387304476278, + "learning_rate": 0.00029985416956757613, + "loss": 3.297654151916504, + "step": 2161, + "token_acc": 0.2692621505658639 + }, + { + "epoch": 1.2673702726473175, + "grad_norm": 0.5157863683401149, + "learning_rate": 0.0002998535279547343, + "loss": 3.268155336380005, + "step": 2162, + "token_acc": 0.27402030403450633 + }, + { + "epoch": 1.2679566109645266, + "grad_norm": 0.5262582893830188, + "learning_rate": 0.00029985288493422055, + "loss": 3.33351469039917, + "step": 2163, + "token_acc": 0.2630807953017006 + }, + { + "epoch": 1.2685429492817355, + "grad_norm": 0.5693554269915471, + "learning_rate": 0.00029985224050604084, + "loss": 3.2613868713378906, + "step": 2164, + "token_acc": 0.272203195903776 + }, + { + "epoch": 1.2691292875989446, + "grad_norm": 0.5628776134524188, + "learning_rate": 0.0002998515946702013, + "loss": 3.261300563812256, + "step": 2165, + "token_acc": 0.27635754927622497 + }, + { + "epoch": 1.2697156259161537, + "grad_norm": 0.5315611500611698, + "learning_rate": 0.00029985094742670794, + "loss": 3.29467511177063, + "step": 2166, + "token_acc": 0.2689114566322074 + }, + { + "epoch": 1.2703019642333626, + "grad_norm": 0.6780690989771148, + "learning_rate": 0.0002998502987755668, + "loss": 3.312556743621826, + "step": 2167, + "token_acc": 0.2678858355306526 + }, + { + "epoch": 1.2708883025505717, + "grad_norm": 0.7336877112335726, + "learning_rate": 0.0002998496487167841, + "loss": 3.3274106979370117, + "step": 2168, + "token_acc": 0.2637095072144464 + }, + { + "epoch": 1.2714746408677806, + "grad_norm": 0.7929841867017643, + "learning_rate": 0.00029984899725036586, + "loss": 3.3308005332946777, + "step": 2169, + "token_acc": 0.26378217565644807 + }, + { + "epoch": 1.2720609791849897, + "grad_norm": 0.7609426399054392, + "learning_rate": 0.0002998483443763182, + "loss": 3.307753086090088, + "step": 2170, + "token_acc": 0.2681502766484272 + }, + { + "epoch": 1.2726473175021988, + "grad_norm": 0.670693378394733, + "learning_rate": 0.0002998476900946473, + "loss": 3.342557668685913, + "step": 2171, + "token_acc": 0.2627241317916868 + }, + { + "epoch": 1.2732336558194077, + "grad_norm": 0.7124681031691964, + "learning_rate": 0.0002998470344053592, + "loss": 3.3329200744628906, + "step": 2172, + "token_acc": 0.26445544732905407 + }, + { + "epoch": 1.2738199941366168, + "grad_norm": 0.8109197732369587, + "learning_rate": 0.0002998463773084602, + "loss": 3.3145103454589844, + "step": 2173, + "token_acc": 0.26800017507769075 + }, + { + "epoch": 1.2744063324538257, + "grad_norm": 0.7220782542966019, + "learning_rate": 0.0002998457188039564, + "loss": 3.304135799407959, + "step": 2174, + "token_acc": 0.26850617613080957 + }, + { + "epoch": 1.2749926707710348, + "grad_norm": 0.5695688236095846, + "learning_rate": 0.00029984505889185393, + "loss": 3.3218445777893066, + "step": 2175, + "token_acc": 0.265615223517853 + }, + { + "epoch": 1.275579009088244, + "grad_norm": 0.5632506471567461, + "learning_rate": 0.0002998443975721591, + "loss": 3.31032657623291, + "step": 2176, + "token_acc": 0.26712138626345544 + }, + { + "epoch": 1.276165347405453, + "grad_norm": 0.5493900285684672, + "learning_rate": 0.0002998437348448781, + "loss": 3.300459146499634, + "step": 2177, + "token_acc": 0.2686587265487423 + }, + { + "epoch": 1.276751685722662, + "grad_norm": 0.49386124711919854, + "learning_rate": 0.0002998430707100171, + "loss": 3.303426504135132, + "step": 2178, + "token_acc": 0.2671934050417376 + }, + { + "epoch": 1.277338024039871, + "grad_norm": 0.5057559180714568, + "learning_rate": 0.00029984240516758235, + "loss": 3.2868337631225586, + "step": 2179, + "token_acc": 0.2698031723822465 + }, + { + "epoch": 1.27792436235708, + "grad_norm": 0.5357431217413532, + "learning_rate": 0.0002998417382175802, + "loss": 3.3203537464141846, + "step": 2180, + "token_acc": 0.26492578571315745 + }, + { + "epoch": 1.278510700674289, + "grad_norm": 0.6264817217942751, + "learning_rate": 0.0002998410698600167, + "loss": 3.31192684173584, + "step": 2181, + "token_acc": 0.2670937979174721 + }, + { + "epoch": 1.2790970389914982, + "grad_norm": 0.6113179654224454, + "learning_rate": 0.00029984040009489835, + "loss": 3.338350296020508, + "step": 2182, + "token_acc": 0.26445690526793636 + }, + { + "epoch": 1.279683377308707, + "grad_norm": 0.6258366014006113, + "learning_rate": 0.00029983972892223137, + "loss": 3.2962443828582764, + "step": 2183, + "token_acc": 0.2697837823874351 + }, + { + "epoch": 1.2802697156259162, + "grad_norm": 0.5354736116305405, + "learning_rate": 0.00029983905634202196, + "loss": 3.266925811767578, + "step": 2184, + "token_acc": 0.27151789719703806 + }, + { + "epoch": 1.280856053943125, + "grad_norm": 0.6242763361588439, + "learning_rate": 0.00029983838235427663, + "loss": 3.326183557510376, + "step": 2185, + "token_acc": 0.26395035729367183 + }, + { + "epoch": 1.2814423922603342, + "grad_norm": 0.6884774051788985, + "learning_rate": 0.0002998377069590015, + "loss": 3.266263484954834, + "step": 2186, + "token_acc": 0.27291413885433147 + }, + { + "epoch": 1.2820287305775433, + "grad_norm": 0.7406855818729945, + "learning_rate": 0.0002998370301562031, + "loss": 3.340160846710205, + "step": 2187, + "token_acc": 0.2630227821000322 + }, + { + "epoch": 1.2826150688947524, + "grad_norm": 0.8701427670352354, + "learning_rate": 0.0002998363519458877, + "loss": 3.3322606086730957, + "step": 2188, + "token_acc": 0.26376326131152666 + }, + { + "epoch": 1.2832014072119613, + "grad_norm": 0.7544234078823233, + "learning_rate": 0.00029983567232806164, + "loss": 3.2814273834228516, + "step": 2189, + "token_acc": 0.2704299604478318 + }, + { + "epoch": 1.2837877455291704, + "grad_norm": 0.6431713594079861, + "learning_rate": 0.0002998349913027314, + "loss": 3.301873207092285, + "step": 2190, + "token_acc": 0.2690658332514611 + }, + { + "epoch": 1.2843740838463793, + "grad_norm": 0.6027721183581168, + "learning_rate": 0.00029983430886990325, + "loss": 3.302947521209717, + "step": 2191, + "token_acc": 0.2693317864681523 + }, + { + "epoch": 1.2849604221635884, + "grad_norm": 0.6557131651669685, + "learning_rate": 0.00029983362502958375, + "loss": 3.30894136428833, + "step": 2192, + "token_acc": 0.2691040586622573 + }, + { + "epoch": 1.2855467604807975, + "grad_norm": 0.7842394522723533, + "learning_rate": 0.0002998329397817792, + "loss": 3.342106580734253, + "step": 2193, + "token_acc": 0.26160489929919817 + }, + { + "epoch": 1.2861330987980064, + "grad_norm": 0.6925606265064673, + "learning_rate": 0.0002998322531264961, + "loss": 3.2745885848999023, + "step": 2194, + "token_acc": 0.2721911261384877 + }, + { + "epoch": 1.2867194371152155, + "grad_norm": 0.6734126567631493, + "learning_rate": 0.0002998315650637409, + "loss": 3.3048202991485596, + "step": 2195, + "token_acc": 0.2682049500296348 + }, + { + "epoch": 1.2873057754324244, + "grad_norm": 0.6690636141458522, + "learning_rate": 0.00029983087559352, + "loss": 3.375196933746338, + "step": 2196, + "token_acc": 0.2598232836366127 + }, + { + "epoch": 1.2878921137496335, + "grad_norm": 0.6019589446764615, + "learning_rate": 0.00029983018471583996, + "loss": 3.290217876434326, + "step": 2197, + "token_acc": 0.2707252430754147 + }, + { + "epoch": 1.2884784520668426, + "grad_norm": 0.49823082026053717, + "learning_rate": 0.0002998294924307072, + "loss": 3.3269500732421875, + "step": 2198, + "token_acc": 0.26613234798009616 + }, + { + "epoch": 1.2890647903840515, + "grad_norm": 0.5598927830961332, + "learning_rate": 0.0002998287987381283, + "loss": 3.3405699729919434, + "step": 2199, + "token_acc": 0.26544866571603176 + }, + { + "epoch": 1.2896511287012606, + "grad_norm": 0.6150140558428371, + "learning_rate": 0.00029982810363810973, + "loss": 3.242016315460205, + "step": 2200, + "token_acc": 0.2765774777684134 + }, + { + "epoch": 1.2902374670184695, + "grad_norm": 0.5821702231390028, + "learning_rate": 0.00029982740713065803, + "loss": 3.3304026126861572, + "step": 2201, + "token_acc": 0.26573595847740283 + }, + { + "epoch": 1.2908238053356786, + "grad_norm": 0.50217474958973, + "learning_rate": 0.0002998267092157797, + "loss": 3.270297050476074, + "step": 2202, + "token_acc": 0.2706662825294923 + }, + { + "epoch": 1.2914101436528878, + "grad_norm": 0.5219864888913429, + "learning_rate": 0.0002998260098934813, + "loss": 3.229401111602783, + "step": 2203, + "token_acc": 0.2787177716390424 + }, + { + "epoch": 1.2919964819700969, + "grad_norm": 0.47728938688848777, + "learning_rate": 0.0002998253091637695, + "loss": 3.327571153640747, + "step": 2204, + "token_acc": 0.2667041876046901 + }, + { + "epoch": 1.2925828202873058, + "grad_norm": 0.6053655829327786, + "learning_rate": 0.00029982460702665075, + "loss": 3.2703559398651123, + "step": 2205, + "token_acc": 0.2750776633646159 + }, + { + "epoch": 1.2931691586045149, + "grad_norm": 0.5543560292064499, + "learning_rate": 0.0002998239034821318, + "loss": 3.2333743572235107, + "step": 2206, + "token_acc": 0.2766598417471794 + }, + { + "epoch": 1.2937554969217238, + "grad_norm": 0.6200252388513652, + "learning_rate": 0.00029982319853021907, + "loss": 3.3022828102111816, + "step": 2207, + "token_acc": 0.26804412565243846 + }, + { + "epoch": 1.2943418352389329, + "grad_norm": 0.6137523939801273, + "learning_rate": 0.00029982249217091935, + "loss": 3.2986888885498047, + "step": 2208, + "token_acc": 0.2690404348293827 + }, + { + "epoch": 1.294928173556142, + "grad_norm": 0.5815537682012434, + "learning_rate": 0.0002998217844042392, + "loss": 3.3160715103149414, + "step": 2209, + "token_acc": 0.2676236498010233 + }, + { + "epoch": 1.2955145118733509, + "grad_norm": 0.5360001246244428, + "learning_rate": 0.00029982107523018523, + "loss": 3.304793357849121, + "step": 2210, + "token_acc": 0.2670159527002698 + }, + { + "epoch": 1.29610085019056, + "grad_norm": 0.5506234846767539, + "learning_rate": 0.0002998203646487641, + "loss": 3.326186418533325, + "step": 2211, + "token_acc": 0.26720169951611 + }, + { + "epoch": 1.2966871885077689, + "grad_norm": 0.6450482166492333, + "learning_rate": 0.0002998196526599826, + "loss": 3.287656784057617, + "step": 2212, + "token_acc": 0.27086998553684066 + }, + { + "epoch": 1.297273526824978, + "grad_norm": 0.5588678446519576, + "learning_rate": 0.00029981893926384734, + "loss": 3.283034086227417, + "step": 2213, + "token_acc": 0.27199671424807426 + }, + { + "epoch": 1.297859865142187, + "grad_norm": 0.506091078079632, + "learning_rate": 0.00029981822446036504, + "loss": 3.298727512359619, + "step": 2214, + "token_acc": 0.2684945198066332 + }, + { + "epoch": 1.2984462034593962, + "grad_norm": 0.5701354761565592, + "learning_rate": 0.00029981750824954235, + "loss": 3.315445899963379, + "step": 2215, + "token_acc": 0.2670569210002272 + }, + { + "epoch": 1.299032541776605, + "grad_norm": 0.565753216094878, + "learning_rate": 0.0002998167906313861, + "loss": 3.2935709953308105, + "step": 2216, + "token_acc": 0.27084292473319654 + }, + { + "epoch": 1.2996188800938142, + "grad_norm": 0.5214210663432226, + "learning_rate": 0.00029981607160590296, + "loss": 3.3164734840393066, + "step": 2217, + "token_acc": 0.2651342564656032 + }, + { + "epoch": 1.300205218411023, + "grad_norm": 0.6998020169228616, + "learning_rate": 0.00029981535117309976, + "loss": 3.308281898498535, + "step": 2218, + "token_acc": 0.2670380981598282 + }, + { + "epoch": 1.3007915567282322, + "grad_norm": 0.6891429930221858, + "learning_rate": 0.00029981462933298316, + "loss": 3.268507957458496, + "step": 2219, + "token_acc": 0.27166720579672643 + }, + { + "epoch": 1.3013778950454413, + "grad_norm": 0.6163740036075614, + "learning_rate": 0.00029981390608556005, + "loss": 3.270542621612549, + "step": 2220, + "token_acc": 0.27115693944444297 + }, + { + "epoch": 1.3019642333626502, + "grad_norm": 0.46880773007916293, + "learning_rate": 0.0002998131814308371, + "loss": 3.3270061016082764, + "step": 2221, + "token_acc": 0.26720261579919397 + }, + { + "epoch": 1.3025505716798593, + "grad_norm": 0.5666034478188257, + "learning_rate": 0.0002998124553688212, + "loss": 3.2751564979553223, + "step": 2222, + "token_acc": 0.27265387243028383 + }, + { + "epoch": 1.3031369099970682, + "grad_norm": 0.5425295643701308, + "learning_rate": 0.0002998117278995192, + "loss": 3.2680187225341797, + "step": 2223, + "token_acc": 0.27294542354507517 + }, + { + "epoch": 1.3037232483142773, + "grad_norm": 0.5345252467687768, + "learning_rate": 0.00029981099902293785, + "loss": 3.282696008682251, + "step": 2224, + "token_acc": 0.2712544818053268 + }, + { + "epoch": 1.3043095866314864, + "grad_norm": 0.5976044303423198, + "learning_rate": 0.00029981026873908406, + "loss": 3.2881011962890625, + "step": 2225, + "token_acc": 0.2695382115199115 + }, + { + "epoch": 1.3048959249486953, + "grad_norm": 0.6646242809482138, + "learning_rate": 0.00029980953704796464, + "loss": 3.3530287742614746, + "step": 2226, + "token_acc": 0.2618140612067457 + }, + { + "epoch": 1.3054822632659044, + "grad_norm": 0.7626656999327691, + "learning_rate": 0.0002998088039495865, + "loss": 3.312908172607422, + "step": 2227, + "token_acc": 0.26741101721053606 + }, + { + "epoch": 1.3060686015831133, + "grad_norm": 0.7075294236327936, + "learning_rate": 0.0002998080694439566, + "loss": 3.2757482528686523, + "step": 2228, + "token_acc": 0.27232796119856223 + }, + { + "epoch": 1.3066549399003224, + "grad_norm": 0.6467313184595685, + "learning_rate": 0.00029980733353108163, + "loss": 3.259946346282959, + "step": 2229, + "token_acc": 0.2749463767074785 + }, + { + "epoch": 1.3072412782175316, + "grad_norm": 0.5313339026228017, + "learning_rate": 0.0002998065962109687, + "loss": 3.274688482284546, + "step": 2230, + "token_acc": 0.2707252904584371 + }, + { + "epoch": 1.3078276165347407, + "grad_norm": 0.5687353383839434, + "learning_rate": 0.0002998058574836246, + "loss": 3.2825818061828613, + "step": 2231, + "token_acc": 0.272266373274864 + }, + { + "epoch": 1.3084139548519496, + "grad_norm": 0.6345212048483689, + "learning_rate": 0.0002998051173490564, + "loss": 3.3325624465942383, + "step": 2232, + "token_acc": 0.26486385848219296 + }, + { + "epoch": 1.3090002931691587, + "grad_norm": 0.5905731563190346, + "learning_rate": 0.00029980437580727097, + "loss": 3.298100233078003, + "step": 2233, + "token_acc": 0.2694881684002884 + }, + { + "epoch": 1.3095866314863676, + "grad_norm": 0.6631967215698978, + "learning_rate": 0.00029980363285827524, + "loss": 3.282688617706299, + "step": 2234, + "token_acc": 0.2703405792626444 + }, + { + "epoch": 1.3101729698035767, + "grad_norm": 0.6697413420367967, + "learning_rate": 0.00029980288850207633, + "loss": 3.307828426361084, + "step": 2235, + "token_acc": 0.26742929706570484 + }, + { + "epoch": 1.3107593081207858, + "grad_norm": 0.6421058698742232, + "learning_rate": 0.0002998021427386811, + "loss": 3.334409713745117, + "step": 2236, + "token_acc": 0.26487303204272716 + }, + { + "epoch": 1.3113456464379947, + "grad_norm": 0.6595973161924285, + "learning_rate": 0.0002998013955680966, + "loss": 3.331205129623413, + "step": 2237, + "token_acc": 0.2625318072819041 + }, + { + "epoch": 1.3119319847552038, + "grad_norm": 0.7495110331718255, + "learning_rate": 0.00029980064699032993, + "loss": 3.3111205101013184, + "step": 2238, + "token_acc": 0.26595421879158904 + }, + { + "epoch": 1.3125183230724127, + "grad_norm": 0.6875944127587434, + "learning_rate": 0.00029979989700538794, + "loss": 3.328988552093506, + "step": 2239, + "token_acc": 0.264067124763369 + }, + { + "epoch": 1.3131046613896218, + "grad_norm": 0.5537285182729828, + "learning_rate": 0.0002997991456132778, + "loss": 3.2696051597595215, + "step": 2240, + "token_acc": 0.2711926762707038 + }, + { + "epoch": 1.313690999706831, + "grad_norm": 0.5038321969219325, + "learning_rate": 0.0002997983928140065, + "loss": 3.243635654449463, + "step": 2241, + "token_acc": 0.27540068579160754 + }, + { + "epoch": 1.31427733802404, + "grad_norm": 0.560099986162438, + "learning_rate": 0.00029979763860758123, + "loss": 3.3241019248962402, + "step": 2242, + "token_acc": 0.2646147170818195 + }, + { + "epoch": 1.314863676341249, + "grad_norm": 0.4985661792619478, + "learning_rate": 0.000299796882994009, + "loss": 3.276165008544922, + "step": 2243, + "token_acc": 0.27285826459326656 + }, + { + "epoch": 1.315450014658458, + "grad_norm": 0.541273585665269, + "learning_rate": 0.0002997961259732968, + "loss": 3.294405698776245, + "step": 2244, + "token_acc": 0.2686876611005267 + }, + { + "epoch": 1.316036352975667, + "grad_norm": 0.5535245116614177, + "learning_rate": 0.00029979536754545197, + "loss": 3.253227949142456, + "step": 2245, + "token_acc": 0.2758418513501213 + }, + { + "epoch": 1.316622691292876, + "grad_norm": 0.6179274535339254, + "learning_rate": 0.0002997946077104815, + "loss": 3.2938966751098633, + "step": 2246, + "token_acc": 0.2703017518349082 + }, + { + "epoch": 1.3172090296100851, + "grad_norm": 0.5951189275555914, + "learning_rate": 0.00029979384646839247, + "loss": 3.2918429374694824, + "step": 2247, + "token_acc": 0.27033843980167194 + }, + { + "epoch": 1.317795367927294, + "grad_norm": 0.5024457090964609, + "learning_rate": 0.00029979308381919217, + "loss": 3.294224977493286, + "step": 2248, + "token_acc": 0.2694072544057162 + }, + { + "epoch": 1.3183817062445031, + "grad_norm": 0.5121211423651184, + "learning_rate": 0.00029979231976288767, + "loss": 3.2594432830810547, + "step": 2249, + "token_acc": 0.272581892813331 + }, + { + "epoch": 1.318968044561712, + "grad_norm": 0.6351375919838699, + "learning_rate": 0.0002997915542994862, + "loss": 3.2962005138397217, + "step": 2250, + "token_acc": 0.2674360636767293 + }, + { + "epoch": 1.3195543828789211, + "grad_norm": 0.6626472556212978, + "learning_rate": 0.00029979078742899487, + "loss": 3.227412223815918, + "step": 2251, + "token_acc": 0.2785516448274187 + }, + { + "epoch": 1.3201407211961302, + "grad_norm": 0.6226872992898699, + "learning_rate": 0.000299790019151421, + "loss": 3.343895435333252, + "step": 2252, + "token_acc": 0.26338035186394887 + }, + { + "epoch": 1.3207270595133391, + "grad_norm": 0.529315840672225, + "learning_rate": 0.0002997892494667717, + "loss": 3.3050756454467773, + "step": 2253, + "token_acc": 0.2683314780718646 + }, + { + "epoch": 1.3213133978305482, + "grad_norm": 0.4544086896748536, + "learning_rate": 0.00029978847837505425, + "loss": 3.3003289699554443, + "step": 2254, + "token_acc": 0.26957124667498705 + }, + { + "epoch": 1.3218997361477571, + "grad_norm": 0.5899682285454962, + "learning_rate": 0.00029978770587627587, + "loss": 3.3053488731384277, + "step": 2255, + "token_acc": 0.26736030324815374 + }, + { + "epoch": 1.3224860744649662, + "grad_norm": 0.6577105034602148, + "learning_rate": 0.00029978693197044387, + "loss": 3.3156371116638184, + "step": 2256, + "token_acc": 0.267959370174683 + }, + { + "epoch": 1.3230724127821754, + "grad_norm": 0.5921161976960245, + "learning_rate": 0.00029978615665756547, + "loss": 3.2383341789245605, + "step": 2257, + "token_acc": 0.2739855408900975 + }, + { + "epoch": 1.3236587510993845, + "grad_norm": 0.5663747775677623, + "learning_rate": 0.00029978537993764797, + "loss": 3.2610905170440674, + "step": 2258, + "token_acc": 0.27425446710973833 + }, + { + "epoch": 1.3242450894165934, + "grad_norm": 0.5432351136244487, + "learning_rate": 0.0002997846018106987, + "loss": 3.3185648918151855, + "step": 2259, + "token_acc": 0.2668741444768718 + }, + { + "epoch": 1.3248314277338025, + "grad_norm": 0.5823335899973447, + "learning_rate": 0.00029978382227672487, + "loss": 3.2780375480651855, + "step": 2260, + "token_acc": 0.2708189737822669 + }, + { + "epoch": 1.3254177660510114, + "grad_norm": 0.5675378446185119, + "learning_rate": 0.0002997830413357339, + "loss": 3.320552349090576, + "step": 2261, + "token_acc": 0.265323481784585 + }, + { + "epoch": 1.3260041043682205, + "grad_norm": 0.5865634430756527, + "learning_rate": 0.00029978225898773307, + "loss": 3.2714736461639404, + "step": 2262, + "token_acc": 0.27051810035013035 + }, + { + "epoch": 1.3265904426854296, + "grad_norm": 0.6176964931750497, + "learning_rate": 0.0002997814752327298, + "loss": 3.268202304840088, + "step": 2263, + "token_acc": 0.27317149897761234 + }, + { + "epoch": 1.3271767810026385, + "grad_norm": 0.5047083333210624, + "learning_rate": 0.00029978069007073133, + "loss": 3.2561774253845215, + "step": 2264, + "token_acc": 0.2755039722930905 + }, + { + "epoch": 1.3277631193198476, + "grad_norm": 0.588378636125102, + "learning_rate": 0.00029977990350174517, + "loss": 3.3012447357177734, + "step": 2265, + "token_acc": 0.26793030979583815 + }, + { + "epoch": 1.3283494576370565, + "grad_norm": 0.743236948460754, + "learning_rate": 0.00029977911552577863, + "loss": 3.2998924255371094, + "step": 2266, + "token_acc": 0.26817500695626867 + }, + { + "epoch": 1.3289357959542656, + "grad_norm": 0.6282502429321196, + "learning_rate": 0.00029977832614283914, + "loss": 3.278174877166748, + "step": 2267, + "token_acc": 0.2730944084780009 + }, + { + "epoch": 1.3295221342714747, + "grad_norm": 0.5959214202056641, + "learning_rate": 0.0002997775353529341, + "loss": 3.267796039581299, + "step": 2268, + "token_acc": 0.27000682980889423 + }, + { + "epoch": 1.3301084725886836, + "grad_norm": 0.6101010501919545, + "learning_rate": 0.00029977674315607094, + "loss": 3.326608180999756, + "step": 2269, + "token_acc": 0.26466105136576284 + }, + { + "epoch": 1.3306948109058927, + "grad_norm": 0.5333506217601538, + "learning_rate": 0.0002997759495522571, + "loss": 3.256826400756836, + "step": 2270, + "token_acc": 0.27342956359489995 + }, + { + "epoch": 1.3312811492231018, + "grad_norm": 0.5751998940429687, + "learning_rate": 0.00029977515454150005, + "loss": 3.280289649963379, + "step": 2271, + "token_acc": 0.26893647421556277 + }, + { + "epoch": 1.3318674875403107, + "grad_norm": 0.5674708798246865, + "learning_rate": 0.0002997743581238072, + "loss": 3.2676568031311035, + "step": 2272, + "token_acc": 0.2717521335569251 + }, + { + "epoch": 1.3324538258575198, + "grad_norm": 0.5209257182537923, + "learning_rate": 0.00029977356029918615, + "loss": 3.300467014312744, + "step": 2273, + "token_acc": 0.2677439458457302 + }, + { + "epoch": 1.333040164174729, + "grad_norm": 0.5370827990424959, + "learning_rate": 0.00029977276106764425, + "loss": 3.3089516162872314, + "step": 2274, + "token_acc": 0.2687172844075504 + }, + { + "epoch": 1.3336265024919378, + "grad_norm": 0.5117476927981967, + "learning_rate": 0.00029977196042918914, + "loss": 3.3296666145324707, + "step": 2275, + "token_acc": 0.2638010496518057 + }, + { + "epoch": 1.334212840809147, + "grad_norm": 0.5240396460556052, + "learning_rate": 0.00029977115838382824, + "loss": 3.3252065181732178, + "step": 2276, + "token_acc": 0.26566904884115766 + }, + { + "epoch": 1.3347991791263558, + "grad_norm": 0.5433609453309353, + "learning_rate": 0.00029977035493156915, + "loss": 3.2982425689697266, + "step": 2277, + "token_acc": 0.2677894071368362 + }, + { + "epoch": 1.335385517443565, + "grad_norm": 0.5132947145326135, + "learning_rate": 0.0002997695500724194, + "loss": 3.21835994720459, + "step": 2278, + "token_acc": 0.27913682908391174 + }, + { + "epoch": 1.335971855760774, + "grad_norm": 0.5682318840731981, + "learning_rate": 0.00029976874380638655, + "loss": 3.3052456378936768, + "step": 2279, + "token_acc": 0.26642364995259366 + }, + { + "epoch": 1.336558194077983, + "grad_norm": 0.5318389133635304, + "learning_rate": 0.0002997679361334781, + "loss": 3.2847137451171875, + "step": 2280, + "token_acc": 0.2714304702661031 + }, + { + "epoch": 1.337144532395192, + "grad_norm": 0.6270022632832412, + "learning_rate": 0.0002997671270537018, + "loss": 3.3030083179473877, + "step": 2281, + "token_acc": 0.2691590412104362 + }, + { + "epoch": 1.337730870712401, + "grad_norm": 0.70998324144504, + "learning_rate": 0.0002997663165670651, + "loss": 3.225252628326416, + "step": 2282, + "token_acc": 0.2787204019371605 + }, + { + "epoch": 1.33831720902961, + "grad_norm": 0.8485721915014303, + "learning_rate": 0.0002997655046735757, + "loss": 3.262086868286133, + "step": 2283, + "token_acc": 0.27217654694937254 + }, + { + "epoch": 1.3389035473468192, + "grad_norm": 0.8247834873770069, + "learning_rate": 0.00029976469137324115, + "loss": 3.30789852142334, + "step": 2284, + "token_acc": 0.2660406803940995 + }, + { + "epoch": 1.3394898856640283, + "grad_norm": 0.5625533896064778, + "learning_rate": 0.0002997638766660692, + "loss": 3.29653263092041, + "step": 2285, + "token_acc": 0.26793905552334746 + }, + { + "epoch": 1.3400762239812372, + "grad_norm": 0.5372431669185745, + "learning_rate": 0.00029976306055206736, + "loss": 3.3913815021514893, + "step": 2286, + "token_acc": 0.25672483877416535 + }, + { + "epoch": 1.3406625622984463, + "grad_norm": 0.6821846701498606, + "learning_rate": 0.0002997622430312434, + "loss": 3.2478461265563965, + "step": 2287, + "token_acc": 0.27491543099404936 + }, + { + "epoch": 1.3412489006156552, + "grad_norm": 0.6297146434549384, + "learning_rate": 0.00029976142410360505, + "loss": 3.3208537101745605, + "step": 2288, + "token_acc": 0.2639167947051176 + }, + { + "epoch": 1.3418352389328643, + "grad_norm": 0.5580286324362485, + "learning_rate": 0.0002997606037691599, + "loss": 3.3070850372314453, + "step": 2289, + "token_acc": 0.2686028368980932 + }, + { + "epoch": 1.3424215772500734, + "grad_norm": 0.5895264944518573, + "learning_rate": 0.0002997597820279156, + "loss": 3.278052806854248, + "step": 2290, + "token_acc": 0.2703500987619902 + }, + { + "epoch": 1.3430079155672823, + "grad_norm": 0.5111750332848601, + "learning_rate": 0.00029975895887987997, + "loss": 3.281803607940674, + "step": 2291, + "token_acc": 0.2708238112499803 + }, + { + "epoch": 1.3435942538844914, + "grad_norm": 0.6031842677870055, + "learning_rate": 0.0002997581343250608, + "loss": 3.2881124019622803, + "step": 2292, + "token_acc": 0.268021370745733 + }, + { + "epoch": 1.3441805922017003, + "grad_norm": 0.6392491756576695, + "learning_rate": 0.00029975730836346567, + "loss": 3.310147762298584, + "step": 2293, + "token_acc": 0.26805406132329207 + }, + { + "epoch": 1.3447669305189094, + "grad_norm": 0.5598359262732455, + "learning_rate": 0.0002997564809951025, + "loss": 3.336435317993164, + "step": 2294, + "token_acc": 0.26340598981112007 + }, + { + "epoch": 1.3453532688361185, + "grad_norm": 0.5837609017456445, + "learning_rate": 0.00029975565221997894, + "loss": 3.22463321685791, + "step": 2295, + "token_acc": 0.2779614562411489 + }, + { + "epoch": 1.3459396071533274, + "grad_norm": 0.6692356965286059, + "learning_rate": 0.0002997548220381029, + "loss": 3.294100761413574, + "step": 2296, + "token_acc": 0.2688499578845879 + }, + { + "epoch": 1.3465259454705365, + "grad_norm": 0.5471252942064326, + "learning_rate": 0.00029975399044948197, + "loss": 3.299065589904785, + "step": 2297, + "token_acc": 0.26853766617429836 + }, + { + "epoch": 1.3471122837877456, + "grad_norm": 0.5528774624095585, + "learning_rate": 0.0002997531574541242, + "loss": 3.2783710956573486, + "step": 2298, + "token_acc": 0.27027325073058533 + }, + { + "epoch": 1.3476986221049545, + "grad_norm": 0.5230929152248286, + "learning_rate": 0.0002997523230520373, + "loss": 3.3362293243408203, + "step": 2299, + "token_acc": 0.2632044637859116 + }, + { + "epoch": 1.3482849604221636, + "grad_norm": 0.515685632881243, + "learning_rate": 0.0002997514872432291, + "loss": 3.2717537879943848, + "step": 2300, + "token_acc": 0.27259666650907965 + }, + { + "epoch": 1.3488712987393727, + "grad_norm": 0.5154966451679686, + "learning_rate": 0.0002997506500277075, + "loss": 3.2458300590515137, + "step": 2301, + "token_acc": 0.2752459647033405 + }, + { + "epoch": 1.3494576370565816, + "grad_norm": 0.50216030163088, + "learning_rate": 0.0002997498114054803, + "loss": 3.2456283569335938, + "step": 2302, + "token_acc": 0.27291917246404374 + }, + { + "epoch": 1.3500439753737907, + "grad_norm": 0.5399919049371538, + "learning_rate": 0.00029974897137655544, + "loss": 3.27744460105896, + "step": 2303, + "token_acc": 0.2716898248868578 + }, + { + "epoch": 1.3506303136909996, + "grad_norm": 0.5884172704660784, + "learning_rate": 0.00029974812994094073, + "loss": 3.263371229171753, + "step": 2304, + "token_acc": 0.27181613916596753 + }, + { + "epoch": 1.3512166520082087, + "grad_norm": 0.5957021930861959, + "learning_rate": 0.0002997472870986442, + "loss": 3.3235244750976562, + "step": 2305, + "token_acc": 0.2657339947294993 + }, + { + "epoch": 1.3518029903254178, + "grad_norm": 0.5954054239633952, + "learning_rate": 0.00029974644284967364, + "loss": 3.2575554847717285, + "step": 2306, + "token_acc": 0.2736577356130108 + }, + { + "epoch": 1.3523893286426267, + "grad_norm": 0.5734337005440051, + "learning_rate": 0.00029974559719403703, + "loss": 3.283459186553955, + "step": 2307, + "token_acc": 0.2715370737687761 + }, + { + "epoch": 1.3529756669598358, + "grad_norm": 0.4995061504747152, + "learning_rate": 0.0002997447501317424, + "loss": 3.3653249740600586, + "step": 2308, + "token_acc": 0.26058986776084037 + }, + { + "epoch": 1.3535620052770447, + "grad_norm": 0.46836013653694625, + "learning_rate": 0.00029974390166279753, + "loss": 3.3367135524749756, + "step": 2309, + "token_acc": 0.26363310947625995 + }, + { + "epoch": 1.3541483435942538, + "grad_norm": 0.5519592326440945, + "learning_rate": 0.0002997430517872105, + "loss": 3.263765335083008, + "step": 2310, + "token_acc": 0.27324423567623685 + }, + { + "epoch": 1.354734681911463, + "grad_norm": 0.4686918427230092, + "learning_rate": 0.0002997422005049894, + "loss": 3.27644681930542, + "step": 2311, + "token_acc": 0.2715050032342289 + }, + { + "epoch": 1.355321020228672, + "grad_norm": 0.5265614040160932, + "learning_rate": 0.00029974134781614195, + "loss": 3.2583961486816406, + "step": 2312, + "token_acc": 0.27383285759188797 + }, + { + "epoch": 1.355907358545881, + "grad_norm": 0.5415736335051998, + "learning_rate": 0.0002997404937206764, + "loss": 3.265746593475342, + "step": 2313, + "token_acc": 0.26987788838870413 + }, + { + "epoch": 1.35649369686309, + "grad_norm": 0.5451073129843799, + "learning_rate": 0.0002997396382186006, + "loss": 3.313465118408203, + "step": 2314, + "token_acc": 0.2647275781957552 + }, + { + "epoch": 1.357080035180299, + "grad_norm": 0.5048112950713483, + "learning_rate": 0.00029973878130992276, + "loss": 3.254274368286133, + "step": 2315, + "token_acc": 0.2741221464856752 + }, + { + "epoch": 1.357666373497508, + "grad_norm": 0.5186716972014628, + "learning_rate": 0.0002997379229946509, + "loss": 3.2755894660949707, + "step": 2316, + "token_acc": 0.2700103412616339 + }, + { + "epoch": 1.3582527118147172, + "grad_norm": 0.6155813111810708, + "learning_rate": 0.00029973706327279294, + "loss": 3.264812469482422, + "step": 2317, + "token_acc": 0.2711816984424038 + }, + { + "epoch": 1.358839050131926, + "grad_norm": 0.7629225360859194, + "learning_rate": 0.000299736202144357, + "loss": 3.282783031463623, + "step": 2318, + "token_acc": 0.2679287160063328 + }, + { + "epoch": 1.3594253884491352, + "grad_norm": 0.8317424796066938, + "learning_rate": 0.0002997353396093513, + "loss": 3.2683820724487305, + "step": 2319, + "token_acc": 0.2729518172617446 + }, + { + "epoch": 1.360011726766344, + "grad_norm": 0.7492516148193727, + "learning_rate": 0.0002997344756677838, + "loss": 3.2349185943603516, + "step": 2320, + "token_acc": 0.2771028025181669 + }, + { + "epoch": 1.3605980650835532, + "grad_norm": 0.4687240034529518, + "learning_rate": 0.00029973361031966275, + "loss": 3.3158764839172363, + "step": 2321, + "token_acc": 0.26525094437277424 + }, + { + "epoch": 1.3611844034007623, + "grad_norm": 0.5595775221457051, + "learning_rate": 0.0002997327435649961, + "loss": 3.285459041595459, + "step": 2322, + "token_acc": 0.27008047433073845 + }, + { + "epoch": 1.3617707417179712, + "grad_norm": 0.714260067304, + "learning_rate": 0.0002997318754037922, + "loss": 3.298811912536621, + "step": 2323, + "token_acc": 0.2685461804958983 + }, + { + "epoch": 1.3623570800351803, + "grad_norm": 0.647391508108505, + "learning_rate": 0.000299731005836059, + "loss": 3.2870395183563232, + "step": 2324, + "token_acc": 0.27038044195686034 + }, + { + "epoch": 1.3629434183523892, + "grad_norm": 0.6276206671948867, + "learning_rate": 0.0002997301348618048, + "loss": 3.3104248046875, + "step": 2325, + "token_acc": 0.26754086657077764 + }, + { + "epoch": 1.3635297566695983, + "grad_norm": 0.5811453966151366, + "learning_rate": 0.00029972926248103776, + "loss": 3.317330837249756, + "step": 2326, + "token_acc": 0.26639975988564046 + }, + { + "epoch": 1.3641160949868074, + "grad_norm": 0.5399857548100404, + "learning_rate": 0.000299728388693766, + "loss": 3.2647323608398438, + "step": 2327, + "token_acc": 0.2717178676930631 + }, + { + "epoch": 1.3647024333040165, + "grad_norm": 0.5896669596479472, + "learning_rate": 0.0002997275134999979, + "loss": 3.323700428009033, + "step": 2328, + "token_acc": 0.2641683661758537 + }, + { + "epoch": 1.3652887716212254, + "grad_norm": 0.4968272930499289, + "learning_rate": 0.0002997266368997415, + "loss": 3.252244234085083, + "step": 2329, + "token_acc": 0.27375088689857996 + }, + { + "epoch": 1.3658751099384345, + "grad_norm": 0.5661909633476219, + "learning_rate": 0.0002997257588930051, + "loss": 3.303410768508911, + "step": 2330, + "token_acc": 0.26806372108880716 + }, + { + "epoch": 1.3664614482556434, + "grad_norm": 0.4889002229709549, + "learning_rate": 0.000299724879479797, + "loss": 3.2588319778442383, + "step": 2331, + "token_acc": 0.27261013297282455 + }, + { + "epoch": 1.3670477865728525, + "grad_norm": 0.523452646609507, + "learning_rate": 0.00029972399866012536, + "loss": 3.3019442558288574, + "step": 2332, + "token_acc": 0.26883919464917316 + }, + { + "epoch": 1.3676341248900616, + "grad_norm": 0.5991602508845554, + "learning_rate": 0.0002997231164339985, + "loss": 3.303267002105713, + "step": 2333, + "token_acc": 0.2693654809997365 + }, + { + "epoch": 1.3682204632072705, + "grad_norm": 0.47922301702977266, + "learning_rate": 0.00029972223280142477, + "loss": 3.276179790496826, + "step": 2334, + "token_acc": 0.270415965323388 + }, + { + "epoch": 1.3688068015244796, + "grad_norm": 0.5374123016409618, + "learning_rate": 0.0002997213477624124, + "loss": 3.2509799003601074, + "step": 2335, + "token_acc": 0.2733777940268025 + }, + { + "epoch": 1.3693931398416885, + "grad_norm": 0.5131823712888214, + "learning_rate": 0.0002997204613169697, + "loss": 3.2458348274230957, + "step": 2336, + "token_acc": 0.27555502597071996 + }, + { + "epoch": 1.3699794781588976, + "grad_norm": 0.4720311535025932, + "learning_rate": 0.00029971957346510504, + "loss": 3.2800302505493164, + "step": 2337, + "token_acc": 0.2712871592371743 + }, + { + "epoch": 1.3705658164761068, + "grad_norm": 0.5885560370358275, + "learning_rate": 0.0002997186842068267, + "loss": 3.286646842956543, + "step": 2338, + "token_acc": 0.26988527067000606 + }, + { + "epoch": 1.3711521547933159, + "grad_norm": 0.5253751759878109, + "learning_rate": 0.0002997177935421431, + "loss": 3.2615408897399902, + "step": 2339, + "token_acc": 0.2730822049669512 + }, + { + "epoch": 1.3717384931105248, + "grad_norm": 0.5917183244379426, + "learning_rate": 0.0002997169014710626, + "loss": 3.2781829833984375, + "step": 2340, + "token_acc": 0.2718929845048517 + }, + { + "epoch": 1.3723248314277339, + "grad_norm": 0.55712570027739, + "learning_rate": 0.0002997160079935936, + "loss": 3.3119707107543945, + "step": 2341, + "token_acc": 0.26857088250284233 + }, + { + "epoch": 1.3729111697449428, + "grad_norm": 0.5413481180585094, + "learning_rate": 0.0002997151131097443, + "loss": 3.3005857467651367, + "step": 2342, + "token_acc": 0.27016522415079375 + }, + { + "epoch": 1.3734975080621519, + "grad_norm": 0.6250074378623586, + "learning_rate": 0.00029971421681952335, + "loss": 3.269123077392578, + "step": 2343, + "token_acc": 0.2720781157235526 + }, + { + "epoch": 1.374083846379361, + "grad_norm": 0.5787286755194841, + "learning_rate": 0.0002997133191229391, + "loss": 3.2638661861419678, + "step": 2344, + "token_acc": 0.2729091496580322 + }, + { + "epoch": 1.3746701846965699, + "grad_norm": 0.5382273479345289, + "learning_rate": 0.0002997124200199999, + "loss": 3.3391809463500977, + "step": 2345, + "token_acc": 0.2620415033259662 + }, + { + "epoch": 1.375256523013779, + "grad_norm": 0.5154236967742366, + "learning_rate": 0.00029971151951071426, + "loss": 3.2429590225219727, + "step": 2346, + "token_acc": 0.2757056478511695 + }, + { + "epoch": 1.3758428613309879, + "grad_norm": 0.560470275544452, + "learning_rate": 0.0002997106175950907, + "loss": 3.2860841751098633, + "step": 2347, + "token_acc": 0.2716123313618209 + }, + { + "epoch": 1.376429199648197, + "grad_norm": 0.6224421636749669, + "learning_rate": 0.00029970971427313754, + "loss": 3.2779831886291504, + "step": 2348, + "token_acc": 0.26955697385638366 + }, + { + "epoch": 1.377015537965406, + "grad_norm": 0.5577765452220557, + "learning_rate": 0.00029970880954486337, + "loss": 3.2940945625305176, + "step": 2349, + "token_acc": 0.2670984949514193 + }, + { + "epoch": 1.377601876282615, + "grad_norm": 0.46331051423448555, + "learning_rate": 0.00029970790341027673, + "loss": 3.2180240154266357, + "step": 2350, + "token_acc": 0.27915366951166093 + }, + { + "epoch": 1.378188214599824, + "grad_norm": 0.47276994060129046, + "learning_rate": 0.000299706995869386, + "loss": 3.3227555751800537, + "step": 2351, + "token_acc": 0.2656742777847862 + }, + { + "epoch": 1.378774552917033, + "grad_norm": 0.42270136977989675, + "learning_rate": 0.0002997060869221998, + "loss": 3.291868209838867, + "step": 2352, + "token_acc": 0.26848096194483795 + }, + { + "epoch": 1.379360891234242, + "grad_norm": 0.5199632673138261, + "learning_rate": 0.0002997051765687266, + "loss": 3.3101983070373535, + "step": 2353, + "token_acc": 0.26690153528760036 + }, + { + "epoch": 1.3799472295514512, + "grad_norm": 0.5586597279144494, + "learning_rate": 0.00029970426480897507, + "loss": 3.2956953048706055, + "step": 2354, + "token_acc": 0.2680487070170344 + }, + { + "epoch": 1.3805335678686603, + "grad_norm": 0.5608563153143209, + "learning_rate": 0.00029970335164295365, + "loss": 3.2905173301696777, + "step": 2355, + "token_acc": 0.27045489587334937 + }, + { + "epoch": 1.3811199061858692, + "grad_norm": 0.5536133785227201, + "learning_rate": 0.000299702437070671, + "loss": 3.300647735595703, + "step": 2356, + "token_acc": 0.26856061683049826 + }, + { + "epoch": 1.3817062445030783, + "grad_norm": 0.6649924455336458, + "learning_rate": 0.0002997015210921357, + "loss": 3.2790932655334473, + "step": 2357, + "token_acc": 0.27087235785508185 + }, + { + "epoch": 1.3822925828202872, + "grad_norm": 0.5964107461280297, + "learning_rate": 0.0002997006037073563, + "loss": 3.231511116027832, + "step": 2358, + "token_acc": 0.2764793733863132 + }, + { + "epoch": 1.3828789211374963, + "grad_norm": 0.6224699335368878, + "learning_rate": 0.0002996996849163414, + "loss": 3.2941975593566895, + "step": 2359, + "token_acc": 0.26802402326596514 + }, + { + "epoch": 1.3834652594547054, + "grad_norm": 0.5651333259570841, + "learning_rate": 0.0002996987647190998, + "loss": 3.2717056274414062, + "step": 2360, + "token_acc": 0.26955286794936356 + }, + { + "epoch": 1.3840515977719143, + "grad_norm": 0.5860490237091708, + "learning_rate": 0.00029969784311563994, + "loss": 3.322009563446045, + "step": 2361, + "token_acc": 0.2652774888238249 + }, + { + "epoch": 1.3846379360891234, + "grad_norm": 0.5573661304462181, + "learning_rate": 0.0002996969201059706, + "loss": 3.2978432178497314, + "step": 2362, + "token_acc": 0.267980186658104 + }, + { + "epoch": 1.3852242744063323, + "grad_norm": 0.5667250924505168, + "learning_rate": 0.0002996959956901004, + "loss": 3.2606427669525146, + "step": 2363, + "token_acc": 0.27151666839368604 + }, + { + "epoch": 1.3858106127235414, + "grad_norm": 0.44134976736199516, + "learning_rate": 0.00029969506986803805, + "loss": 3.294374465942383, + "step": 2364, + "token_acc": 0.2676413468497316 + }, + { + "epoch": 1.3863969510407506, + "grad_norm": 0.5073717246582231, + "learning_rate": 0.00029969414263979226, + "loss": 3.272484302520752, + "step": 2365, + "token_acc": 0.27024941205917474 + }, + { + "epoch": 1.3869832893579597, + "grad_norm": 0.4450021797425381, + "learning_rate": 0.0002996932140053717, + "loss": 3.306857109069824, + "step": 2366, + "token_acc": 0.2669401195573612 + }, + { + "epoch": 1.3875696276751686, + "grad_norm": 0.429587450942379, + "learning_rate": 0.00029969228396478507, + "loss": 3.266036033630371, + "step": 2367, + "token_acc": 0.27178082770047435 + }, + { + "epoch": 1.3881559659923777, + "grad_norm": 0.5459278389143569, + "learning_rate": 0.00029969135251804117, + "loss": 3.3118112087249756, + "step": 2368, + "token_acc": 0.2668911205989404 + }, + { + "epoch": 1.3887423043095866, + "grad_norm": 0.5746779701717927, + "learning_rate": 0.00029969041966514874, + "loss": 3.2996721267700195, + "step": 2369, + "token_acc": 0.2670568911294909 + }, + { + "epoch": 1.3893286426267957, + "grad_norm": 0.5943429117838637, + "learning_rate": 0.0002996894854061165, + "loss": 3.2907819747924805, + "step": 2370, + "token_acc": 0.2694884014850312 + }, + { + "epoch": 1.3899149809440048, + "grad_norm": 0.6759504312438539, + "learning_rate": 0.0002996885497409533, + "loss": 3.2052154541015625, + "step": 2371, + "token_acc": 0.2799557856083761 + }, + { + "epoch": 1.3905013192612137, + "grad_norm": 0.6419167773567508, + "learning_rate": 0.0002996876126696678, + "loss": 3.2945470809936523, + "step": 2372, + "token_acc": 0.26735053694502325 + }, + { + "epoch": 1.3910876575784228, + "grad_norm": 0.6068298106982998, + "learning_rate": 0.000299686674192269, + "loss": 3.2378623485565186, + "step": 2373, + "token_acc": 0.27459639395454055 + }, + { + "epoch": 1.3916739958956317, + "grad_norm": 0.48792939042572236, + "learning_rate": 0.0002996857343087655, + "loss": 3.264981746673584, + "step": 2374, + "token_acc": 0.26990316896938676 + }, + { + "epoch": 1.3922603342128408, + "grad_norm": 0.5495416257441966, + "learning_rate": 0.00029968479301916627, + "loss": 3.294529914855957, + "step": 2375, + "token_acc": 0.26965454461963173 + }, + { + "epoch": 1.39284667253005, + "grad_norm": 0.6417954643977561, + "learning_rate": 0.0002996838503234801, + "loss": 3.2646875381469727, + "step": 2376, + "token_acc": 0.2729160587875012 + }, + { + "epoch": 1.3934330108472588, + "grad_norm": 0.586601301747982, + "learning_rate": 0.0002996829062217159, + "loss": 3.3135623931884766, + "step": 2377, + "token_acc": 0.2653031278156777 + }, + { + "epoch": 1.394019349164468, + "grad_norm": 0.6027622068121812, + "learning_rate": 0.00029968196071388246, + "loss": 3.2888572216033936, + "step": 2378, + "token_acc": 0.2704266764329258 + }, + { + "epoch": 1.3946056874816768, + "grad_norm": 0.5032874618310406, + "learning_rate": 0.0002996810137999887, + "loss": 3.2596287727355957, + "step": 2379, + "token_acc": 0.2735786967766483 + }, + { + "epoch": 1.395192025798886, + "grad_norm": 0.4906476191151634, + "learning_rate": 0.0002996800654800435, + "loss": 3.323955535888672, + "step": 2380, + "token_acc": 0.2652195641769417 + }, + { + "epoch": 1.395778364116095, + "grad_norm": 0.5560025432747817, + "learning_rate": 0.0002996791157540558, + "loss": 3.3399150371551514, + "step": 2381, + "token_acc": 0.26121292736913354 + }, + { + "epoch": 1.3963647024333041, + "grad_norm": 0.49341289746083594, + "learning_rate": 0.0002996781646220345, + "loss": 3.26633882522583, + "step": 2382, + "token_acc": 0.2722567751767122 + }, + { + "epoch": 1.396951040750513, + "grad_norm": 0.5448553545367426, + "learning_rate": 0.00029967721208398854, + "loss": 3.292320728302002, + "step": 2383, + "token_acc": 0.2673366496890594 + }, + { + "epoch": 1.3975373790677221, + "grad_norm": 0.5225034765496931, + "learning_rate": 0.00029967625813992683, + "loss": 3.3125152587890625, + "step": 2384, + "token_acc": 0.26563303134382255 + }, + { + "epoch": 1.398123717384931, + "grad_norm": 0.5500913852697767, + "learning_rate": 0.0002996753027898584, + "loss": 3.2874464988708496, + "step": 2385, + "token_acc": 0.27049894403379093 + }, + { + "epoch": 1.3987100557021401, + "grad_norm": 0.4645968084795441, + "learning_rate": 0.0002996743460337922, + "loss": 3.300896644592285, + "step": 2386, + "token_acc": 0.2677324724901682 + }, + { + "epoch": 1.3992963940193492, + "grad_norm": 0.49463476764608016, + "learning_rate": 0.0002996733878717372, + "loss": 3.307324171066284, + "step": 2387, + "token_acc": 0.26644562201364336 + }, + { + "epoch": 1.3998827323365581, + "grad_norm": 0.5539459308849792, + "learning_rate": 0.0002996724283037024, + "loss": 3.290579319000244, + "step": 2388, + "token_acc": 0.27121335645449546 + }, + { + "epoch": 1.4004690706537672, + "grad_norm": 0.548340525269374, + "learning_rate": 0.0002996714673296968, + "loss": 3.2658143043518066, + "step": 2389, + "token_acc": 0.2707590904790438 + }, + { + "epoch": 1.4010554089709761, + "grad_norm": 0.5190091175210023, + "learning_rate": 0.0002996705049497295, + "loss": 3.2678635120391846, + "step": 2390, + "token_acc": 0.2721067853626518 + }, + { + "epoch": 1.4016417472881852, + "grad_norm": 0.6100558408639117, + "learning_rate": 0.0002996695411638095, + "loss": 3.272930145263672, + "step": 2391, + "token_acc": 0.2702527083410063 + }, + { + "epoch": 1.4022280856053944, + "grad_norm": 0.6171545359403329, + "learning_rate": 0.00029966857597194576, + "loss": 3.299881935119629, + "step": 2392, + "token_acc": 0.2691575993616335 + }, + { + "epoch": 1.4028144239226035, + "grad_norm": 0.5992004860102884, + "learning_rate": 0.0002996676093741475, + "loss": 3.26491117477417, + "step": 2393, + "token_acc": 0.271936449112317 + }, + { + "epoch": 1.4034007622398124, + "grad_norm": 0.6052576275531372, + "learning_rate": 0.0002996666413704237, + "loss": 3.2830424308776855, + "step": 2394, + "token_acc": 0.27201498142862324 + }, + { + "epoch": 1.4039871005570215, + "grad_norm": 0.5422850506693514, + "learning_rate": 0.00029966567196078347, + "loss": 3.2469875812530518, + "step": 2395, + "token_acc": 0.27374289101304 + }, + { + "epoch": 1.4045734388742304, + "grad_norm": 0.46615885839772603, + "learning_rate": 0.000299664701145236, + "loss": 3.272097110748291, + "step": 2396, + "token_acc": 0.2695859074306419 + }, + { + "epoch": 1.4051597771914395, + "grad_norm": 0.5547669742528543, + "learning_rate": 0.00029966372892379023, + "loss": 3.2573513984680176, + "step": 2397, + "token_acc": 0.2730984196410369 + }, + { + "epoch": 1.4057461155086486, + "grad_norm": 0.5888378255725208, + "learning_rate": 0.00029966275529645544, + "loss": 3.2564949989318848, + "step": 2398, + "token_acc": 0.2733417284964659 + }, + { + "epoch": 1.4063324538258575, + "grad_norm": 0.6999742138984829, + "learning_rate": 0.0002996617802632408, + "loss": 3.3245022296905518, + "step": 2399, + "token_acc": 0.26458784607706437 + }, + { + "epoch": 1.4069187921430666, + "grad_norm": 0.6187743950601527, + "learning_rate": 0.00029966080382415534, + "loss": 3.289876937866211, + "step": 2400, + "token_acc": 0.2689946765163462 + }, + { + "epoch": 1.4075051304602755, + "grad_norm": 0.5574328057106079, + "learning_rate": 0.00029965982597920834, + "loss": 3.3295211791992188, + "step": 2401, + "token_acc": 0.26311146040639377 + }, + { + "epoch": 1.4080914687774846, + "grad_norm": 0.5444022659333221, + "learning_rate": 0.0002996588467284089, + "loss": 3.319186210632324, + "step": 2402, + "token_acc": 0.26461354050163755 + }, + { + "epoch": 1.4086778070946937, + "grad_norm": 0.5520874374873401, + "learning_rate": 0.00029965786607176627, + "loss": 3.270364761352539, + "step": 2403, + "token_acc": 0.27180070329508715 + }, + { + "epoch": 1.4092641454119026, + "grad_norm": 0.6257688853954135, + "learning_rate": 0.0002996568840092897, + "loss": 3.250545024871826, + "step": 2404, + "token_acc": 0.27658181478075167 + }, + { + "epoch": 1.4098504837291117, + "grad_norm": 0.6514541129360853, + "learning_rate": 0.00029965590054098837, + "loss": 3.2695682048797607, + "step": 2405, + "token_acc": 0.2693352791587935 + }, + { + "epoch": 1.4104368220463206, + "grad_norm": 0.5837524493790546, + "learning_rate": 0.0002996549156668715, + "loss": 3.2856874465942383, + "step": 2406, + "token_acc": 0.269773548605153 + }, + { + "epoch": 1.4110231603635297, + "grad_norm": 0.637914880472307, + "learning_rate": 0.0002996539293869483, + "loss": 3.3023757934570312, + "step": 2407, + "token_acc": 0.2672429490814686 + }, + { + "epoch": 1.4116094986807388, + "grad_norm": 0.6191978222907231, + "learning_rate": 0.00029965294170122814, + "loss": 3.2671241760253906, + "step": 2408, + "token_acc": 0.27227085209306223 + }, + { + "epoch": 1.412195836997948, + "grad_norm": 0.6222856911579304, + "learning_rate": 0.0002996519526097203, + "loss": 3.317592144012451, + "step": 2409, + "token_acc": 0.2658801899016178 + }, + { + "epoch": 1.4127821753151568, + "grad_norm": 0.6804586920788489, + "learning_rate": 0.00029965096211243393, + "loss": 3.284654140472412, + "step": 2410, + "token_acc": 0.2704206942915532 + }, + { + "epoch": 1.413368513632366, + "grad_norm": 0.6742883424313039, + "learning_rate": 0.0002996499702093785, + "loss": 3.230648994445801, + "step": 2411, + "token_acc": 0.275978607073568 + }, + { + "epoch": 1.4139548519495748, + "grad_norm": 0.5872339128456647, + "learning_rate": 0.0002996489769005632, + "loss": 3.290733575820923, + "step": 2412, + "token_acc": 0.26818145768895235 + }, + { + "epoch": 1.414541190266784, + "grad_norm": 0.48972058786107486, + "learning_rate": 0.0002996479821859975, + "loss": 3.28164005279541, + "step": 2413, + "token_acc": 0.2707975110690797 + }, + { + "epoch": 1.415127528583993, + "grad_norm": 0.5794363584208544, + "learning_rate": 0.0002996469860656906, + "loss": 3.244933605194092, + "step": 2414, + "token_acc": 0.27554281094849875 + }, + { + "epoch": 1.415713866901202, + "grad_norm": 0.6710234464995446, + "learning_rate": 0.0002996459885396519, + "loss": 3.302269220352173, + "step": 2415, + "token_acc": 0.26643798839579413 + }, + { + "epoch": 1.416300205218411, + "grad_norm": 0.5854137865708379, + "learning_rate": 0.00029964498960789087, + "loss": 3.2613236904144287, + "step": 2416, + "token_acc": 0.2721644409823628 + }, + { + "epoch": 1.41688654353562, + "grad_norm": 0.5571072159267442, + "learning_rate": 0.00029964398927041677, + "loss": 3.266636848449707, + "step": 2417, + "token_acc": 0.27184649630578067 + }, + { + "epoch": 1.417472881852829, + "grad_norm": 0.5983563131923706, + "learning_rate": 0.000299642987527239, + "loss": 3.273965835571289, + "step": 2418, + "token_acc": 0.2718297727084839 + }, + { + "epoch": 1.4180592201700382, + "grad_norm": 0.5485429302425923, + "learning_rate": 0.000299641984378367, + "loss": 3.3078503608703613, + "step": 2419, + "token_acc": 0.26543168160362646 + }, + { + "epoch": 1.4186455584872473, + "grad_norm": 0.43728030135305784, + "learning_rate": 0.00029964097982381025, + "loss": 3.2973577976226807, + "step": 2420, + "token_acc": 0.2682629762202354 + }, + { + "epoch": 1.4192318968044562, + "grad_norm": 0.4855300240859427, + "learning_rate": 0.00029963997386357814, + "loss": 3.2981672286987305, + "step": 2421, + "token_acc": 0.26688090869201003 + }, + { + "epoch": 1.4198182351216653, + "grad_norm": 0.5246237559632434, + "learning_rate": 0.0002996389664976801, + "loss": 3.255279541015625, + "step": 2422, + "token_acc": 0.2724458204334365 + }, + { + "epoch": 1.4204045734388742, + "grad_norm": 0.488053208292944, + "learning_rate": 0.0002996379577261256, + "loss": 3.2573599815368652, + "step": 2423, + "token_acc": 0.272176678993509 + }, + { + "epoch": 1.4209909117560833, + "grad_norm": 0.4910361792162045, + "learning_rate": 0.0002996369475489242, + "loss": 3.227957248687744, + "step": 2424, + "token_acc": 0.2761903787417567 + }, + { + "epoch": 1.4215772500732924, + "grad_norm": 0.5108161096315005, + "learning_rate": 0.0002996359359660852, + "loss": 3.2387242317199707, + "step": 2425, + "token_acc": 0.2769740314578275 + }, + { + "epoch": 1.4221635883905013, + "grad_norm": 0.531490502074727, + "learning_rate": 0.0002996349229776183, + "loss": 3.2796196937561035, + "step": 2426, + "token_acc": 0.2711439280782347 + }, + { + "epoch": 1.4227499267077104, + "grad_norm": 0.5284219595620157, + "learning_rate": 0.0002996339085835329, + "loss": 3.295367956161499, + "step": 2427, + "token_acc": 0.2688898236226973 + }, + { + "epoch": 1.4233362650249193, + "grad_norm": 0.5785852547202086, + "learning_rate": 0.0002996328927838386, + "loss": 3.335418701171875, + "step": 2428, + "token_acc": 0.2637451063486298 + }, + { + "epoch": 1.4239226033421284, + "grad_norm": 0.6141501793925301, + "learning_rate": 0.00029963187557854485, + "loss": 3.254877805709839, + "step": 2429, + "token_acc": 0.2738140096932212 + }, + { + "epoch": 1.4245089416593375, + "grad_norm": 0.5626640791176556, + "learning_rate": 0.00029963085696766133, + "loss": 3.276174545288086, + "step": 2430, + "token_acc": 0.26854819357419013 + }, + { + "epoch": 1.4250952799765464, + "grad_norm": 0.47233076945503105, + "learning_rate": 0.00029962983695119746, + "loss": 3.2461767196655273, + "step": 2431, + "token_acc": 0.27423772091689236 + }, + { + "epoch": 1.4256816182937555, + "grad_norm": 0.5179452046766786, + "learning_rate": 0.00029962881552916294, + "loss": 3.239842414855957, + "step": 2432, + "token_acc": 0.2751083173520847 + }, + { + "epoch": 1.4262679566109644, + "grad_norm": 0.5498096003757862, + "learning_rate": 0.0002996277927015673, + "loss": 3.311431646347046, + "step": 2433, + "token_acc": 0.26475392461252634 + }, + { + "epoch": 1.4268542949281735, + "grad_norm": 0.5162105974718673, + "learning_rate": 0.00029962676846842024, + "loss": 3.2687466144561768, + "step": 2434, + "token_acc": 0.27074020709256474 + }, + { + "epoch": 1.4274406332453826, + "grad_norm": 0.5041546692426218, + "learning_rate": 0.00029962574282973124, + "loss": 3.2616050243377686, + "step": 2435, + "token_acc": 0.2734515198591612 + }, + { + "epoch": 1.4280269715625917, + "grad_norm": 0.5773355803832131, + "learning_rate": 0.0002996247157855101, + "loss": 3.2735462188720703, + "step": 2436, + "token_acc": 0.2715997471842514 + }, + { + "epoch": 1.4286133098798006, + "grad_norm": 0.5803850793501364, + "learning_rate": 0.00029962368733576627, + "loss": 3.261991262435913, + "step": 2437, + "token_acc": 0.2722853880144978 + }, + { + "epoch": 1.4291996481970097, + "grad_norm": 0.5861804962959183, + "learning_rate": 0.0002996226574805096, + "loss": 3.2808706760406494, + "step": 2438, + "token_acc": 0.26932716153167163 + }, + { + "epoch": 1.4297859865142186, + "grad_norm": 0.5955044303907834, + "learning_rate": 0.00029962162621974964, + "loss": 3.2741129398345947, + "step": 2439, + "token_acc": 0.2702591623036649 + }, + { + "epoch": 1.4303723248314277, + "grad_norm": 0.5591415897897387, + "learning_rate": 0.00029962059355349613, + "loss": 3.2834272384643555, + "step": 2440, + "token_acc": 0.2697684864129831 + }, + { + "epoch": 1.4309586631486368, + "grad_norm": 0.5410184203513996, + "learning_rate": 0.0002996195594817587, + "loss": 3.269676923751831, + "step": 2441, + "token_acc": 0.26933012109167015 + }, + { + "epoch": 1.4315450014658457, + "grad_norm": 0.524093599477998, + "learning_rate": 0.00029961852400454725, + "loss": 3.237142324447632, + "step": 2442, + "token_acc": 0.2752851239349041 + }, + { + "epoch": 1.4321313397830548, + "grad_norm": 0.5862095138113799, + "learning_rate": 0.0002996174871218713, + "loss": 3.263597011566162, + "step": 2443, + "token_acc": 0.27275309835073225 + }, + { + "epoch": 1.4327176781002637, + "grad_norm": 0.5760703519676676, + "learning_rate": 0.0002996164488337407, + "loss": 3.2759804725646973, + "step": 2444, + "token_acc": 0.26915015393300123 + }, + { + "epoch": 1.4333040164174728, + "grad_norm": 0.5577065017705736, + "learning_rate": 0.00029961540914016514, + "loss": 3.264591932296753, + "step": 2445, + "token_acc": 0.27057818435130054 + }, + { + "epoch": 1.433890354734682, + "grad_norm": 0.5956832881235639, + "learning_rate": 0.00029961436804115443, + "loss": 3.2746620178222656, + "step": 2446, + "token_acc": 0.2715555018284022 + }, + { + "epoch": 1.434476693051891, + "grad_norm": 0.5882495532963262, + "learning_rate": 0.00029961332553671836, + "loss": 3.293856620788574, + "step": 2447, + "token_acc": 0.2683404549091803 + }, + { + "epoch": 1.4350630313691, + "grad_norm": 0.6746391294561705, + "learning_rate": 0.0002996122816268667, + "loss": 3.255826950073242, + "step": 2448, + "token_acc": 0.2724063393034722 + }, + { + "epoch": 1.435649369686309, + "grad_norm": 0.5670825996457249, + "learning_rate": 0.00029961123631160925, + "loss": 3.2983407974243164, + "step": 2449, + "token_acc": 0.26728366079646887 + }, + { + "epoch": 1.436235708003518, + "grad_norm": 0.48334039553728575, + "learning_rate": 0.0002996101895909558, + "loss": 3.231581926345825, + "step": 2450, + "token_acc": 0.278606310691341 + }, + { + "epoch": 1.436822046320727, + "grad_norm": 0.5454619521333512, + "learning_rate": 0.0002996091414649163, + "loss": 3.273036003112793, + "step": 2451, + "token_acc": 0.2712632334469765 + }, + { + "epoch": 1.4374083846379362, + "grad_norm": 0.49612619988177964, + "learning_rate": 0.00029960809193350045, + "loss": 3.209500312805176, + "step": 2452, + "token_acc": 0.27882888695334257 + }, + { + "epoch": 1.437994722955145, + "grad_norm": 0.4780020641798862, + "learning_rate": 0.0002996070409967182, + "loss": 3.240847110748291, + "step": 2453, + "token_acc": 0.2755924029691783 + }, + { + "epoch": 1.4385810612723542, + "grad_norm": 0.5562260160280741, + "learning_rate": 0.00029960598865457936, + "loss": 3.300473213195801, + "step": 2454, + "token_acc": 0.2660521690753455 + }, + { + "epoch": 1.439167399589563, + "grad_norm": 0.41455540352615194, + "learning_rate": 0.00029960493490709393, + "loss": 3.225064754486084, + "step": 2455, + "token_acc": 0.27859735068088615 + }, + { + "epoch": 1.4397537379067722, + "grad_norm": 0.49060360934727537, + "learning_rate": 0.0002996038797542717, + "loss": 3.2779016494750977, + "step": 2456, + "token_acc": 0.2693551582520789 + }, + { + "epoch": 1.4403400762239813, + "grad_norm": 0.48134117070452553, + "learning_rate": 0.0002996028231961226, + "loss": 3.2480998039245605, + "step": 2457, + "token_acc": 0.2726294566841121 + }, + { + "epoch": 1.4409264145411902, + "grad_norm": 0.4846791705278586, + "learning_rate": 0.00029960176523265657, + "loss": 3.3117456436157227, + "step": 2458, + "token_acc": 0.2671325050363942 + }, + { + "epoch": 1.4415127528583993, + "grad_norm": 0.4864709920787233, + "learning_rate": 0.0002996007058638836, + "loss": 3.265331745147705, + "step": 2459, + "token_acc": 0.2698780669107512 + }, + { + "epoch": 1.4420990911756082, + "grad_norm": 0.5242517564470773, + "learning_rate": 0.0002995996450898135, + "loss": 3.267516613006592, + "step": 2460, + "token_acc": 0.2705900488416999 + }, + { + "epoch": 1.4426854294928173, + "grad_norm": 0.5732991431130952, + "learning_rate": 0.0002995985829104564, + "loss": 3.2622485160827637, + "step": 2461, + "token_acc": 0.27178177676418025 + }, + { + "epoch": 1.4432717678100264, + "grad_norm": 0.6434024227946015, + "learning_rate": 0.0002995975193258221, + "loss": 3.246582508087158, + "step": 2462, + "token_acc": 0.2724080968976392 + }, + { + "epoch": 1.4438581061272355, + "grad_norm": 0.6597641503705125, + "learning_rate": 0.0002995964543359208, + "loss": 3.2463173866271973, + "step": 2463, + "token_acc": 0.2752161421231569 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.5226985575894212, + "learning_rate": 0.00029959538794076235, + "loss": 3.26845645904541, + "step": 2464, + "token_acc": 0.2713407559059991 + }, + { + "epoch": 1.4450307827616535, + "grad_norm": 0.556848476299921, + "learning_rate": 0.0002995943201403568, + "loss": 3.30072021484375, + "step": 2465, + "token_acc": 0.26717867732532596 + }, + { + "epoch": 1.4456171210788624, + "grad_norm": 0.6705222556786472, + "learning_rate": 0.00029959325093471416, + "loss": 3.3088722229003906, + "step": 2466, + "token_acc": 0.26592082616179 + }, + { + "epoch": 1.4462034593960715, + "grad_norm": 0.6075013009060733, + "learning_rate": 0.00029959218032384456, + "loss": 3.2549691200256348, + "step": 2467, + "token_acc": 0.2737410320176459 + }, + { + "epoch": 1.4467897977132806, + "grad_norm": 0.5750620115404984, + "learning_rate": 0.00029959110830775804, + "loss": 3.267874002456665, + "step": 2468, + "token_acc": 0.27124176950102064 + }, + { + "epoch": 1.4473761360304895, + "grad_norm": 0.52606530692236, + "learning_rate": 0.0002995900348864646, + "loss": 3.3183746337890625, + "step": 2469, + "token_acc": 0.2662360618263493 + }, + { + "epoch": 1.4479624743476986, + "grad_norm": 0.6000248817758309, + "learning_rate": 0.00029958896005997433, + "loss": 3.267632484436035, + "step": 2470, + "token_acc": 0.2703505277730602 + }, + { + "epoch": 1.4485488126649075, + "grad_norm": 0.5913805316104338, + "learning_rate": 0.00029958788382829736, + "loss": 3.2394769191741943, + "step": 2471, + "token_acc": 0.27615980468536777 + }, + { + "epoch": 1.4491351509821166, + "grad_norm": 0.5939759552617605, + "learning_rate": 0.0002995868061914438, + "loss": 3.3121910095214844, + "step": 2472, + "token_acc": 0.266173446718813 + }, + { + "epoch": 1.4497214892993258, + "grad_norm": 0.5798415072155725, + "learning_rate": 0.0002995857271494238, + "loss": 3.2216055393218994, + "step": 2473, + "token_acc": 0.2757765561722712 + }, + { + "epoch": 1.4503078276165349, + "grad_norm": 0.5187684984997788, + "learning_rate": 0.0002995846467022474, + "loss": 3.2719712257385254, + "step": 2474, + "token_acc": 0.2699009506975831 + }, + { + "epoch": 1.4508941659337438, + "grad_norm": 0.5129690742529915, + "learning_rate": 0.0002995835648499249, + "loss": 3.314084053039551, + "step": 2475, + "token_acc": 0.26405700872426946 + }, + { + "epoch": 1.4514805042509529, + "grad_norm": 0.5269854602482452, + "learning_rate": 0.00029958248159246627, + "loss": 3.3296351432800293, + "step": 2476, + "token_acc": 0.26090290648858433 + }, + { + "epoch": 1.4520668425681618, + "grad_norm": 0.3886974435375772, + "learning_rate": 0.00029958139692988186, + "loss": 3.2300639152526855, + "step": 2477, + "token_acc": 0.27662006091775054 + }, + { + "epoch": 1.4526531808853709, + "grad_norm": 0.43471057332652185, + "learning_rate": 0.00029958031086218173, + "loss": 3.313685894012451, + "step": 2478, + "token_acc": 0.26468260757977585 + }, + { + "epoch": 1.45323951920258, + "grad_norm": 0.46782072655399637, + "learning_rate": 0.00029957922338937624, + "loss": 3.2337112426757812, + "step": 2479, + "token_acc": 0.2764421660479642 + }, + { + "epoch": 1.4538258575197889, + "grad_norm": 0.47160646249434063, + "learning_rate": 0.0002995781345114754, + "loss": 3.2678468227386475, + "step": 2480, + "token_acc": 0.2707927508459634 + }, + { + "epoch": 1.454412195836998, + "grad_norm": 0.5304232935220056, + "learning_rate": 0.0002995770442284896, + "loss": 3.2588658332824707, + "step": 2481, + "token_acc": 0.27340846374519817 + }, + { + "epoch": 1.4549985341542069, + "grad_norm": 0.4583550574197023, + "learning_rate": 0.0002995759525404291, + "loss": 3.225100040435791, + "step": 2482, + "token_acc": 0.2775747265453676 + }, + { + "epoch": 1.455584872471416, + "grad_norm": 0.4530442909995885, + "learning_rate": 0.00029957485944730395, + "loss": 3.2562572956085205, + "step": 2483, + "token_acc": 0.2740214823612326 + }, + { + "epoch": 1.456171210788625, + "grad_norm": 0.5157824579350037, + "learning_rate": 0.00029957376494912463, + "loss": 3.2559168338775635, + "step": 2484, + "token_acc": 0.2738537465506262 + }, + { + "epoch": 1.456757549105834, + "grad_norm": 0.48449141848147914, + "learning_rate": 0.00029957266904590127, + "loss": 3.2081894874572754, + "step": 2485, + "token_acc": 0.2790809295988357 + }, + { + "epoch": 1.457343887423043, + "grad_norm": 0.4741869173861958, + "learning_rate": 0.0002995715717376443, + "loss": 3.2100329399108887, + "step": 2486, + "token_acc": 0.27951631442792496 + }, + { + "epoch": 1.457930225740252, + "grad_norm": 0.5442997792728896, + "learning_rate": 0.0002995704730243639, + "loss": 3.2460222244262695, + "step": 2487, + "token_acc": 0.27498667292422824 + }, + { + "epoch": 1.458516564057461, + "grad_norm": 0.4962109401276086, + "learning_rate": 0.0002995693729060705, + "loss": 3.248276710510254, + "step": 2488, + "token_acc": 0.2753291479053252 + }, + { + "epoch": 1.4591029023746702, + "grad_norm": 0.5220143437471754, + "learning_rate": 0.00029956827138277444, + "loss": 3.2501816749572754, + "step": 2489, + "token_acc": 0.2723430666340558 + }, + { + "epoch": 1.4596892406918793, + "grad_norm": 0.5453953855106521, + "learning_rate": 0.00029956716845448597, + "loss": 3.215519905090332, + "step": 2490, + "token_acc": 0.2766432557093625 + }, + { + "epoch": 1.4602755790090882, + "grad_norm": 0.5130281087212133, + "learning_rate": 0.00029956606412121547, + "loss": 3.227383613586426, + "step": 2491, + "token_acc": 0.2762241683403509 + }, + { + "epoch": 1.4608619173262973, + "grad_norm": 0.5057125488916887, + "learning_rate": 0.00029956495838297334, + "loss": 3.2652158737182617, + "step": 2492, + "token_acc": 0.27056826503336723 + }, + { + "epoch": 1.4614482556435062, + "grad_norm": 0.5762047330434874, + "learning_rate": 0.00029956385123977, + "loss": 3.2863574028015137, + "step": 2493, + "token_acc": 0.2694660741562191 + }, + { + "epoch": 1.4620345939607153, + "grad_norm": 0.49294866725919084, + "learning_rate": 0.00029956274269161585, + "loss": 3.2374112606048584, + "step": 2494, + "token_acc": 0.2745273269853321 + }, + { + "epoch": 1.4626209322779244, + "grad_norm": 0.44041006703346686, + "learning_rate": 0.0002995616327385212, + "loss": 3.2690324783325195, + "step": 2495, + "token_acc": 0.27038789471204927 + }, + { + "epoch": 1.4632072705951333, + "grad_norm": 0.44474094645923584, + "learning_rate": 0.00029956052138049654, + "loss": 3.2864012718200684, + "step": 2496, + "token_acc": 0.26769168636364815 + }, + { + "epoch": 1.4637936089123424, + "grad_norm": 0.4583071001481383, + "learning_rate": 0.00029955940861755236, + "loss": 3.2882251739501953, + "step": 2497, + "token_acc": 0.26651701048883386 + }, + { + "epoch": 1.4643799472295513, + "grad_norm": 0.5544376954959978, + "learning_rate": 0.0002995582944496991, + "loss": 3.298161745071411, + "step": 2498, + "token_acc": 0.267579183259073 + }, + { + "epoch": 1.4649662855467604, + "grad_norm": 0.6396720187667903, + "learning_rate": 0.0002995571788769471, + "loss": 3.286190986633301, + "step": 2499, + "token_acc": 0.2688598415982279 + }, + { + "epoch": 1.4655526238639696, + "grad_norm": 0.7066212189495559, + "learning_rate": 0.000299556061899307, + "loss": 3.3011059761047363, + "step": 2500, + "token_acc": 0.2670835573413316 + }, + { + "epoch": 1.4661389621811787, + "grad_norm": 0.6975891701218879, + "learning_rate": 0.0002995549435167893, + "loss": 3.2530603408813477, + "step": 2501, + "token_acc": 0.2746035058430718 + }, + { + "epoch": 1.4667253004983876, + "grad_norm": 0.5440023876639446, + "learning_rate": 0.0002995538237294043, + "loss": 3.218017578125, + "step": 2502, + "token_acc": 0.2800704732415317 + }, + { + "epoch": 1.4673116388155967, + "grad_norm": 0.42733706521677034, + "learning_rate": 0.0002995527025371627, + "loss": 3.296980619430542, + "step": 2503, + "token_acc": 0.2661848333118229 + }, + { + "epoch": 1.4678979771328056, + "grad_norm": 0.5775984262825813, + "learning_rate": 0.00029955157994007497, + "loss": 3.2467880249023438, + "step": 2504, + "token_acc": 0.2720436499181045 + }, + { + "epoch": 1.4684843154500147, + "grad_norm": 0.607685630956546, + "learning_rate": 0.0002995504559381517, + "loss": 3.257120370864868, + "step": 2505, + "token_acc": 0.27233091228061096 + }, + { + "epoch": 1.4690706537672238, + "grad_norm": 0.5537875300196272, + "learning_rate": 0.00029954933053140344, + "loss": 3.267056465148926, + "step": 2506, + "token_acc": 0.27111471013819377 + }, + { + "epoch": 1.4696569920844327, + "grad_norm": 0.6099697965632727, + "learning_rate": 0.00029954820371984065, + "loss": 3.2631402015686035, + "step": 2507, + "token_acc": 0.27125433870658633 + }, + { + "epoch": 1.4702433304016418, + "grad_norm": 0.5226995278710362, + "learning_rate": 0.00029954707550347413, + "loss": 3.2801647186279297, + "step": 2508, + "token_acc": 0.2701979993757968 + }, + { + "epoch": 1.4708296687188507, + "grad_norm": 0.5569274326717343, + "learning_rate": 0.0002995459458823143, + "loss": 3.2543892860412598, + "step": 2509, + "token_acc": 0.27370963672742443 + }, + { + "epoch": 1.4714160070360598, + "grad_norm": 0.5216950567724785, + "learning_rate": 0.0002995448148563718, + "loss": 3.2383432388305664, + "step": 2510, + "token_acc": 0.27730205157462934 + }, + { + "epoch": 1.472002345353269, + "grad_norm": 0.49669954943848504, + "learning_rate": 0.00029954368242565726, + "loss": 3.2582168579101562, + "step": 2511, + "token_acc": 0.2735029777471202 + }, + { + "epoch": 1.4725886836704778, + "grad_norm": 0.5435656498050259, + "learning_rate": 0.0002995425485901814, + "loss": 3.3101158142089844, + "step": 2512, + "token_acc": 0.26606064979716393 + }, + { + "epoch": 1.473175021987687, + "grad_norm": 0.4658556567794753, + "learning_rate": 0.00029954141334995475, + "loss": 3.258605480194092, + "step": 2513, + "token_acc": 0.27394187380894275 + }, + { + "epoch": 1.4737613603048958, + "grad_norm": 0.4794428128559778, + "learning_rate": 0.000299540276704988, + "loss": 3.2497177124023438, + "step": 2514, + "token_acc": 0.27413987613493496 + }, + { + "epoch": 1.474347698622105, + "grad_norm": 0.3791618840094815, + "learning_rate": 0.0002995391386552919, + "loss": 3.2589731216430664, + "step": 2515, + "token_acc": 0.2722100528946372 + }, + { + "epoch": 1.474934036939314, + "grad_norm": 0.48772423027768735, + "learning_rate": 0.00029953799920087715, + "loss": 3.268817663192749, + "step": 2516, + "token_acc": 0.2708611584304379 + }, + { + "epoch": 1.4755203752565231, + "grad_norm": 0.508323440313134, + "learning_rate": 0.0002995368583417544, + "loss": 3.2834722995758057, + "step": 2517, + "token_acc": 0.2681912789286397 + }, + { + "epoch": 1.476106713573732, + "grad_norm": 0.5191147168077236, + "learning_rate": 0.00029953571607793433, + "loss": 3.2675676345825195, + "step": 2518, + "token_acc": 0.271583866244889 + }, + { + "epoch": 1.4766930518909411, + "grad_norm": 0.45053672047382126, + "learning_rate": 0.0002995345724094277, + "loss": 3.214259386062622, + "step": 2519, + "token_acc": 0.27819643693603263 + }, + { + "epoch": 1.47727939020815, + "grad_norm": 0.6000153775055845, + "learning_rate": 0.0002995334273362452, + "loss": 3.277830123901367, + "step": 2520, + "token_acc": 0.26850099042119374 + }, + { + "epoch": 1.4778657285253591, + "grad_norm": 0.6966706102589892, + "learning_rate": 0.00029953228085839777, + "loss": 3.2829794883728027, + "step": 2521, + "token_acc": 0.26748123373438953 + }, + { + "epoch": 1.4784520668425682, + "grad_norm": 0.5717307347648334, + "learning_rate": 0.00029953113297589604, + "loss": 3.198235034942627, + "step": 2522, + "token_acc": 0.27981393167070595 + }, + { + "epoch": 1.4790384051597771, + "grad_norm": 0.5715470010392587, + "learning_rate": 0.0002995299836887507, + "loss": 3.3156356811523438, + "step": 2523, + "token_acc": 0.26235408896485907 + }, + { + "epoch": 1.4796247434769862, + "grad_norm": 0.5552363518654317, + "learning_rate": 0.0002995288329969728, + "loss": 3.2741451263427734, + "step": 2524, + "token_acc": 0.2698504146005226 + }, + { + "epoch": 1.4802110817941951, + "grad_norm": 0.4539637922710809, + "learning_rate": 0.0002995276809005729, + "loss": 3.2360451221466064, + "step": 2525, + "token_acc": 0.27472926261265496 + }, + { + "epoch": 1.4807974201114043, + "grad_norm": 0.5802624707121228, + "learning_rate": 0.000299526527399562, + "loss": 3.282024383544922, + "step": 2526, + "token_acc": 0.2711384485135072 + }, + { + "epoch": 1.4813837584286134, + "grad_norm": 0.4483308268578799, + "learning_rate": 0.00029952537249395086, + "loss": 3.2770121097564697, + "step": 2527, + "token_acc": 0.26936493499545233 + }, + { + "epoch": 1.4819700967458225, + "grad_norm": 0.49547409787206204, + "learning_rate": 0.00029952421618375033, + "loss": 3.2745680809020996, + "step": 2528, + "token_acc": 0.26975465639049456 + }, + { + "epoch": 1.4825564350630314, + "grad_norm": 0.5260362243889769, + "learning_rate": 0.00029952305846897125, + "loss": 3.2819323539733887, + "step": 2529, + "token_acc": 0.2681352753574125 + }, + { + "epoch": 1.4831427733802405, + "grad_norm": 0.4988861512922218, + "learning_rate": 0.0002995218993496245, + "loss": 3.2443206310272217, + "step": 2530, + "token_acc": 0.2739736906149531 + }, + { + "epoch": 1.4837291116974494, + "grad_norm": 0.4954154624170425, + "learning_rate": 0.00029952073882572104, + "loss": 3.2632195949554443, + "step": 2531, + "token_acc": 0.2720941903107508 + }, + { + "epoch": 1.4843154500146585, + "grad_norm": 0.4384017291872042, + "learning_rate": 0.0002995195768972717, + "loss": 3.2593283653259277, + "step": 2532, + "token_acc": 0.2712260456708239 + }, + { + "epoch": 1.4849017883318676, + "grad_norm": 0.5259080653794915, + "learning_rate": 0.00029951841356428744, + "loss": 3.2523694038391113, + "step": 2533, + "token_acc": 0.27217687126680334 + }, + { + "epoch": 1.4854881266490765, + "grad_norm": 0.5265600721444553, + "learning_rate": 0.0002995172488267791, + "loss": 3.270918369293213, + "step": 2534, + "token_acc": 0.2706597149881245 + }, + { + "epoch": 1.4860744649662856, + "grad_norm": 0.4707641302751526, + "learning_rate": 0.00029951608268475775, + "loss": 3.2479538917541504, + "step": 2535, + "token_acc": 0.27229970352884797 + }, + { + "epoch": 1.4866608032834945, + "grad_norm": 0.44607243468239133, + "learning_rate": 0.00029951491513823425, + "loss": 3.277556896209717, + "step": 2536, + "token_acc": 0.26924773143384073 + }, + { + "epoch": 1.4872471416007036, + "grad_norm": 0.4228502518496769, + "learning_rate": 0.0002995137461872196, + "loss": 3.2539877891540527, + "step": 2537, + "token_acc": 0.2751127456955257 + }, + { + "epoch": 1.4878334799179127, + "grad_norm": 0.49795194784284313, + "learning_rate": 0.00029951257583172474, + "loss": 3.2471823692321777, + "step": 2538, + "token_acc": 0.2748196531822004 + }, + { + "epoch": 1.4884198182351216, + "grad_norm": 0.4719773575178683, + "learning_rate": 0.0002995114040717608, + "loss": 3.24350905418396, + "step": 2539, + "token_acc": 0.2763765569563413 + }, + { + "epoch": 1.4890061565523307, + "grad_norm": 0.45785338985826807, + "learning_rate": 0.00029951023090733856, + "loss": 3.3011040687561035, + "step": 2540, + "token_acc": 0.2656882657065162 + }, + { + "epoch": 1.4895924948695396, + "grad_norm": 0.4834041683333375, + "learning_rate": 0.00029950905633846926, + "loss": 3.2872345447540283, + "step": 2541, + "token_acc": 0.26841136074257543 + }, + { + "epoch": 1.4901788331867487, + "grad_norm": 0.4940169380604645, + "learning_rate": 0.00029950788036516376, + "loss": 3.238436698913574, + "step": 2542, + "token_acc": 0.2766128439210413 + }, + { + "epoch": 1.4907651715039578, + "grad_norm": 0.4058187144725442, + "learning_rate": 0.00029950670298743324, + "loss": 3.281667470932007, + "step": 2543, + "token_acc": 0.2691628245446668 + }, + { + "epoch": 1.491351509821167, + "grad_norm": 0.49064419683017546, + "learning_rate": 0.00029950552420528875, + "loss": 3.2514026165008545, + "step": 2544, + "token_acc": 0.2731924800712513 + }, + { + "epoch": 1.4919378481383758, + "grad_norm": 0.4368608617111464, + "learning_rate": 0.0002995043440187413, + "loss": 3.2364449501037598, + "step": 2545, + "token_acc": 0.27491803722871 + }, + { + "epoch": 1.492524186455585, + "grad_norm": 0.4860734974790874, + "learning_rate": 0.00029950316242780196, + "loss": 3.2919654846191406, + "step": 2546, + "token_acc": 0.26726588918143573 + }, + { + "epoch": 1.4931105247727938, + "grad_norm": 0.5389055302919861, + "learning_rate": 0.00029950197943248187, + "loss": 3.270216941833496, + "step": 2547, + "token_acc": 0.26860392407339323 + }, + { + "epoch": 1.493696863090003, + "grad_norm": 0.604460852909286, + "learning_rate": 0.00029950079503279217, + "loss": 3.2792298793792725, + "step": 2548, + "token_acc": 0.2682338042660968 + }, + { + "epoch": 1.494283201407212, + "grad_norm": 0.6795394845777744, + "learning_rate": 0.00029949960922874395, + "loss": 3.2567477226257324, + "step": 2549, + "token_acc": 0.27264136418897755 + }, + { + "epoch": 1.494869539724421, + "grad_norm": 0.7381154273102846, + "learning_rate": 0.00029949842202034834, + "loss": 3.2532291412353516, + "step": 2550, + "token_acc": 0.268973641211415 + }, + { + "epoch": 1.49545587804163, + "grad_norm": 0.6467591076161764, + "learning_rate": 0.0002994972334076165, + "loss": 3.2274885177612305, + "step": 2551, + "token_acc": 0.2757874817666925 + }, + { + "epoch": 1.496042216358839, + "grad_norm": 0.5478815680891119, + "learning_rate": 0.0002994960433905596, + "loss": 3.291308641433716, + "step": 2552, + "token_acc": 0.26750355269814396 + }, + { + "epoch": 1.496628554676048, + "grad_norm": 0.5303150284439865, + "learning_rate": 0.00029949485196918886, + "loss": 3.228041172027588, + "step": 2553, + "token_acc": 0.27546946845338643 + }, + { + "epoch": 1.4972148929932572, + "grad_norm": 0.6857198423433041, + "learning_rate": 0.00029949365914351544, + "loss": 3.227492570877075, + "step": 2554, + "token_acc": 0.2766444544376763 + }, + { + "epoch": 1.4978012313104663, + "grad_norm": 0.5785615169596057, + "learning_rate": 0.0002994924649135504, + "loss": 3.2660446166992188, + "step": 2555, + "token_acc": 0.2707877834578582 + }, + { + "epoch": 1.4983875696276752, + "grad_norm": 0.5484608991133217, + "learning_rate": 0.00029949126927930527, + "loss": 3.2557449340820312, + "step": 2556, + "token_acc": 0.2738700768334664 + }, + { + "epoch": 1.4989739079448843, + "grad_norm": 0.6157931503054472, + "learning_rate": 0.00029949007224079106, + "loss": 3.308929443359375, + "step": 2557, + "token_acc": 0.2669024173319344 + }, + { + "epoch": 1.4995602462620932, + "grad_norm": 0.5611333465156132, + "learning_rate": 0.000299488873798019, + "loss": 3.266162157058716, + "step": 2558, + "token_acc": 0.27174382703925715 + }, + { + "epoch": 1.5001465845793023, + "grad_norm": 0.5025520046643134, + "learning_rate": 0.00029948767395100045, + "loss": 3.2401084899902344, + "step": 2559, + "token_acc": 0.27444810720703383 + }, + { + "epoch": 1.5007329228965114, + "grad_norm": 0.5886691227972639, + "learning_rate": 0.0002994864726997466, + "loss": 3.246023654937744, + "step": 2560, + "token_acc": 0.27449062796746626 + }, + { + "epoch": 1.5013192612137203, + "grad_norm": 0.4680933992063443, + "learning_rate": 0.0002994852700442689, + "loss": 3.238621711730957, + "step": 2561, + "token_acc": 0.27515372696794815 + }, + { + "epoch": 1.5019055995309294, + "grad_norm": 0.5113795499723659, + "learning_rate": 0.0002994840659845784, + "loss": 3.309816360473633, + "step": 2562, + "token_acc": 0.26539564393491394 + }, + { + "epoch": 1.5024919378481383, + "grad_norm": 0.5286890906881446, + "learning_rate": 0.00029948286052068656, + "loss": 3.233323574066162, + "step": 2563, + "token_acc": 0.2752145175354863 + }, + { + "epoch": 1.5030782761653474, + "grad_norm": 0.4918747975834918, + "learning_rate": 0.0002994816536526047, + "loss": 3.2589094638824463, + "step": 2564, + "token_acc": 0.27382327151618374 + }, + { + "epoch": 1.5036646144825565, + "grad_norm": 0.5063870917674708, + "learning_rate": 0.0002994804453803441, + "loss": 3.249185085296631, + "step": 2565, + "token_acc": 0.2723194007251863 + }, + { + "epoch": 1.5042509527997656, + "grad_norm": 0.5342237900882338, + "learning_rate": 0.00029947923570391614, + "loss": 3.2482357025146484, + "step": 2566, + "token_acc": 0.2719251910233412 + }, + { + "epoch": 1.5048372911169745, + "grad_norm": 0.4536384078553172, + "learning_rate": 0.00029947802462333223, + "loss": 3.2479400634765625, + "step": 2567, + "token_acc": 0.2740501510749105 + }, + { + "epoch": 1.5054236294341834, + "grad_norm": 0.440432686457559, + "learning_rate": 0.00029947681213860367, + "loss": 3.255180835723877, + "step": 2568, + "token_acc": 0.2716750321588014 + }, + { + "epoch": 1.5060099677513925, + "grad_norm": 0.45371671423950166, + "learning_rate": 0.0002994755982497419, + "loss": 3.273400068283081, + "step": 2569, + "token_acc": 0.26875115146729833 + }, + { + "epoch": 1.5065963060686016, + "grad_norm": 0.5039327713370054, + "learning_rate": 0.0002994743829567583, + "loss": 3.2287909984588623, + "step": 2570, + "token_acc": 0.2754379212781892 + }, + { + "epoch": 1.5071826443858107, + "grad_norm": 0.446406781702838, + "learning_rate": 0.00029947316625966426, + "loss": 3.2684402465820312, + "step": 2571, + "token_acc": 0.2712274565912423 + }, + { + "epoch": 1.5077689827030196, + "grad_norm": 0.5615606327568824, + "learning_rate": 0.00029947194815847127, + "loss": 3.2657227516174316, + "step": 2572, + "token_acc": 0.2721600185111388 + }, + { + "epoch": 1.5083553210202285, + "grad_norm": 0.5297100396298892, + "learning_rate": 0.00029947072865319077, + "loss": 3.244450092315674, + "step": 2573, + "token_acc": 0.2752240192537881 + }, + { + "epoch": 1.5089416593374376, + "grad_norm": 0.4599382675637798, + "learning_rate": 0.00029946950774383413, + "loss": 3.282733917236328, + "step": 2574, + "token_acc": 0.26875566925201544 + }, + { + "epoch": 1.5095279976546467, + "grad_norm": 0.43347404231698083, + "learning_rate": 0.0002994682854304129, + "loss": 3.2786543369293213, + "step": 2575, + "token_acc": 0.2705432246026227 + }, + { + "epoch": 1.5101143359718558, + "grad_norm": 0.488587773862481, + "learning_rate": 0.00029946706171293856, + "loss": 3.239985466003418, + "step": 2576, + "token_acc": 0.27749877542602286 + }, + { + "epoch": 1.5107006742890647, + "grad_norm": 0.4545033777558242, + "learning_rate": 0.0002994658365914226, + "loss": 3.250013828277588, + "step": 2577, + "token_acc": 0.27398447090441574 + }, + { + "epoch": 1.5112870126062738, + "grad_norm": 0.391870837510196, + "learning_rate": 0.0002994646100658765, + "loss": 3.305877447128296, + "step": 2578, + "token_acc": 0.2647496858888263 + }, + { + "epoch": 1.5118733509234827, + "grad_norm": 0.4191626082476612, + "learning_rate": 0.00029946338213631177, + "loss": 3.2644736766815186, + "step": 2579, + "token_acc": 0.2711852438823465 + }, + { + "epoch": 1.5124596892406919, + "grad_norm": 0.42134967997244777, + "learning_rate": 0.00029946215280274, + "loss": 3.2502541542053223, + "step": 2580, + "token_acc": 0.27193422706976217 + }, + { + "epoch": 1.513046027557901, + "grad_norm": 0.47330365609355307, + "learning_rate": 0.0002994609220651726, + "loss": 3.3019003868103027, + "step": 2581, + "token_acc": 0.26723835582697253 + }, + { + "epoch": 1.51363236587511, + "grad_norm": 0.5174094776256812, + "learning_rate": 0.00029945968992362135, + "loss": 3.304482936859131, + "step": 2582, + "token_acc": 0.26464804916643586 + }, + { + "epoch": 1.514218704192319, + "grad_norm": 0.5783972425432596, + "learning_rate": 0.0002994584563780977, + "loss": 3.2893614768981934, + "step": 2583, + "token_acc": 0.268629930680548 + }, + { + "epoch": 1.5148050425095279, + "grad_norm": 0.5168786804593303, + "learning_rate": 0.00029945722142861323, + "loss": 3.2524969577789307, + "step": 2584, + "token_acc": 0.27404341412905603 + }, + { + "epoch": 1.515391380826737, + "grad_norm": 0.4719959664394992, + "learning_rate": 0.0002994559850751796, + "loss": 3.265166997909546, + "step": 2585, + "token_acc": 0.272577012544325 + }, + { + "epoch": 1.515977719143946, + "grad_norm": 0.5017331559458807, + "learning_rate": 0.00029945474731780827, + "loss": 3.2844796180725098, + "step": 2586, + "token_acc": 0.26557221925600133 + }, + { + "epoch": 1.5165640574611552, + "grad_norm": 0.46152632999954957, + "learning_rate": 0.0002994535081565111, + "loss": 3.2420570850372314, + "step": 2587, + "token_acc": 0.27436457048353646 + }, + { + "epoch": 1.517150395778364, + "grad_norm": 0.5222176614826465, + "learning_rate": 0.00029945226759129956, + "loss": 3.255290985107422, + "step": 2588, + "token_acc": 0.27016946108801637 + }, + { + "epoch": 1.5177367340955732, + "grad_norm": 0.49994409525667705, + "learning_rate": 0.0002994510256221854, + "loss": 3.2530574798583984, + "step": 2589, + "token_acc": 0.2727175178947644 + }, + { + "epoch": 1.518323072412782, + "grad_norm": 0.4409735661011459, + "learning_rate": 0.00029944978224918017, + "loss": 3.2479567527770996, + "step": 2590, + "token_acc": 0.2735926242682123 + }, + { + "epoch": 1.5189094107299912, + "grad_norm": 0.46460038040727386, + "learning_rate": 0.0002994485374722957, + "loss": 3.2456612586975098, + "step": 2591, + "token_acc": 0.2726161522610505 + }, + { + "epoch": 1.5194957490472003, + "grad_norm": 0.41028938097018913, + "learning_rate": 0.00029944729129154356, + "loss": 3.2668919563293457, + "step": 2592, + "token_acc": 0.27042461135299656 + }, + { + "epoch": 1.5200820873644094, + "grad_norm": 0.5114396729619312, + "learning_rate": 0.00029944604370693556, + "loss": 3.2745766639709473, + "step": 2593, + "token_acc": 0.27089257463110983 + }, + { + "epoch": 1.5206684256816183, + "grad_norm": 0.48592698234181303, + "learning_rate": 0.0002994447947184833, + "loss": 3.260713815689087, + "step": 2594, + "token_acc": 0.2726756232001147 + }, + { + "epoch": 1.5212547639988272, + "grad_norm": 0.44329972163254655, + "learning_rate": 0.0002994435443261986, + "loss": 3.282197952270508, + "step": 2595, + "token_acc": 0.27206243577044925 + }, + { + "epoch": 1.5218411023160363, + "grad_norm": 0.41586322560510586, + "learning_rate": 0.0002994422925300931, + "loss": 3.2038583755493164, + "step": 2596, + "token_acc": 0.2789060609726591 + }, + { + "epoch": 1.5224274406332454, + "grad_norm": 0.5548817611130542, + "learning_rate": 0.00029944103933017877, + "loss": 3.2753467559814453, + "step": 2597, + "token_acc": 0.2711772745861943 + }, + { + "epoch": 1.5230137789504545, + "grad_norm": 0.5008536769083188, + "learning_rate": 0.00029943978472646716, + "loss": 3.2493529319763184, + "step": 2598, + "token_acc": 0.27152359422006217 + }, + { + "epoch": 1.5236001172676634, + "grad_norm": 0.5939104116254532, + "learning_rate": 0.00029943852871897015, + "loss": 3.245530843734741, + "step": 2599, + "token_acc": 0.2729049298462572 + }, + { + "epoch": 1.5241864555848723, + "grad_norm": 0.6654278794585069, + "learning_rate": 0.0002994372713076995, + "loss": 3.2976255416870117, + "step": 2600, + "token_acc": 0.26559652839716935 + }, + { + "epoch": 1.5247727939020814, + "grad_norm": 0.5722669192820382, + "learning_rate": 0.0002994360124926672, + "loss": 3.202688694000244, + "step": 2601, + "token_acc": 0.2794341849182823 + }, + { + "epoch": 1.5253591322192905, + "grad_norm": 0.4347782463005387, + "learning_rate": 0.0002994347522738848, + "loss": 3.2058236598968506, + "step": 2602, + "token_acc": 0.2794052739756437 + }, + { + "epoch": 1.5259454705364996, + "grad_norm": 0.4843206431020174, + "learning_rate": 0.0002994334906513643, + "loss": 3.234999895095825, + "step": 2603, + "token_acc": 0.27464556109033733 + }, + { + "epoch": 1.5265318088537085, + "grad_norm": 0.46618686937105536, + "learning_rate": 0.0002994322276251175, + "loss": 3.247464656829834, + "step": 2604, + "token_acc": 0.27201700205884305 + }, + { + "epoch": 1.5271181471709177, + "grad_norm": 0.49863968721517776, + "learning_rate": 0.00029943096319515634, + "loss": 3.25022292137146, + "step": 2605, + "token_acc": 0.2744658458134877 + }, + { + "epoch": 1.5277044854881265, + "grad_norm": 0.42156666304996887, + "learning_rate": 0.0002994296973614926, + "loss": 3.2403554916381836, + "step": 2606, + "token_acc": 0.27340907657041913 + }, + { + "epoch": 1.5282908238053357, + "grad_norm": 0.4976139037400748, + "learning_rate": 0.0002994284301241382, + "loss": 3.2825558185577393, + "step": 2607, + "token_acc": 0.2681418460954357 + }, + { + "epoch": 1.5288771621225448, + "grad_norm": 0.5069534446103522, + "learning_rate": 0.0002994271614831051, + "loss": 3.2484450340270996, + "step": 2608, + "token_acc": 0.27271462623160453 + }, + { + "epoch": 1.5294635004397539, + "grad_norm": 0.5699437655237528, + "learning_rate": 0.0002994258914384051, + "loss": 3.2138636112213135, + "step": 2609, + "token_acc": 0.2766587980420409 + }, + { + "epoch": 1.5300498387569628, + "grad_norm": 0.45998378336695894, + "learning_rate": 0.0002994246199900503, + "loss": 3.2879927158355713, + "step": 2610, + "token_acc": 0.2679223544928722 + }, + { + "epoch": 1.5306361770741717, + "grad_norm": 0.6154288515773049, + "learning_rate": 0.0002994233471380525, + "loss": 3.2672739028930664, + "step": 2611, + "token_acc": 0.27183243794909523 + }, + { + "epoch": 1.5312225153913808, + "grad_norm": 0.56610889987781, + "learning_rate": 0.00029942207288242366, + "loss": 3.26328182220459, + "step": 2612, + "token_acc": 0.27076172494849576 + }, + { + "epoch": 1.5318088537085899, + "grad_norm": 0.5501456404692603, + "learning_rate": 0.0002994207972231759, + "loss": 3.206486701965332, + "step": 2613, + "token_acc": 0.27696600736541566 + }, + { + "epoch": 1.532395192025799, + "grad_norm": 0.517087751685438, + "learning_rate": 0.000299419520160321, + "loss": 3.2929186820983887, + "step": 2614, + "token_acc": 0.2689949875132426 + }, + { + "epoch": 1.5329815303430079, + "grad_norm": 0.5075746750818896, + "learning_rate": 0.0002994182416938711, + "loss": 3.207481861114502, + "step": 2615, + "token_acc": 0.27807730634079825 + }, + { + "epoch": 1.533567868660217, + "grad_norm": 0.4878958821266842, + "learning_rate": 0.0002994169618238382, + "loss": 3.2330331802368164, + "step": 2616, + "token_acc": 0.27519103656947774 + }, + { + "epoch": 1.5341542069774259, + "grad_norm": 0.4831997251494441, + "learning_rate": 0.00029941568055023415, + "loss": 3.2875823974609375, + "step": 2617, + "token_acc": 0.2694377162413971 + }, + { + "epoch": 1.534740545294635, + "grad_norm": 0.5153014499430449, + "learning_rate": 0.00029941439787307126, + "loss": 3.2962586879730225, + "step": 2618, + "token_acc": 0.26776753712237583 + }, + { + "epoch": 1.535326883611844, + "grad_norm": 0.5186886324905743, + "learning_rate": 0.0002994131137923614, + "loss": 3.2489466667175293, + "step": 2619, + "token_acc": 0.27368841690673507 + }, + { + "epoch": 1.5359132219290532, + "grad_norm": 0.48159006395388476, + "learning_rate": 0.0002994118283081166, + "loss": 3.233454704284668, + "step": 2620, + "token_acc": 0.27397085893084916 + }, + { + "epoch": 1.536499560246262, + "grad_norm": 0.4378062199857777, + "learning_rate": 0.0002994105414203491, + "loss": 3.257981061935425, + "step": 2621, + "token_acc": 0.27231493367565257 + }, + { + "epoch": 1.537085898563471, + "grad_norm": 0.4942286525519708, + "learning_rate": 0.00029940925312907086, + "loss": 3.2460618019104004, + "step": 2622, + "token_acc": 0.27283662691695704 + }, + { + "epoch": 1.53767223688068, + "grad_norm": 0.5398683189916152, + "learning_rate": 0.00029940796343429406, + "loss": 3.278038740158081, + "step": 2623, + "token_acc": 0.2706649588910444 + }, + { + "epoch": 1.5382585751978892, + "grad_norm": 0.5380032709716642, + "learning_rate": 0.0002994066723360307, + "loss": 3.3141283988952637, + "step": 2624, + "token_acc": 0.2665029070840076 + }, + { + "epoch": 1.5388449135150983, + "grad_norm": 0.4758418024750476, + "learning_rate": 0.00029940537983429307, + "loss": 3.2510204315185547, + "step": 2625, + "token_acc": 0.2709560046683319 + }, + { + "epoch": 1.5394312518323072, + "grad_norm": 0.602552575717143, + "learning_rate": 0.0002994040859290932, + "loss": 3.279886484146118, + "step": 2626, + "token_acc": 0.2686613327256005 + }, + { + "epoch": 1.5400175901495161, + "grad_norm": 0.6005688332614376, + "learning_rate": 0.0002994027906204432, + "loss": 3.2397189140319824, + "step": 2627, + "token_acc": 0.2728930623339717 + }, + { + "epoch": 1.5406039284667252, + "grad_norm": 0.5543763550345601, + "learning_rate": 0.0002994014939083553, + "loss": 3.2142765522003174, + "step": 2628, + "token_acc": 0.27727140988171306 + }, + { + "epoch": 1.5411902667839343, + "grad_norm": 0.5186816863153937, + "learning_rate": 0.0002994001957928418, + "loss": 3.245211601257324, + "step": 2629, + "token_acc": 0.2738477770850319 + }, + { + "epoch": 1.5417766051011434, + "grad_norm": 0.5585858915856783, + "learning_rate": 0.00029939889627391466, + "loss": 3.2058024406433105, + "step": 2630, + "token_acc": 0.27804180291187236 + }, + { + "epoch": 1.5423629434183523, + "grad_norm": 0.5168940902518459, + "learning_rate": 0.0002993975953515863, + "loss": 3.2340588569641113, + "step": 2631, + "token_acc": 0.2759878002448394 + }, + { + "epoch": 1.5429492817355615, + "grad_norm": 0.508829846026294, + "learning_rate": 0.00029939629302586877, + "loss": 3.201805353164673, + "step": 2632, + "token_acc": 0.28036247465123154 + }, + { + "epoch": 1.5435356200527703, + "grad_norm": 0.5624196940686363, + "learning_rate": 0.0002993949892967744, + "loss": 3.2233927249908447, + "step": 2633, + "token_acc": 0.2764161731782088 + }, + { + "epoch": 1.5441219583699795, + "grad_norm": 0.5209484581224846, + "learning_rate": 0.00029939368416431544, + "loss": 3.266388416290283, + "step": 2634, + "token_acc": 0.2720329470497581 + }, + { + "epoch": 1.5447082966871886, + "grad_norm": 0.5955666412975484, + "learning_rate": 0.00029939237762850415, + "loss": 3.2704360485076904, + "step": 2635, + "token_acc": 0.2716294378773017 + }, + { + "epoch": 1.5452946350043977, + "grad_norm": 0.5188519140174653, + "learning_rate": 0.00029939106968935274, + "loss": 3.205293655395508, + "step": 2636, + "token_acc": 0.2799473088335506 + }, + { + "epoch": 1.5458809733216066, + "grad_norm": 0.4332729019256432, + "learning_rate": 0.0002993897603468736, + "loss": 3.257351875305176, + "step": 2637, + "token_acc": 0.2719226067026084 + }, + { + "epoch": 1.5464673116388155, + "grad_norm": 0.5313701918927636, + "learning_rate": 0.00029938844960107885, + "loss": 3.2809062004089355, + "step": 2638, + "token_acc": 0.2687805468310211 + }, + { + "epoch": 1.5470536499560246, + "grad_norm": 0.5217645341071558, + "learning_rate": 0.00029938713745198103, + "loss": 3.228593349456787, + "step": 2639, + "token_acc": 0.27617740058655116 + }, + { + "epoch": 1.5476399882732337, + "grad_norm": 0.49795970880218154, + "learning_rate": 0.0002993858238995923, + "loss": 3.2364792823791504, + "step": 2640, + "token_acc": 0.27504581632275815 + }, + { + "epoch": 1.5482263265904428, + "grad_norm": 0.41881416357796636, + "learning_rate": 0.00029938450894392507, + "loss": 3.3082218170166016, + "step": 2641, + "token_acc": 0.2665928531224979 + }, + { + "epoch": 1.5488126649076517, + "grad_norm": 0.42763663208929126, + "learning_rate": 0.0002993831925849917, + "loss": 3.226113796234131, + "step": 2642, + "token_acc": 0.2754932061667102 + }, + { + "epoch": 1.5493990032248608, + "grad_norm": 0.4859469581253531, + "learning_rate": 0.00029938187482280446, + "loss": 3.2522401809692383, + "step": 2643, + "token_acc": 0.27337353322900587 + }, + { + "epoch": 1.5499853415420697, + "grad_norm": 0.5248984494678559, + "learning_rate": 0.0002993805556573759, + "loss": 3.245025157928467, + "step": 2644, + "token_acc": 0.2746379663966521 + }, + { + "epoch": 1.5505716798592788, + "grad_norm": 0.5210220205602817, + "learning_rate": 0.00029937923508871825, + "loss": 3.226012706756592, + "step": 2645, + "token_acc": 0.27477981698167403 + }, + { + "epoch": 1.551158018176488, + "grad_norm": 0.5661392376928134, + "learning_rate": 0.00029937791311684394, + "loss": 3.2766690254211426, + "step": 2646, + "token_acc": 0.26932523335245345 + }, + { + "epoch": 1.551744356493697, + "grad_norm": 0.4764228029122335, + "learning_rate": 0.00029937658974176553, + "loss": 3.226942539215088, + "step": 2647, + "token_acc": 0.2755716987730077 + }, + { + "epoch": 1.552330694810906, + "grad_norm": 0.5232041755602224, + "learning_rate": 0.0002993752649634952, + "loss": 3.2254648208618164, + "step": 2648, + "token_acc": 0.2772045858875839 + }, + { + "epoch": 1.5529170331281148, + "grad_norm": 0.4967070088009408, + "learning_rate": 0.0002993739387820457, + "loss": 3.2678744792938232, + "step": 2649, + "token_acc": 0.27159833341439293 + }, + { + "epoch": 1.553503371445324, + "grad_norm": 0.5475988180413215, + "learning_rate": 0.0002993726111974292, + "loss": 3.2653234004974365, + "step": 2650, + "token_acc": 0.2714494517638791 + }, + { + "epoch": 1.554089709762533, + "grad_norm": 0.6757056347179742, + "learning_rate": 0.0002993712822096584, + "loss": 3.2989320755004883, + "step": 2651, + "token_acc": 0.2675348856242583 + }, + { + "epoch": 1.5546760480797421, + "grad_norm": 0.5549801353491003, + "learning_rate": 0.00029936995181874563, + "loss": 3.2337820529937744, + "step": 2652, + "token_acc": 0.27528565271777405 + }, + { + "epoch": 1.555262386396951, + "grad_norm": 0.5457228611590723, + "learning_rate": 0.00029936862002470345, + "loss": 3.2703475952148438, + "step": 2653, + "token_acc": 0.2722524574066917 + }, + { + "epoch": 1.55584872471416, + "grad_norm": 0.5383024036623529, + "learning_rate": 0.0002993672868275444, + "loss": 3.264014720916748, + "step": 2654, + "token_acc": 0.2710610812624135 + }, + { + "epoch": 1.556435063031369, + "grad_norm": 0.5650581786893103, + "learning_rate": 0.0002993659522272809, + "loss": 3.2954115867614746, + "step": 2655, + "token_acc": 0.2676637079668734 + }, + { + "epoch": 1.5570214013485781, + "grad_norm": 0.44519816202854007, + "learning_rate": 0.0002993646162239256, + "loss": 3.2726893424987793, + "step": 2656, + "token_acc": 0.2704731085852894 + }, + { + "epoch": 1.5576077396657872, + "grad_norm": 0.46865346415689174, + "learning_rate": 0.00029936327881749093, + "loss": 3.277496576309204, + "step": 2657, + "token_acc": 0.2702793251525586 + }, + { + "epoch": 1.5581940779829961, + "grad_norm": 0.4602070319871101, + "learning_rate": 0.00029936194000798963, + "loss": 3.246164321899414, + "step": 2658, + "token_acc": 0.2727380072285198 + }, + { + "epoch": 1.5587804163002053, + "grad_norm": 0.4626760257442825, + "learning_rate": 0.0002993605997954341, + "loss": 3.2376041412353516, + "step": 2659, + "token_acc": 0.2741505412078711 + }, + { + "epoch": 1.5593667546174141, + "grad_norm": 0.42804034608065045, + "learning_rate": 0.000299359258179837, + "loss": 3.24965763092041, + "step": 2660, + "token_acc": 0.2718377953399089 + }, + { + "epoch": 1.5599530929346233, + "grad_norm": 0.41635655726721815, + "learning_rate": 0.00029935791516121096, + "loss": 3.249459743499756, + "step": 2661, + "token_acc": 0.27161737626031757 + }, + { + "epoch": 1.5605394312518324, + "grad_norm": 0.42683243209117, + "learning_rate": 0.0002993565707395686, + "loss": 3.229635238647461, + "step": 2662, + "token_acc": 0.27518101071330847 + }, + { + "epoch": 1.5611257695690415, + "grad_norm": 0.44940662108950113, + "learning_rate": 0.00029935522491492247, + "loss": 3.217538833618164, + "step": 2663, + "token_acc": 0.2748281791377717 + }, + { + "epoch": 1.5617121078862504, + "grad_norm": 0.48855416948332153, + "learning_rate": 0.00029935387768728524, + "loss": 3.2292075157165527, + "step": 2664, + "token_acc": 0.2742352341409843 + }, + { + "epoch": 1.5622984462034593, + "grad_norm": 0.5076304708868539, + "learning_rate": 0.0002993525290566697, + "loss": 3.2850582599639893, + "step": 2665, + "token_acc": 0.2696157428239554 + }, + { + "epoch": 1.5628847845206684, + "grad_norm": 0.5492576815214156, + "learning_rate": 0.0002993511790230883, + "loss": 3.2378196716308594, + "step": 2666, + "token_acc": 0.2737053469185579 + }, + { + "epoch": 1.5634711228378775, + "grad_norm": 0.5677140969281658, + "learning_rate": 0.00029934982758655383, + "loss": 3.2756845951080322, + "step": 2667, + "token_acc": 0.2711083272361071 + }, + { + "epoch": 1.5640574611550866, + "grad_norm": 0.5576659664714068, + "learning_rate": 0.000299348474747079, + "loss": 3.328035354614258, + "step": 2668, + "token_acc": 0.26395129240124104 + }, + { + "epoch": 1.5646437994722955, + "grad_norm": 0.4934578169744558, + "learning_rate": 0.0002993471205046765, + "loss": 3.269674301147461, + "step": 2669, + "token_acc": 0.27037367648403177 + }, + { + "epoch": 1.5652301377895046, + "grad_norm": 0.5148148350339178, + "learning_rate": 0.0002993457648593591, + "loss": 3.2371411323547363, + "step": 2670, + "token_acc": 0.2724076585328459 + }, + { + "epoch": 1.5658164761067135, + "grad_norm": 0.4760550046760065, + "learning_rate": 0.0002993444078111394, + "loss": 3.307738780975342, + "step": 2671, + "token_acc": 0.26641953934475854 + }, + { + "epoch": 1.5664028144239226, + "grad_norm": 0.4691676065456747, + "learning_rate": 0.00029934304936003026, + "loss": 3.2141170501708984, + "step": 2672, + "token_acc": 0.2774571558808868 + }, + { + "epoch": 1.5669891527411317, + "grad_norm": 0.49930497229939125, + "learning_rate": 0.0002993416895060444, + "loss": 3.2581887245178223, + "step": 2673, + "token_acc": 0.2731966198103066 + }, + { + "epoch": 1.5675754910583408, + "grad_norm": 0.5530823212722192, + "learning_rate": 0.0002993403282491947, + "loss": 3.2199554443359375, + "step": 2674, + "token_acc": 0.27857337408074684 + }, + { + "epoch": 1.5681618293755497, + "grad_norm": 0.416715109705561, + "learning_rate": 0.00029933896558949374, + "loss": 3.2289445400238037, + "step": 2675, + "token_acc": 0.2758312410126748 + }, + { + "epoch": 1.5687481676927586, + "grad_norm": 0.42689102131088197, + "learning_rate": 0.0002993376015269545, + "loss": 3.243030548095703, + "step": 2676, + "token_acc": 0.27342611275656553 + }, + { + "epoch": 1.5693345060099677, + "grad_norm": 0.40203869612757054, + "learning_rate": 0.0002993362360615897, + "loss": 3.266345500946045, + "step": 2677, + "token_acc": 0.27061593465686135 + }, + { + "epoch": 1.5699208443271768, + "grad_norm": 0.4417616604422087, + "learning_rate": 0.0002993348691934122, + "loss": 3.2280187606811523, + "step": 2678, + "token_acc": 0.2741423602023626 + }, + { + "epoch": 1.570507182644386, + "grad_norm": 0.45648041885251595, + "learning_rate": 0.0002993335009224348, + "loss": 3.225739002227783, + "step": 2679, + "token_acc": 0.2772000304427242 + }, + { + "epoch": 1.5710935209615948, + "grad_norm": 0.44526434210683724, + "learning_rate": 0.00029933213124867043, + "loss": 3.2088475227355957, + "step": 2680, + "token_acc": 0.2777221585338652 + }, + { + "epoch": 1.5716798592788037, + "grad_norm": 0.3985179985085914, + "learning_rate": 0.00029933076017213194, + "loss": 3.222714424133301, + "step": 2681, + "token_acc": 0.27779638757291053 + }, + { + "epoch": 1.5722661975960128, + "grad_norm": 0.47015768871391006, + "learning_rate": 0.00029932938769283214, + "loss": 3.2354116439819336, + "step": 2682, + "token_acc": 0.2743406095173012 + }, + { + "epoch": 1.572852535913222, + "grad_norm": 0.4333603783048628, + "learning_rate": 0.00029932801381078397, + "loss": 3.2490038871765137, + "step": 2683, + "token_acc": 0.27225055105792967 + }, + { + "epoch": 1.573438874230431, + "grad_norm": 0.43622030814722107, + "learning_rate": 0.00029932663852600034, + "loss": 3.2444024085998535, + "step": 2684, + "token_acc": 0.2730213596844202 + }, + { + "epoch": 1.57402521254764, + "grad_norm": 0.3895775479547454, + "learning_rate": 0.00029932526183849416, + "loss": 3.258131980895996, + "step": 2685, + "token_acc": 0.27289741883877516 + }, + { + "epoch": 1.574611550864849, + "grad_norm": 0.4321711157940707, + "learning_rate": 0.0002993238837482784, + "loss": 3.253261089324951, + "step": 2686, + "token_acc": 0.27271341182903397 + }, + { + "epoch": 1.575197889182058, + "grad_norm": 0.40630776391228274, + "learning_rate": 0.0002993225042553659, + "loss": 3.2729482650756836, + "step": 2687, + "token_acc": 0.26927156014512976 + }, + { + "epoch": 1.575784227499267, + "grad_norm": 0.4560080758374562, + "learning_rate": 0.00029932112335976974, + "loss": 3.1932506561279297, + "step": 2688, + "token_acc": 0.2811465122869519 + }, + { + "epoch": 1.5763705658164762, + "grad_norm": 0.5089141075761099, + "learning_rate": 0.00029931974106150284, + "loss": 3.294118881225586, + "step": 2689, + "token_acc": 0.265711969174377 + }, + { + "epoch": 1.5769569041336853, + "grad_norm": 0.487999939243647, + "learning_rate": 0.00029931835736057816, + "loss": 3.25174617767334, + "step": 2690, + "token_acc": 0.27106734028266294 + }, + { + "epoch": 1.5775432424508942, + "grad_norm": 0.49475367666993286, + "learning_rate": 0.0002993169722570087, + "loss": 3.263667583465576, + "step": 2691, + "token_acc": 0.27103474973036273 + }, + { + "epoch": 1.578129580768103, + "grad_norm": 0.5130871297944195, + "learning_rate": 0.00029931558575080753, + "loss": 3.210399866104126, + "step": 2692, + "token_acc": 0.27724994649798185 + }, + { + "epoch": 1.5787159190853122, + "grad_norm": 0.5226661877928755, + "learning_rate": 0.00029931419784198765, + "loss": 3.2222676277160645, + "step": 2693, + "token_acc": 0.27536330691895805 + }, + { + "epoch": 1.5793022574025213, + "grad_norm": 0.5056452111250206, + "learning_rate": 0.0002993128085305621, + "loss": 3.2211380004882812, + "step": 2694, + "token_acc": 0.27632935474613923 + }, + { + "epoch": 1.5798885957197304, + "grad_norm": 0.4688092746118987, + "learning_rate": 0.00029931141781654386, + "loss": 3.260437488555908, + "step": 2695, + "token_acc": 0.27175967242374927 + }, + { + "epoch": 1.5804749340369393, + "grad_norm": 0.4810384518672649, + "learning_rate": 0.00029931002569994603, + "loss": 3.218383550643921, + "step": 2696, + "token_acc": 0.27488023398934663 + }, + { + "epoch": 1.5810612723541484, + "grad_norm": 0.47643477944060103, + "learning_rate": 0.00029930863218078176, + "loss": 3.2615809440612793, + "step": 2697, + "token_acc": 0.27051356914304736 + }, + { + "epoch": 1.5816476106713573, + "grad_norm": 0.45321266835930557, + "learning_rate": 0.0002993072372590641, + "loss": 3.238734245300293, + "step": 2698, + "token_acc": 0.27531953263768555 + }, + { + "epoch": 1.5822339489885664, + "grad_norm": 0.5161826450875109, + "learning_rate": 0.0002993058409348061, + "loss": 3.2160561084747314, + "step": 2699, + "token_acc": 0.27693314867124147 + }, + { + "epoch": 1.5828202873057755, + "grad_norm": 0.4200191476602458, + "learning_rate": 0.0002993044432080209, + "loss": 3.2701847553253174, + "step": 2700, + "token_acc": 0.2706830794647455 + }, + { + "epoch": 1.5834066256229846, + "grad_norm": 0.5149505536802924, + "learning_rate": 0.0002993030440787217, + "loss": 3.25761079788208, + "step": 2701, + "token_acc": 0.27214159917348735 + }, + { + "epoch": 1.5839929639401935, + "grad_norm": 0.5389300366798491, + "learning_rate": 0.00029930164354692156, + "loss": 3.242964267730713, + "step": 2702, + "token_acc": 0.27478999582895547 + }, + { + "epoch": 1.5845793022574024, + "grad_norm": 0.5419886449745961, + "learning_rate": 0.00029930024161263367, + "loss": 3.2561569213867188, + "step": 2703, + "token_acc": 0.2730442207122274 + }, + { + "epoch": 1.5851656405746115, + "grad_norm": 0.5264087051899076, + "learning_rate": 0.00029929883827587117, + "loss": 3.213006019592285, + "step": 2704, + "token_acc": 0.2771937969630755 + }, + { + "epoch": 1.5857519788918206, + "grad_norm": 0.5082829910876778, + "learning_rate": 0.0002992974335366473, + "loss": 3.240527391433716, + "step": 2705, + "token_acc": 0.27348725277958763 + }, + { + "epoch": 1.5863383172090297, + "grad_norm": 0.4936121740552838, + "learning_rate": 0.00029929602739497523, + "loss": 3.2195537090301514, + "step": 2706, + "token_acc": 0.2755467313225083 + }, + { + "epoch": 1.5869246555262386, + "grad_norm": 0.5363283151732983, + "learning_rate": 0.00029929461985086814, + "loss": 3.261514186859131, + "step": 2707, + "token_acc": 0.2724921294936598 + }, + { + "epoch": 1.5875109938434475, + "grad_norm": 0.47631800183404005, + "learning_rate": 0.00029929321090433925, + "loss": 3.191333293914795, + "step": 2708, + "token_acc": 0.27947795971107203 + }, + { + "epoch": 1.5880973321606566, + "grad_norm": 0.41306246136794844, + "learning_rate": 0.0002992918005554019, + "loss": 3.22574520111084, + "step": 2709, + "token_acc": 0.2748240597474446 + }, + { + "epoch": 1.5886836704778657, + "grad_norm": 0.45490859389660776, + "learning_rate": 0.0002992903888040692, + "loss": 3.2315351963043213, + "step": 2710, + "token_acc": 0.2770321729886836 + }, + { + "epoch": 1.5892700087950749, + "grad_norm": 0.46520111116611357, + "learning_rate": 0.00029928897565035444, + "loss": 3.2533652782440186, + "step": 2711, + "token_acc": 0.27225410858792914 + }, + { + "epoch": 1.5898563471122837, + "grad_norm": 0.5047586063427044, + "learning_rate": 0.00029928756109427095, + "loss": 3.244349479675293, + "step": 2712, + "token_acc": 0.27429234946526354 + }, + { + "epoch": 1.5904426854294929, + "grad_norm": 0.41400183297954696, + "learning_rate": 0.000299286145135832, + "loss": 3.1837916374206543, + "step": 2713, + "token_acc": 0.2806259953448487 + }, + { + "epoch": 1.5910290237467017, + "grad_norm": 0.501299474034693, + "learning_rate": 0.0002992847277750509, + "loss": 3.243840456008911, + "step": 2714, + "token_acc": 0.27279809027869467 + }, + { + "epoch": 1.5916153620639109, + "grad_norm": 0.6278109894191083, + "learning_rate": 0.0002992833090119409, + "loss": 3.300935745239258, + "step": 2715, + "token_acc": 0.26612083525452745 + }, + { + "epoch": 1.59220170038112, + "grad_norm": 0.5981522934222099, + "learning_rate": 0.0002992818888465154, + "loss": 3.284961700439453, + "step": 2716, + "token_acc": 0.26882906429691583 + }, + { + "epoch": 1.592788038698329, + "grad_norm": 0.5970889935366824, + "learning_rate": 0.00029928046727878773, + "loss": 3.2843661308288574, + "step": 2717, + "token_acc": 0.2701027348020585 + }, + { + "epoch": 1.593374377015538, + "grad_norm": 0.5153818190393806, + "learning_rate": 0.0002992790443087712, + "loss": 3.223395824432373, + "step": 2718, + "token_acc": 0.27696721000037994 + }, + { + "epoch": 1.5939607153327469, + "grad_norm": 0.43980250099170276, + "learning_rate": 0.00029927761993647924, + "loss": 3.222972869873047, + "step": 2719, + "token_acc": 0.2748214569116013 + }, + { + "epoch": 1.594547053649956, + "grad_norm": 0.5223287700086423, + "learning_rate": 0.00029927619416192516, + "loss": 3.2458887100219727, + "step": 2720, + "token_acc": 0.273188400236354 + }, + { + "epoch": 1.595133391967165, + "grad_norm": 0.4125632968524281, + "learning_rate": 0.00029927476698512237, + "loss": 3.230388641357422, + "step": 2721, + "token_acc": 0.2733371837580833 + }, + { + "epoch": 1.5957197302843742, + "grad_norm": 0.4879504470758975, + "learning_rate": 0.00029927333840608437, + "loss": 3.23653244972229, + "step": 2722, + "token_acc": 0.2762215148940487 + }, + { + "epoch": 1.596306068601583, + "grad_norm": 0.5367539081977291, + "learning_rate": 0.00029927190842482445, + "loss": 3.249480724334717, + "step": 2723, + "token_acc": 0.27174708835914824 + }, + { + "epoch": 1.5968924069187922, + "grad_norm": 0.499365320177642, + "learning_rate": 0.0002992704770413561, + "loss": 3.2621824741363525, + "step": 2724, + "token_acc": 0.270077554037611 + }, + { + "epoch": 1.597478745236001, + "grad_norm": 0.4439137441648015, + "learning_rate": 0.00029926904425569276, + "loss": 3.209418535232544, + "step": 2725, + "token_acc": 0.2776703536729811 + }, + { + "epoch": 1.5980650835532102, + "grad_norm": 0.45933860630408435, + "learning_rate": 0.0002992676100678479, + "loss": 3.2532382011413574, + "step": 2726, + "token_acc": 0.27090482116543174 + }, + { + "epoch": 1.5986514218704193, + "grad_norm": 0.5137245456825731, + "learning_rate": 0.0002992661744778349, + "loss": 3.261685371398926, + "step": 2727, + "token_acc": 0.27290040684158084 + }, + { + "epoch": 1.5992377601876284, + "grad_norm": 0.4879246372246115, + "learning_rate": 0.00029926473748566746, + "loss": 3.224544048309326, + "step": 2728, + "token_acc": 0.2759846919768348 + }, + { + "epoch": 1.5998240985048373, + "grad_norm": 0.5040858080934653, + "learning_rate": 0.0002992632990913589, + "loss": 3.3423006534576416, + "step": 2729, + "token_acc": 0.26068816977565573 + }, + { + "epoch": 1.6004104368220462, + "grad_norm": 0.4117497050223786, + "learning_rate": 0.0002992618592949228, + "loss": 3.1993706226348877, + "step": 2730, + "token_acc": 0.2792337058871694 + }, + { + "epoch": 1.6009967751392553, + "grad_norm": 0.5381450468096101, + "learning_rate": 0.00029926041809637266, + "loss": 3.237950325012207, + "step": 2731, + "token_acc": 0.2730349412367786 + }, + { + "epoch": 1.6015831134564644, + "grad_norm": 0.536469442746549, + "learning_rate": 0.00029925897549572196, + "loss": 3.270899772644043, + "step": 2732, + "token_acc": 0.2713140905773085 + }, + { + "epoch": 1.6021694517736735, + "grad_norm": 0.41947074181675853, + "learning_rate": 0.0002992575314929844, + "loss": 3.2282652854919434, + "step": 2733, + "token_acc": 0.2757453991014926 + }, + { + "epoch": 1.6027557900908824, + "grad_norm": 0.4478211929321843, + "learning_rate": 0.0002992560860881734, + "loss": 3.2585651874542236, + "step": 2734, + "token_acc": 0.2718842639428697 + }, + { + "epoch": 1.6033421284080913, + "grad_norm": 0.4323518882342909, + "learning_rate": 0.00029925463928130264, + "loss": 3.2802717685699463, + "step": 2735, + "token_acc": 0.266565710522085 + }, + { + "epoch": 1.6039284667253004, + "grad_norm": 0.4342113935040127, + "learning_rate": 0.00029925319107238565, + "loss": 3.2652697563171387, + "step": 2736, + "token_acc": 0.2708841463414634 + }, + { + "epoch": 1.6045148050425095, + "grad_norm": 0.4983534563241922, + "learning_rate": 0.00029925174146143603, + "loss": 3.2986245155334473, + "step": 2737, + "token_acc": 0.26620232410019773 + }, + { + "epoch": 1.6051011433597187, + "grad_norm": 0.4018549426682999, + "learning_rate": 0.0002992502904484674, + "loss": 3.2220096588134766, + "step": 2738, + "token_acc": 0.2759158841423092 + }, + { + "epoch": 1.6056874816769275, + "grad_norm": 0.46023540523421264, + "learning_rate": 0.00029924883803349346, + "loss": 3.2141122817993164, + "step": 2739, + "token_acc": 0.2767691241772431 + }, + { + "epoch": 1.6062738199941367, + "grad_norm": 0.44112152099692786, + "learning_rate": 0.0002992473842165278, + "loss": 3.2150025367736816, + "step": 2740, + "token_acc": 0.2763198864767466 + }, + { + "epoch": 1.6068601583113455, + "grad_norm": 0.48571007093696955, + "learning_rate": 0.00029924592899758406, + "loss": 3.2276647090911865, + "step": 2741, + "token_acc": 0.2774720826533366 + }, + { + "epoch": 1.6074464966285547, + "grad_norm": 0.4882611553540648, + "learning_rate": 0.0002992444723766759, + "loss": 3.209761142730713, + "step": 2742, + "token_acc": 0.27640525304761554 + }, + { + "epoch": 1.6080328349457638, + "grad_norm": 0.5037432511400337, + "learning_rate": 0.00029924301435381705, + "loss": 3.2221689224243164, + "step": 2743, + "token_acc": 0.27641195846345956 + }, + { + "epoch": 1.6086191732629729, + "grad_norm": 0.4108345978356018, + "learning_rate": 0.00029924155492902125, + "loss": 3.169506311416626, + "step": 2744, + "token_acc": 0.2839673159275413 + }, + { + "epoch": 1.6092055115801818, + "grad_norm": 0.48161915532035654, + "learning_rate": 0.00029924009410230206, + "loss": 3.207735061645508, + "step": 2745, + "token_acc": 0.2782410996520518 + }, + { + "epoch": 1.6097918498973907, + "grad_norm": 0.5395821224512068, + "learning_rate": 0.00029923863187367335, + "loss": 3.244959831237793, + "step": 2746, + "token_acc": 0.27297992521679176 + }, + { + "epoch": 1.6103781882145998, + "grad_norm": 0.4860412555961269, + "learning_rate": 0.00029923716824314874, + "loss": 3.272493362426758, + "step": 2747, + "token_acc": 0.26890261920037056 + }, + { + "epoch": 1.6109645265318089, + "grad_norm": 0.4486647506364991, + "learning_rate": 0.00029923570321074204, + "loss": 3.2505991458892822, + "step": 2748, + "token_acc": 0.2723867367127596 + }, + { + "epoch": 1.611550864849018, + "grad_norm": 0.5571826196354732, + "learning_rate": 0.00029923423677646703, + "loss": 3.2746028900146484, + "step": 2749, + "token_acc": 0.268941748565943 + }, + { + "epoch": 1.6121372031662269, + "grad_norm": 0.5401522424899917, + "learning_rate": 0.00029923276894033753, + "loss": 3.2452683448791504, + "step": 2750, + "token_acc": 0.2735822378576673 + }, + { + "epoch": 1.612723541483436, + "grad_norm": 0.44585486045488004, + "learning_rate": 0.0002992312997023672, + "loss": 3.2004213333129883, + "step": 2751, + "token_acc": 0.27908434080450767 + }, + { + "epoch": 1.6133098798006449, + "grad_norm": 0.5152388818495092, + "learning_rate": 0.0002992298290625698, + "loss": 3.2106189727783203, + "step": 2752, + "token_acc": 0.278890324148552 + }, + { + "epoch": 1.613896218117854, + "grad_norm": 0.5149415375917549, + "learning_rate": 0.00029922835702095936, + "loss": 3.225625991821289, + "step": 2753, + "token_acc": 0.2752305933239503 + }, + { + "epoch": 1.614482556435063, + "grad_norm": 0.522854781758891, + "learning_rate": 0.00029922688357754965, + "loss": 3.2174530029296875, + "step": 2754, + "token_acc": 0.2757915401042565 + }, + { + "epoch": 1.6150688947522722, + "grad_norm": 0.5171855737146633, + "learning_rate": 0.0002992254087323543, + "loss": 3.257126808166504, + "step": 2755, + "token_acc": 0.2725901898991652 + }, + { + "epoch": 1.6156552330694811, + "grad_norm": 0.5110765602229769, + "learning_rate": 0.00029922393248538745, + "loss": 3.2495858669281006, + "step": 2756, + "token_acc": 0.27403209522021704 + }, + { + "epoch": 1.61624157138669, + "grad_norm": 0.45370606672750524, + "learning_rate": 0.0002992224548366627, + "loss": 3.2649779319763184, + "step": 2757, + "token_acc": 0.2720630886214788 + }, + { + "epoch": 1.6168279097038991, + "grad_norm": 0.45805663282030973, + "learning_rate": 0.0002992209757861942, + "loss": 3.239689350128174, + "step": 2758, + "token_acc": 0.27263965648735333 + }, + { + "epoch": 1.6174142480211082, + "grad_norm": 0.44054215322263024, + "learning_rate": 0.0002992194953339957, + "loss": 3.2765979766845703, + "step": 2759, + "token_acc": 0.2669946503387101 + }, + { + "epoch": 1.6180005863383173, + "grad_norm": 0.4916839250635167, + "learning_rate": 0.000299218013480081, + "loss": 3.258183479309082, + "step": 2760, + "token_acc": 0.2724705329601863 + }, + { + "epoch": 1.6185869246555262, + "grad_norm": 0.5048261077039959, + "learning_rate": 0.00029921653022446426, + "loss": 3.2572457790374756, + "step": 2761, + "token_acc": 0.27138767349844034 + }, + { + "epoch": 1.6191732629727351, + "grad_norm": 0.49811486906674163, + "learning_rate": 0.00029921504556715923, + "loss": 3.169217109680176, + "step": 2762, + "token_acc": 0.28300437315892607 + }, + { + "epoch": 1.6197596012899442, + "grad_norm": 0.4595125569053117, + "learning_rate": 0.0002992135595081799, + "loss": 3.201615810394287, + "step": 2763, + "token_acc": 0.2788792931707706 + }, + { + "epoch": 1.6203459396071533, + "grad_norm": 0.48850151544162984, + "learning_rate": 0.00029921207204754033, + "loss": 3.2450265884399414, + "step": 2764, + "token_acc": 0.2718133545391581 + }, + { + "epoch": 1.6209322779243625, + "grad_norm": 0.4523349350879401, + "learning_rate": 0.0002992105831852543, + "loss": 3.185438632965088, + "step": 2765, + "token_acc": 0.2796368855363484 + }, + { + "epoch": 1.6215186162415713, + "grad_norm": 0.4299534610222147, + "learning_rate": 0.000299209092921336, + "loss": 3.2155470848083496, + "step": 2766, + "token_acc": 0.27669452181987003 + }, + { + "epoch": 1.6221049545587805, + "grad_norm": 0.47199304565738287, + "learning_rate": 0.0002992076012557993, + "loss": 3.2312304973602295, + "step": 2767, + "token_acc": 0.27646653047146535 + }, + { + "epoch": 1.6226912928759893, + "grad_norm": 0.48169586292190014, + "learning_rate": 0.0002992061081886582, + "loss": 3.2011427879333496, + "step": 2768, + "token_acc": 0.28045878121603496 + }, + { + "epoch": 1.6232776311931985, + "grad_norm": 0.450042128295733, + "learning_rate": 0.00029920461371992684, + "loss": 3.2608869075775146, + "step": 2769, + "token_acc": 0.27108049953050684 + }, + { + "epoch": 1.6238639695104076, + "grad_norm": 0.40807286227511486, + "learning_rate": 0.00029920311784961917, + "loss": 3.227682590484619, + "step": 2770, + "token_acc": 0.2750270653625147 + }, + { + "epoch": 1.6244503078276167, + "grad_norm": 0.4278745158532367, + "learning_rate": 0.0002992016205777492, + "loss": 3.2256436347961426, + "step": 2771, + "token_acc": 0.2752127717243996 + }, + { + "epoch": 1.6250366461448256, + "grad_norm": 0.4527504295828738, + "learning_rate": 0.00029920012190433115, + "loss": 3.231621265411377, + "step": 2772, + "token_acc": 0.2775668531097732 + }, + { + "epoch": 1.6256229844620345, + "grad_norm": 0.47534107910061496, + "learning_rate": 0.000299198621829379, + "loss": 3.2577579021453857, + "step": 2773, + "token_acc": 0.2708343340825258 + }, + { + "epoch": 1.6262093227792436, + "grad_norm": 0.4663353596532772, + "learning_rate": 0.00029919712035290675, + "loss": 3.305983304977417, + "step": 2774, + "token_acc": 0.26527691200509385 + }, + { + "epoch": 1.6267956610964527, + "grad_norm": 0.38004499709417205, + "learning_rate": 0.0002991956174749287, + "loss": 3.2501847743988037, + "step": 2775, + "token_acc": 0.27291044601275066 + }, + { + "epoch": 1.6273819994136618, + "grad_norm": 0.39649814703538755, + "learning_rate": 0.0002991941131954588, + "loss": 3.2686281204223633, + "step": 2776, + "token_acc": 0.2713234548249413 + }, + { + "epoch": 1.6279683377308707, + "grad_norm": 0.4395841431592008, + "learning_rate": 0.00029919260751451124, + "loss": 3.2062840461730957, + "step": 2777, + "token_acc": 0.2773292518622142 + }, + { + "epoch": 1.6285546760480798, + "grad_norm": 0.542379940537786, + "learning_rate": 0.0002991911004321002, + "loss": 3.2093567848205566, + "step": 2778, + "token_acc": 0.27836100292176286 + }, + { + "epoch": 1.6291410143652887, + "grad_norm": 0.6236099249715498, + "learning_rate": 0.0002991895919482398, + "loss": 3.229440450668335, + "step": 2779, + "token_acc": 0.27503906433726505 + }, + { + "epoch": 1.6297273526824978, + "grad_norm": 0.6686920349885959, + "learning_rate": 0.0002991880820629443, + "loss": 3.2797927856445312, + "step": 2780, + "token_acc": 0.269106760687393 + }, + { + "epoch": 1.630313690999707, + "grad_norm": 0.5891680262330786, + "learning_rate": 0.0002991865707762277, + "loss": 3.2230935096740723, + "step": 2781, + "token_acc": 0.2747641177659158 + }, + { + "epoch": 1.630900029316916, + "grad_norm": 0.38169271362579993, + "learning_rate": 0.00029918505808810436, + "loss": 3.233772039413452, + "step": 2782, + "token_acc": 0.27561514416166616 + }, + { + "epoch": 1.631486367634125, + "grad_norm": 0.5040408203568694, + "learning_rate": 0.0002991835439985884, + "loss": 3.1938376426696777, + "step": 2783, + "token_acc": 0.28025278125205716 + }, + { + "epoch": 1.6320727059513338, + "grad_norm": 0.50511812583065, + "learning_rate": 0.0002991820285076941, + "loss": 3.227950096130371, + "step": 2784, + "token_acc": 0.27614113745402313 + }, + { + "epoch": 1.632659044268543, + "grad_norm": 0.3981822428392917, + "learning_rate": 0.00029918051161543564, + "loss": 3.2353432178497314, + "step": 2785, + "token_acc": 0.27576602393031296 + }, + { + "epoch": 1.633245382585752, + "grad_norm": 0.40274345346499313, + "learning_rate": 0.00029917899332182723, + "loss": 3.197986602783203, + "step": 2786, + "token_acc": 0.2774161572773831 + }, + { + "epoch": 1.6338317209029611, + "grad_norm": 0.5216991829369977, + "learning_rate": 0.0002991774736268833, + "loss": 3.2200732231140137, + "step": 2787, + "token_acc": 0.2773095291831721 + }, + { + "epoch": 1.63441805922017, + "grad_norm": 0.5249615058286641, + "learning_rate": 0.000299175952530618, + "loss": 3.2722325325012207, + "step": 2788, + "token_acc": 0.26963473444804786 + }, + { + "epoch": 1.635004397537379, + "grad_norm": 0.44309677495433253, + "learning_rate": 0.0002991744300330456, + "loss": 3.272714138031006, + "step": 2789, + "token_acc": 0.2709898232737305 + }, + { + "epoch": 1.635590735854588, + "grad_norm": 0.44954046283565063, + "learning_rate": 0.0002991729061341805, + "loss": 3.2553958892822266, + "step": 2790, + "token_acc": 0.27128811550479476 + }, + { + "epoch": 1.6361770741717971, + "grad_norm": 0.4612430435001581, + "learning_rate": 0.0002991713808340369, + "loss": 3.2683799266815186, + "step": 2791, + "token_acc": 0.2695307918652867 + }, + { + "epoch": 1.6367634124890063, + "grad_norm": 0.5374043302255161, + "learning_rate": 0.00029916985413262927, + "loss": 3.208031177520752, + "step": 2792, + "token_acc": 0.27587530652376585 + }, + { + "epoch": 1.6373497508062151, + "grad_norm": 0.4996969858384595, + "learning_rate": 0.0002991683260299718, + "loss": 3.3217482566833496, + "step": 2793, + "token_acc": 0.2642883546075864 + }, + { + "epoch": 1.6379360891234243, + "grad_norm": 0.479591947993097, + "learning_rate": 0.00029916679652607894, + "loss": 3.261453628540039, + "step": 2794, + "token_acc": 0.27161909561434405 + }, + { + "epoch": 1.6385224274406331, + "grad_norm": 0.48740664193297306, + "learning_rate": 0.00029916526562096506, + "loss": 3.21614670753479, + "step": 2795, + "token_acc": 0.2767442921271737 + }, + { + "epoch": 1.6391087657578423, + "grad_norm": 0.5434620208433892, + "learning_rate": 0.0002991637333146445, + "loss": 3.2481441497802734, + "step": 2796, + "token_acc": 0.274214273877408 + }, + { + "epoch": 1.6396951040750514, + "grad_norm": 0.5048102115116913, + "learning_rate": 0.0002991621996071316, + "loss": 3.2879605293273926, + "step": 2797, + "token_acc": 0.26718519094656606 + }, + { + "epoch": 1.6402814423922605, + "grad_norm": 0.46198781192539773, + "learning_rate": 0.00029916066449844095, + "loss": 3.2883810997009277, + "step": 2798, + "token_acc": 0.26736141055636076 + }, + { + "epoch": 1.6408677807094694, + "grad_norm": 0.5059493436244304, + "learning_rate": 0.00029915912798858676, + "loss": 3.278900146484375, + "step": 2799, + "token_acc": 0.2674969855380721 + }, + { + "epoch": 1.6414541190266783, + "grad_norm": 0.48827666203430803, + "learning_rate": 0.00029915759007758357, + "loss": 3.2965686321258545, + "step": 2800, + "token_acc": 0.2656863049577109 + }, + { + "epoch": 1.6420404573438874, + "grad_norm": 0.4605077587581276, + "learning_rate": 0.0002991560507654458, + "loss": 3.2295122146606445, + "step": 2801, + "token_acc": 0.27451548025353484 + }, + { + "epoch": 1.6426267956610965, + "grad_norm": 0.47374081971298215, + "learning_rate": 0.000299154510052188, + "loss": 3.173492670059204, + "step": 2802, + "token_acc": 0.28155719855655564 + }, + { + "epoch": 1.6432131339783056, + "grad_norm": 0.42688371855016066, + "learning_rate": 0.0002991529679378245, + "loss": 3.226634979248047, + "step": 2803, + "token_acc": 0.2757227573063262 + }, + { + "epoch": 1.6437994722955145, + "grad_norm": 0.4361767904572582, + "learning_rate": 0.00029915142442236986, + "loss": 3.251396417617798, + "step": 2804, + "token_acc": 0.2725779039874364 + }, + { + "epoch": 1.6443858106127234, + "grad_norm": 0.475816140310604, + "learning_rate": 0.0002991498795058386, + "loss": 3.2423527240753174, + "step": 2805, + "token_acc": 0.273065127004381 + }, + { + "epoch": 1.6449721489299325, + "grad_norm": 0.40756243927030267, + "learning_rate": 0.00029914833318824517, + "loss": 3.249202251434326, + "step": 2806, + "token_acc": 0.27335952089638743 + }, + { + "epoch": 1.6455584872471416, + "grad_norm": 0.46369295312067865, + "learning_rate": 0.00029914678546960415, + "loss": 3.229423999786377, + "step": 2807, + "token_acc": 0.27506253062066477 + }, + { + "epoch": 1.6461448255643507, + "grad_norm": 0.5065792103297269, + "learning_rate": 0.0002991452363499301, + "loss": 3.241318702697754, + "step": 2808, + "token_acc": 0.27285611557821793 + }, + { + "epoch": 1.6467311638815598, + "grad_norm": 0.4592479492270207, + "learning_rate": 0.00029914368582923746, + "loss": 3.2161073684692383, + "step": 2809, + "token_acc": 0.2769729802727943 + }, + { + "epoch": 1.6473175021987687, + "grad_norm": 0.46478065073603425, + "learning_rate": 0.0002991421339075409, + "loss": 3.233269691467285, + "step": 2810, + "token_acc": 0.2751621844481755 + }, + { + "epoch": 1.6479038405159776, + "grad_norm": 0.43303168277559084, + "learning_rate": 0.0002991405805848549, + "loss": 3.1588401794433594, + "step": 2811, + "token_acc": 0.28553875870209655 + }, + { + "epoch": 1.6484901788331867, + "grad_norm": 0.46259567860613987, + "learning_rate": 0.0002991390258611942, + "loss": 3.1970856189727783, + "step": 2812, + "token_acc": 0.28027978035741086 + }, + { + "epoch": 1.6490765171503958, + "grad_norm": 0.5284415102704099, + "learning_rate": 0.0002991374697365733, + "loss": 3.2642972469329834, + "step": 2813, + "token_acc": 0.27382538104261084 + }, + { + "epoch": 1.649662855467605, + "grad_norm": 0.459088700642807, + "learning_rate": 0.00029913591221100683, + "loss": 3.212010145187378, + "step": 2814, + "token_acc": 0.2750689247735329 + }, + { + "epoch": 1.6502491937848138, + "grad_norm": 0.4932365296610275, + "learning_rate": 0.0002991343532845094, + "loss": 3.2581164836883545, + "step": 2815, + "token_acc": 0.27183697772624354 + }, + { + "epoch": 1.6508355321020227, + "grad_norm": 0.4632231285174976, + "learning_rate": 0.0002991327929570957, + "loss": 3.225412368774414, + "step": 2816, + "token_acc": 0.275295327450117 + }, + { + "epoch": 1.6514218704192318, + "grad_norm": 0.4481535710117751, + "learning_rate": 0.0002991312312287804, + "loss": 3.177954912185669, + "step": 2817, + "token_acc": 0.2818033981080582 + }, + { + "epoch": 1.652008208736441, + "grad_norm": 0.3972418048032003, + "learning_rate": 0.0002991296680995781, + "loss": 3.233959436416626, + "step": 2818, + "token_acc": 0.27440802215881877 + }, + { + "epoch": 1.65259454705365, + "grad_norm": 0.4602011784551465, + "learning_rate": 0.0002991281035695035, + "loss": 3.171598434448242, + "step": 2819, + "token_acc": 0.28290302186081656 + }, + { + "epoch": 1.653180885370859, + "grad_norm": 0.5057108721084328, + "learning_rate": 0.0002991265376385714, + "loss": 3.280641555786133, + "step": 2820, + "token_acc": 0.2681088861606198 + }, + { + "epoch": 1.653767223688068, + "grad_norm": 0.4435909377590639, + "learning_rate": 0.0002991249703067964, + "loss": 3.2387359142303467, + "step": 2821, + "token_acc": 0.2715591222014698 + }, + { + "epoch": 1.654353562005277, + "grad_norm": 0.4202352843563113, + "learning_rate": 0.0002991234015741932, + "loss": 3.1910641193389893, + "step": 2822, + "token_acc": 0.28135926746824796 + }, + { + "epoch": 1.654939900322486, + "grad_norm": 0.4206557046682502, + "learning_rate": 0.00029912183144077664, + "loss": 3.204280376434326, + "step": 2823, + "token_acc": 0.2804414409022423 + }, + { + "epoch": 1.6555262386396952, + "grad_norm": 0.4542571943296922, + "learning_rate": 0.0002991202599065614, + "loss": 3.228511095046997, + "step": 2824, + "token_acc": 0.27397860258541423 + }, + { + "epoch": 1.6561125769569043, + "grad_norm": 0.4749885204709126, + "learning_rate": 0.0002991186869715623, + "loss": 3.245894432067871, + "step": 2825, + "token_acc": 0.2718803233989589 + }, + { + "epoch": 1.6566989152741132, + "grad_norm": 0.35464321955210626, + "learning_rate": 0.00029911711263579403, + "loss": 3.2186279296875, + "step": 2826, + "token_acc": 0.2747720522431616 + }, + { + "epoch": 1.657285253591322, + "grad_norm": 0.42248244633721693, + "learning_rate": 0.00029911553689927143, + "loss": 3.230262041091919, + "step": 2827, + "token_acc": 0.27456979577936935 + }, + { + "epoch": 1.6578715919085312, + "grad_norm": 0.4667035752094275, + "learning_rate": 0.0002991139597620093, + "loss": 3.218432903289795, + "step": 2828, + "token_acc": 0.27650287918788025 + }, + { + "epoch": 1.6584579302257403, + "grad_norm": 0.47507935857195155, + "learning_rate": 0.00029911238122402243, + "loss": 3.2209808826446533, + "step": 2829, + "token_acc": 0.27657500185950634 + }, + { + "epoch": 1.6590442685429494, + "grad_norm": 0.47861342126262924, + "learning_rate": 0.0002991108012853257, + "loss": 3.2544357776641846, + "step": 2830, + "token_acc": 0.2702499202838164 + }, + { + "epoch": 1.6596306068601583, + "grad_norm": 0.39978005899052843, + "learning_rate": 0.0002991092199459339, + "loss": 3.1622705459594727, + "step": 2831, + "token_acc": 0.28341896682200185 + }, + { + "epoch": 1.6602169451773672, + "grad_norm": 0.4428136135693279, + "learning_rate": 0.0002991076372058619, + "loss": 3.2114734649658203, + "step": 2832, + "token_acc": 0.2784724793483396 + }, + { + "epoch": 1.6608032834945763, + "grad_norm": 0.44145785388244313, + "learning_rate": 0.0002991060530651246, + "loss": 3.2212629318237305, + "step": 2833, + "token_acc": 0.27632747845331423 + }, + { + "epoch": 1.6613896218117854, + "grad_norm": 0.44206036860571757, + "learning_rate": 0.00029910446752373686, + "loss": 3.214123010635376, + "step": 2834, + "token_acc": 0.2752622766839999 + }, + { + "epoch": 1.6619759601289945, + "grad_norm": 0.4133133760550389, + "learning_rate": 0.00029910288058171354, + "loss": 3.2422633171081543, + "step": 2835, + "token_acc": 0.27304393150784484 + }, + { + "epoch": 1.6625622984462036, + "grad_norm": 0.48891310044777486, + "learning_rate": 0.0002991012922390696, + "loss": 3.2133827209472656, + "step": 2836, + "token_acc": 0.27682369127956497 + }, + { + "epoch": 1.6631486367634125, + "grad_norm": 0.5363772781414919, + "learning_rate": 0.0002990997024958199, + "loss": 3.214707374572754, + "step": 2837, + "token_acc": 0.27630177905702463 + }, + { + "epoch": 1.6637349750806214, + "grad_norm": 0.4364771881159486, + "learning_rate": 0.0002990981113519795, + "loss": 3.2803666591644287, + "step": 2838, + "token_acc": 0.2696617585794232 + }, + { + "epoch": 1.6643213133978305, + "grad_norm": 0.4486979724519655, + "learning_rate": 0.00029909651880756315, + "loss": 3.2419018745422363, + "step": 2839, + "token_acc": 0.27528582101987176 + }, + { + "epoch": 1.6649076517150396, + "grad_norm": 0.5822000068867318, + "learning_rate": 0.00029909492486258595, + "loss": 3.227296829223633, + "step": 2840, + "token_acc": 0.274400288273448 + }, + { + "epoch": 1.6654939900322487, + "grad_norm": 0.5261569567025727, + "learning_rate": 0.00029909332951706284, + "loss": 3.1907236576080322, + "step": 2841, + "token_acc": 0.27935683677230194 + }, + { + "epoch": 1.6660803283494576, + "grad_norm": 0.49724005142374633, + "learning_rate": 0.00029909173277100883, + "loss": 3.233206272125244, + "step": 2842, + "token_acc": 0.2747210110117508 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.5274464042339126, + "learning_rate": 0.0002990901346244389, + "loss": 3.245659112930298, + "step": 2843, + "token_acc": 0.27321410845180066 + }, + { + "epoch": 1.6672530049838756, + "grad_norm": 0.5558296269198623, + "learning_rate": 0.000299088535077368, + "loss": 3.2522130012512207, + "step": 2844, + "token_acc": 0.2724277417222365 + }, + { + "epoch": 1.6678393433010847, + "grad_norm": 0.5166506452069556, + "learning_rate": 0.00029908693412981127, + "loss": 3.2887752056121826, + "step": 2845, + "token_acc": 0.2664081697862963 + }, + { + "epoch": 1.6684256816182939, + "grad_norm": 0.3833862431555096, + "learning_rate": 0.0002990853317817837, + "loss": 3.2319154739379883, + "step": 2846, + "token_acc": 0.27538357016009085 + }, + { + "epoch": 1.6690120199355027, + "grad_norm": 0.497694338400513, + "learning_rate": 0.00029908372803330027, + "loss": 3.257106065750122, + "step": 2847, + "token_acc": 0.2720572313071932 + }, + { + "epoch": 1.6695983582527119, + "grad_norm": 0.504161388079991, + "learning_rate": 0.0002990821228843761, + "loss": 3.2067041397094727, + "step": 2848, + "token_acc": 0.2784462372165262 + }, + { + "epoch": 1.6701846965699207, + "grad_norm": 0.46879493539524586, + "learning_rate": 0.00029908051633502635, + "loss": 3.2096762657165527, + "step": 2849, + "token_acc": 0.27803603455028963 + }, + { + "epoch": 1.6707710348871299, + "grad_norm": 0.42829641333100577, + "learning_rate": 0.000299078908385266, + "loss": 3.200063467025757, + "step": 2850, + "token_acc": 0.27725802874722144 + }, + { + "epoch": 1.671357373204339, + "grad_norm": 0.45899376762916194, + "learning_rate": 0.0002990772990351102, + "loss": 3.238583564758301, + "step": 2851, + "token_acc": 0.273020788974488 + }, + { + "epoch": 1.671943711521548, + "grad_norm": 0.38646537877138676, + "learning_rate": 0.000299075688284574, + "loss": 3.297233819961548, + "step": 2852, + "token_acc": 0.2647120688657864 + }, + { + "epoch": 1.672530049838757, + "grad_norm": 0.3923370194459343, + "learning_rate": 0.0002990740761336727, + "loss": 3.247020959854126, + "step": 2853, + "token_acc": 0.27318545774921615 + }, + { + "epoch": 1.6731163881559659, + "grad_norm": 0.4644511971706538, + "learning_rate": 0.00029907246258242126, + "loss": 3.2347538471221924, + "step": 2854, + "token_acc": 0.27459427590272967 + }, + { + "epoch": 1.673702726473175, + "grad_norm": 0.39475447247516626, + "learning_rate": 0.00029907084763083495, + "loss": 3.208374500274658, + "step": 2855, + "token_acc": 0.278306472843667 + }, + { + "epoch": 1.674289064790384, + "grad_norm": 0.4430346427796089, + "learning_rate": 0.00029906923127892885, + "loss": 3.2527387142181396, + "step": 2856, + "token_acc": 0.27363797881404905 + }, + { + "epoch": 1.6748754031075932, + "grad_norm": 0.4868100026820017, + "learning_rate": 0.00029906761352671823, + "loss": 3.2725353240966797, + "step": 2857, + "token_acc": 0.26958010228243817 + }, + { + "epoch": 1.675461741424802, + "grad_norm": 0.43962473874674607, + "learning_rate": 0.0002990659943742182, + "loss": 3.2349705696105957, + "step": 2858, + "token_acc": 0.2738408823776341 + }, + { + "epoch": 1.676048079742011, + "grad_norm": 0.4669380202583218, + "learning_rate": 0.0002990643738214441, + "loss": 3.220262289047241, + "step": 2859, + "token_acc": 0.27696553781888705 + }, + { + "epoch": 1.67663441805922, + "grad_norm": 0.5290029478657404, + "learning_rate": 0.00029906275186841107, + "loss": 3.2136669158935547, + "step": 2860, + "token_acc": 0.2748721816010062 + }, + { + "epoch": 1.6772207563764292, + "grad_norm": 0.5053608750044666, + "learning_rate": 0.00029906112851513434, + "loss": 3.2090916633605957, + "step": 2861, + "token_acc": 0.2775112673444542 + }, + { + "epoch": 1.6778070946936383, + "grad_norm": 0.4995945617978085, + "learning_rate": 0.00029905950376162916, + "loss": 3.249103307723999, + "step": 2862, + "token_acc": 0.2719103179585188 + }, + { + "epoch": 1.6783934330108474, + "grad_norm": 0.4899050876239883, + "learning_rate": 0.00029905787760791075, + "loss": 3.2164385318756104, + "step": 2863, + "token_acc": 0.27659480793933616 + }, + { + "epoch": 1.6789797713280563, + "grad_norm": 0.4720769696939573, + "learning_rate": 0.0002990562500539945, + "loss": 3.2586801052093506, + "step": 2864, + "token_acc": 0.2702825130584553 + }, + { + "epoch": 1.6795661096452652, + "grad_norm": 0.46991483081344027, + "learning_rate": 0.0002990546210998956, + "loss": 3.2369980812072754, + "step": 2865, + "token_acc": 0.2757360598814078 + }, + { + "epoch": 1.6801524479624743, + "grad_norm": 0.47195767603199723, + "learning_rate": 0.0002990529907456294, + "loss": 3.2156364917755127, + "step": 2866, + "token_acc": 0.276450387257051 + }, + { + "epoch": 1.6807387862796834, + "grad_norm": 0.4275220616730097, + "learning_rate": 0.00029905135899121126, + "loss": 3.2788164615631104, + "step": 2867, + "token_acc": 0.26949163071986343 + }, + { + "epoch": 1.6813251245968925, + "grad_norm": 0.44233258282252413, + "learning_rate": 0.00029904972583665637, + "loss": 3.2445950508117676, + "step": 2868, + "token_acc": 0.27418893053883964 + }, + { + "epoch": 1.6819114629141014, + "grad_norm": 0.447608962005346, + "learning_rate": 0.00029904809128198024, + "loss": 3.188769817352295, + "step": 2869, + "token_acc": 0.2783678439657999 + }, + { + "epoch": 1.6824978012313103, + "grad_norm": 0.47553117662997496, + "learning_rate": 0.00029904645532719806, + "loss": 3.2812047004699707, + "step": 2870, + "token_acc": 0.2672803478397117 + }, + { + "epoch": 1.6830841395485194, + "grad_norm": 0.5071577131050147, + "learning_rate": 0.00029904481797232534, + "loss": 3.223794460296631, + "step": 2871, + "token_acc": 0.2739874377301278 + }, + { + "epoch": 1.6836704778657285, + "grad_norm": 0.4543249449654351, + "learning_rate": 0.0002990431792173773, + "loss": 3.2305033206939697, + "step": 2872, + "token_acc": 0.2742884819896181 + }, + { + "epoch": 1.6842568161829377, + "grad_norm": 0.40414519758355155, + "learning_rate": 0.0002990415390623695, + "loss": 3.212996482849121, + "step": 2873, + "token_acc": 0.27560592413666046 + }, + { + "epoch": 1.6848431545001465, + "grad_norm": 0.4417170995846462, + "learning_rate": 0.0002990398975073173, + "loss": 3.2344865798950195, + "step": 2874, + "token_acc": 0.2741679172507783 + }, + { + "epoch": 1.6854294928173557, + "grad_norm": 0.4642149503113735, + "learning_rate": 0.000299038254552236, + "loss": 3.227048873901367, + "step": 2875, + "token_acc": 0.2755617465566173 + }, + { + "epoch": 1.6860158311345645, + "grad_norm": 0.5137941631016929, + "learning_rate": 0.0002990366101971412, + "loss": 3.248990297317505, + "step": 2876, + "token_acc": 0.2727422981166032 + }, + { + "epoch": 1.6866021694517737, + "grad_norm": 0.4132054370104702, + "learning_rate": 0.0002990349644420483, + "loss": 3.285066604614258, + "step": 2877, + "token_acc": 0.2674370614940157 + }, + { + "epoch": 1.6871885077689828, + "grad_norm": 0.5001672302139621, + "learning_rate": 0.0002990333172869727, + "loss": 3.2116456031799316, + "step": 2878, + "token_acc": 0.27631278244266366 + }, + { + "epoch": 1.6877748460861919, + "grad_norm": 0.5272204765261164, + "learning_rate": 0.0002990316687319299, + "loss": 3.1955928802490234, + "step": 2879, + "token_acc": 0.2793326700772171 + }, + { + "epoch": 1.6883611844034008, + "grad_norm": 0.4621699302757071, + "learning_rate": 0.0002990300187769354, + "loss": 3.2302982807159424, + "step": 2880, + "token_acc": 0.27474503596545713 + }, + { + "epoch": 1.6889475227206097, + "grad_norm": 0.4440285107378725, + "learning_rate": 0.00029902836742200467, + "loss": 3.2027597427368164, + "step": 2881, + "token_acc": 0.28094687522222017 + }, + { + "epoch": 1.6895338610378188, + "grad_norm": 0.4905970091795241, + "learning_rate": 0.0002990267146671533, + "loss": 3.2533726692199707, + "step": 2882, + "token_acc": 0.27134760606425345 + }, + { + "epoch": 1.6901201993550279, + "grad_norm": 0.43316419970321957, + "learning_rate": 0.00029902506051239676, + "loss": 3.2209091186523438, + "step": 2883, + "token_acc": 0.2758372362648307 + }, + { + "epoch": 1.690706537672237, + "grad_norm": 0.4193898788072306, + "learning_rate": 0.0002990234049577506, + "loss": 3.2143421173095703, + "step": 2884, + "token_acc": 0.2753898372557133 + }, + { + "epoch": 1.6912928759894459, + "grad_norm": 0.44630613298124766, + "learning_rate": 0.00029902174800323033, + "loss": 3.251372814178467, + "step": 2885, + "token_acc": 0.2712665947927897 + }, + { + "epoch": 1.6918792143066548, + "grad_norm": 0.46453857681112154, + "learning_rate": 0.0002990200896488515, + "loss": 3.2222745418548584, + "step": 2886, + "token_acc": 0.27526902306139867 + }, + { + "epoch": 1.692465552623864, + "grad_norm": 0.45375993751477894, + "learning_rate": 0.0002990184298946298, + "loss": 3.201394557952881, + "step": 2887, + "token_acc": 0.27790923924182376 + }, + { + "epoch": 1.693051890941073, + "grad_norm": 0.4553226339871933, + "learning_rate": 0.0002990167687405807, + "loss": 3.18430757522583, + "step": 2888, + "token_acc": 0.28163958169486775 + }, + { + "epoch": 1.6936382292582821, + "grad_norm": 0.4548508088697926, + "learning_rate": 0.0002990151061867199, + "loss": 3.154754638671875, + "step": 2889, + "token_acc": 0.28382355272125914 + }, + { + "epoch": 1.6942245675754912, + "grad_norm": 0.41502553854796614, + "learning_rate": 0.000299013442233063, + "loss": 3.2125260829925537, + "step": 2890, + "token_acc": 0.27621796398688403 + }, + { + "epoch": 1.6948109058927001, + "grad_norm": 0.4230302723983323, + "learning_rate": 0.0002990117768796256, + "loss": 3.2401626110076904, + "step": 2891, + "token_acc": 0.2733638656577725 + }, + { + "epoch": 1.695397244209909, + "grad_norm": 0.3779837201386293, + "learning_rate": 0.00029901011012642333, + "loss": 3.2548422813415527, + "step": 2892, + "token_acc": 0.2714419669189004 + }, + { + "epoch": 1.6959835825271181, + "grad_norm": 0.49682878472345093, + "learning_rate": 0.0002990084419734719, + "loss": 3.244354248046875, + "step": 2893, + "token_acc": 0.2716279167864007 + }, + { + "epoch": 1.6965699208443272, + "grad_norm": 0.4271309472781941, + "learning_rate": 0.0002990067724207869, + "loss": 3.2306809425354004, + "step": 2894, + "token_acc": 0.27270328578745595 + }, + { + "epoch": 1.6971562591615363, + "grad_norm": 0.4582096075377558, + "learning_rate": 0.00029900510146838407, + "loss": 3.207265853881836, + "step": 2895, + "token_acc": 0.276791251498739 + }, + { + "epoch": 1.6977425974787452, + "grad_norm": 0.4281921138993705, + "learning_rate": 0.00029900342911627913, + "loss": 3.2117080688476562, + "step": 2896, + "token_acc": 0.27532571193161337 + }, + { + "epoch": 1.6983289357959541, + "grad_norm": 0.3957950439960161, + "learning_rate": 0.0002990017553644877, + "loss": 3.229311943054199, + "step": 2897, + "token_acc": 0.2743480964973934 + }, + { + "epoch": 1.6989152741131632, + "grad_norm": 0.3932749004914916, + "learning_rate": 0.0002990000802130256, + "loss": 3.1969499588012695, + "step": 2898, + "token_acc": 0.2780155003205345 + }, + { + "epoch": 1.6995016124303723, + "grad_norm": 0.37133416075991915, + "learning_rate": 0.00029899840366190856, + "loss": 3.246971368789673, + "step": 2899, + "token_acc": 0.2717865912245116 + }, + { + "epoch": 1.7000879507475815, + "grad_norm": 0.4429710927873473, + "learning_rate": 0.0002989967257111523, + "loss": 3.248538017272949, + "step": 2900, + "token_acc": 0.2719268858126851 + }, + { + "epoch": 1.7006742890647903, + "grad_norm": 0.43667158614321094, + "learning_rate": 0.0002989950463607725, + "loss": 3.2323217391967773, + "step": 2901, + "token_acc": 0.2745708641483001 + }, + { + "epoch": 1.7012606273819995, + "grad_norm": 0.500344499868889, + "learning_rate": 0.0002989933656107851, + "loss": 3.2650294303894043, + "step": 2902, + "token_acc": 0.27021428869943165 + }, + { + "epoch": 1.7018469656992083, + "grad_norm": 0.46211898363266674, + "learning_rate": 0.00029899168346120573, + "loss": 3.228623390197754, + "step": 2903, + "token_acc": 0.27370896727940786 + }, + { + "epoch": 1.7024333040164175, + "grad_norm": 0.4027540192380709, + "learning_rate": 0.0002989899999120503, + "loss": 3.202935218811035, + "step": 2904, + "token_acc": 0.2783711668134439 + }, + { + "epoch": 1.7030196423336266, + "grad_norm": 0.4546474344483752, + "learning_rate": 0.0002989883149633346, + "loss": 3.2350871562957764, + "step": 2905, + "token_acc": 0.2718994041615107 + }, + { + "epoch": 1.7036059806508357, + "grad_norm": 0.4412110771246422, + "learning_rate": 0.0002989866286150744, + "loss": 3.204291582107544, + "step": 2906, + "token_acc": 0.2771382781014429 + }, + { + "epoch": 1.7041923189680446, + "grad_norm": 0.4109946291592292, + "learning_rate": 0.0002989849408672856, + "loss": 3.199084758758545, + "step": 2907, + "token_acc": 0.2799282061702056 + }, + { + "epoch": 1.7047786572852535, + "grad_norm": 0.44875402976443873, + "learning_rate": 0.00029898325171998406, + "loss": 3.1845109462738037, + "step": 2908, + "token_acc": 0.2806975317132005 + }, + { + "epoch": 1.7053649956024626, + "grad_norm": 0.49086532630772184, + "learning_rate": 0.0002989815611731856, + "loss": 3.2574119567871094, + "step": 2909, + "token_acc": 0.26958782313082136 + }, + { + "epoch": 1.7059513339196717, + "grad_norm": 0.5105773884194543, + "learning_rate": 0.00029897986922690616, + "loss": 3.2497825622558594, + "step": 2910, + "token_acc": 0.27092517036567093 + }, + { + "epoch": 1.7065376722368808, + "grad_norm": 0.47524729864649207, + "learning_rate": 0.00029897817588116156, + "loss": 3.282766342163086, + "step": 2911, + "token_acc": 0.26748043497752105 + }, + { + "epoch": 1.7071240105540897, + "grad_norm": 0.5017116256579833, + "learning_rate": 0.00029897648113596777, + "loss": 3.2044057846069336, + "step": 2912, + "token_acc": 0.27696646556821214 + }, + { + "epoch": 1.7077103488712986, + "grad_norm": 0.5954270924794584, + "learning_rate": 0.00029897478499134073, + "loss": 3.224130153656006, + "step": 2913, + "token_acc": 0.27602873692921087 + }, + { + "epoch": 1.7082966871885077, + "grad_norm": 0.45629157144050586, + "learning_rate": 0.00029897308744729627, + "loss": 3.2441396713256836, + "step": 2914, + "token_acc": 0.27346893747570367 + }, + { + "epoch": 1.7088830255057168, + "grad_norm": 0.42205608635520075, + "learning_rate": 0.00029897138850385044, + "loss": 3.206831932067871, + "step": 2915, + "token_acc": 0.2775562948502928 + }, + { + "epoch": 1.709469363822926, + "grad_norm": 0.4064019208811385, + "learning_rate": 0.0002989696881610191, + "loss": 3.198359489440918, + "step": 2916, + "token_acc": 0.27816397437176044 + }, + { + "epoch": 1.7100557021401348, + "grad_norm": 0.40080909909490353, + "learning_rate": 0.00029896798641881834, + "loss": 3.207329750061035, + "step": 2917, + "token_acc": 0.2773968619890265 + }, + { + "epoch": 1.710642040457344, + "grad_norm": 0.43281358375828005, + "learning_rate": 0.00029896628327726407, + "loss": 3.2210593223571777, + "step": 2918, + "token_acc": 0.27596926979196346 + }, + { + "epoch": 1.7112283787745528, + "grad_norm": 0.3944509896842094, + "learning_rate": 0.0002989645787363723, + "loss": 3.258690357208252, + "step": 2919, + "token_acc": 0.27137178542420837 + }, + { + "epoch": 1.711814717091762, + "grad_norm": 0.43436806475673606, + "learning_rate": 0.000298962872796159, + "loss": 3.1930580139160156, + "step": 2920, + "token_acc": 0.27941056078591897 + }, + { + "epoch": 1.712401055408971, + "grad_norm": 0.45859538596934785, + "learning_rate": 0.0002989611654566403, + "loss": 3.235530376434326, + "step": 2921, + "token_acc": 0.27269450002873546 + }, + { + "epoch": 1.7129873937261801, + "grad_norm": 0.440224515759211, + "learning_rate": 0.0002989594567178322, + "loss": 3.2450520992279053, + "step": 2922, + "token_acc": 0.2715480846434769 + }, + { + "epoch": 1.713573732043389, + "grad_norm": 0.36498507951938525, + "learning_rate": 0.00029895774657975063, + "loss": 3.2106118202209473, + "step": 2923, + "token_acc": 0.27674604946866505 + }, + { + "epoch": 1.714160070360598, + "grad_norm": 0.4002467923174126, + "learning_rate": 0.00029895603504241186, + "loss": 3.2360215187072754, + "step": 2924, + "token_acc": 0.2749669419696083 + }, + { + "epoch": 1.714746408677807, + "grad_norm": 0.38645027886485367, + "learning_rate": 0.0002989543221058318, + "loss": 3.2303028106689453, + "step": 2925, + "token_acc": 0.27771741846889625 + }, + { + "epoch": 1.7153327469950161, + "grad_norm": 0.3485442911327244, + "learning_rate": 0.0002989526077700266, + "loss": 3.184638023376465, + "step": 2926, + "token_acc": 0.28187811168627697 + }, + { + "epoch": 1.7159190853122253, + "grad_norm": 0.3488914602193716, + "learning_rate": 0.0002989508920350124, + "loss": 3.186772584915161, + "step": 2927, + "token_acc": 0.27990037041767785 + }, + { + "epoch": 1.7165054236294341, + "grad_norm": 0.46298604829372964, + "learning_rate": 0.0002989491749008053, + "loss": 3.2689456939697266, + "step": 2928, + "token_acc": 0.26986889918436674 + }, + { + "epoch": 1.7170917619466433, + "grad_norm": 0.4730672590206892, + "learning_rate": 0.0002989474563674213, + "loss": 3.2138149738311768, + "step": 2929, + "token_acc": 0.2749291875472617 + }, + { + "epoch": 1.7176781002638521, + "grad_norm": 0.44781503470620443, + "learning_rate": 0.00029894573643487674, + "loss": 3.209279775619507, + "step": 2930, + "token_acc": 0.2783731963956184 + }, + { + "epoch": 1.7182644385810613, + "grad_norm": 0.4301862174608705, + "learning_rate": 0.0002989440151031877, + "loss": 3.203165054321289, + "step": 2931, + "token_acc": 0.2791456965884623 + }, + { + "epoch": 1.7188507768982704, + "grad_norm": 0.42165815586939914, + "learning_rate": 0.00029894229237237036, + "loss": 3.2482869625091553, + "step": 2932, + "token_acc": 0.2714959799792218 + }, + { + "epoch": 1.7194371152154795, + "grad_norm": 0.424637432731858, + "learning_rate": 0.0002989405682424408, + "loss": 3.2542104721069336, + "step": 2933, + "token_acc": 0.2710513070390559 + }, + { + "epoch": 1.7200234535326884, + "grad_norm": 0.45401365198124943, + "learning_rate": 0.0002989388427134154, + "loss": 3.2733314037323, + "step": 2934, + "token_acc": 0.26972491183071495 + }, + { + "epoch": 1.7206097918498973, + "grad_norm": 0.4222743477856575, + "learning_rate": 0.0002989371157853102, + "loss": 3.246250629425049, + "step": 2935, + "token_acc": 0.27324763697284327 + }, + { + "epoch": 1.7211961301671064, + "grad_norm": 0.41210246551814894, + "learning_rate": 0.00029893538745814154, + "loss": 3.1802730560302734, + "step": 2936, + "token_acc": 0.27988948775800115 + }, + { + "epoch": 1.7217824684843155, + "grad_norm": 0.4265238932587353, + "learning_rate": 0.00029893365773192554, + "loss": 3.176746129989624, + "step": 2937, + "token_acc": 0.2818218227921725 + }, + { + "epoch": 1.7223688068015246, + "grad_norm": 0.43244487621907757, + "learning_rate": 0.0002989319266066786, + "loss": 3.2159221172332764, + "step": 2938, + "token_acc": 0.27829120328973084 + }, + { + "epoch": 1.7229551451187335, + "grad_norm": 0.37385990892769094, + "learning_rate": 0.00029893019408241684, + "loss": 3.2271828651428223, + "step": 2939, + "token_acc": 0.27446952549274956 + }, + { + "epoch": 1.7235414834359424, + "grad_norm": 0.46642081374639366, + "learning_rate": 0.00029892846015915666, + "loss": 3.2303805351257324, + "step": 2940, + "token_acc": 0.2743567876371707 + }, + { + "epoch": 1.7241278217531515, + "grad_norm": 0.43244335862197014, + "learning_rate": 0.0002989267248369142, + "loss": 3.235325813293457, + "step": 2941, + "token_acc": 0.2727040623591539 + }, + { + "epoch": 1.7247141600703606, + "grad_norm": 0.4588509065278779, + "learning_rate": 0.0002989249881157059, + "loss": 3.1970572471618652, + "step": 2942, + "token_acc": 0.27793381819534607 + }, + { + "epoch": 1.7253004983875697, + "grad_norm": 0.5045604752131334, + "learning_rate": 0.00029892324999554796, + "loss": 3.275254249572754, + "step": 2943, + "token_acc": 0.2693328440632502 + }, + { + "epoch": 1.7258868367047786, + "grad_norm": 0.456024552990943, + "learning_rate": 0.0002989215104764568, + "loss": 3.2023587226867676, + "step": 2944, + "token_acc": 0.2777041561788561 + }, + { + "epoch": 1.7264731750219877, + "grad_norm": 0.4071224382283433, + "learning_rate": 0.00029891976955844873, + "loss": 3.1918816566467285, + "step": 2945, + "token_acc": 0.28053982312513964 + }, + { + "epoch": 1.7270595133391966, + "grad_norm": 0.43218059990150653, + "learning_rate": 0.0002989180272415401, + "loss": 3.1881520748138428, + "step": 2946, + "token_acc": 0.2790993683245183 + }, + { + "epoch": 1.7276458516564057, + "grad_norm": 0.47266101181725195, + "learning_rate": 0.0002989162835257472, + "loss": 3.2644615173339844, + "step": 2947, + "token_acc": 0.2716276140746766 + }, + { + "epoch": 1.7282321899736148, + "grad_norm": 0.5337284360939488, + "learning_rate": 0.00029891453841108655, + "loss": 3.255692958831787, + "step": 2948, + "token_acc": 0.2716893788305622 + }, + { + "epoch": 1.728818528290824, + "grad_norm": 0.5251778305953624, + "learning_rate": 0.0002989127918975745, + "loss": 3.244690418243408, + "step": 2949, + "token_acc": 0.27068144583248005 + }, + { + "epoch": 1.7294048666080328, + "grad_norm": 0.5010245638083478, + "learning_rate": 0.0002989110439852274, + "loss": 3.2158520221710205, + "step": 2950, + "token_acc": 0.2768921130738729 + }, + { + "epoch": 1.7299912049252417, + "grad_norm": 0.5927692292672526, + "learning_rate": 0.0002989092946740617, + "loss": 3.231207847595215, + "step": 2951, + "token_acc": 0.2736133851821531 + }, + { + "epoch": 1.7305775432424508, + "grad_norm": 0.4749809044495472, + "learning_rate": 0.0002989075439640938, + "loss": 3.2253758907318115, + "step": 2952, + "token_acc": 0.27382944807080734 + }, + { + "epoch": 1.73116388155966, + "grad_norm": 0.43492546237654195, + "learning_rate": 0.0002989057918553402, + "loss": 3.2406039237976074, + "step": 2953, + "token_acc": 0.27328072508950585 + }, + { + "epoch": 1.731750219876869, + "grad_norm": 0.4900718881293884, + "learning_rate": 0.0002989040383478174, + "loss": 3.1991477012634277, + "step": 2954, + "token_acc": 0.27923704050412657 + }, + { + "epoch": 1.732336558194078, + "grad_norm": 0.479484525788494, + "learning_rate": 0.00029890228344154175, + "loss": 3.2283072471618652, + "step": 2955, + "token_acc": 0.2752824818593782 + }, + { + "epoch": 1.732922896511287, + "grad_norm": 0.4375042735530878, + "learning_rate": 0.0002989005271365298, + "loss": 3.2165286540985107, + "step": 2956, + "token_acc": 0.2775749756112912 + }, + { + "epoch": 1.733509234828496, + "grad_norm": 0.4827776263469403, + "learning_rate": 0.0002988987694327981, + "loss": 3.210174083709717, + "step": 2957, + "token_acc": 0.27810936002555636 + }, + { + "epoch": 1.734095573145705, + "grad_norm": 0.4358646178546923, + "learning_rate": 0.00029889701033036304, + "loss": 3.191426992416382, + "step": 2958, + "token_acc": 0.27815224768965696 + }, + { + "epoch": 1.7346819114629142, + "grad_norm": 0.4602293276345902, + "learning_rate": 0.0002988952498292412, + "loss": 3.2631454467773438, + "step": 2959, + "token_acc": 0.27076818886472703 + }, + { + "epoch": 1.7352682497801233, + "grad_norm": 0.4666881351243424, + "learning_rate": 0.0002988934879294492, + "loss": 3.255311965942383, + "step": 2960, + "token_acc": 0.270743254410424 + }, + { + "epoch": 1.7358545880973322, + "grad_norm": 0.436412379374655, + "learning_rate": 0.00029889172463100344, + "loss": 3.2340545654296875, + "step": 2961, + "token_acc": 0.2738752672045908 + }, + { + "epoch": 1.736440926414541, + "grad_norm": 0.456885558621909, + "learning_rate": 0.0002988899599339206, + "loss": 3.228849411010742, + "step": 2962, + "token_acc": 0.27373197083399864 + }, + { + "epoch": 1.7370272647317502, + "grad_norm": 0.476080936728531, + "learning_rate": 0.0002988881938382172, + "loss": 3.198176860809326, + "step": 2963, + "token_acc": 0.2788898440378239 + }, + { + "epoch": 1.7376136030489593, + "grad_norm": 0.4288188813084186, + "learning_rate": 0.0002988864263439099, + "loss": 3.2011232376098633, + "step": 2964, + "token_acc": 0.27934262479407995 + }, + { + "epoch": 1.7381999413661684, + "grad_norm": 0.45945776994352666, + "learning_rate": 0.0002988846574510152, + "loss": 3.294987916946411, + "step": 2965, + "token_acc": 0.26492802736638504 + }, + { + "epoch": 1.7387862796833773, + "grad_norm": 0.48926988423464574, + "learning_rate": 0.00029888288715954975, + "loss": 3.279208183288574, + "step": 2966, + "token_acc": 0.2673917392648637 + }, + { + "epoch": 1.7393726180005862, + "grad_norm": 0.5209673038047091, + "learning_rate": 0.00029888111546953023, + "loss": 3.286663055419922, + "step": 2967, + "token_acc": 0.2667663194636676 + }, + { + "epoch": 1.7399589563177953, + "grad_norm": 0.4585282185867643, + "learning_rate": 0.0002988793423809733, + "loss": 3.192500114440918, + "step": 2968, + "token_acc": 0.28024420872632055 + }, + { + "epoch": 1.7405452946350044, + "grad_norm": 0.5561036413315082, + "learning_rate": 0.0002988775678938955, + "loss": 3.2186150550842285, + "step": 2969, + "token_acc": 0.27541355955394403 + }, + { + "epoch": 1.7411316329522135, + "grad_norm": 0.48782343135866413, + "learning_rate": 0.0002988757920083136, + "loss": 3.230574131011963, + "step": 2970, + "token_acc": 0.2729369981148386 + }, + { + "epoch": 1.7417179712694224, + "grad_norm": 0.4347209747158836, + "learning_rate": 0.0002988740147242442, + "loss": 3.2144205570220947, + "step": 2971, + "token_acc": 0.27357395619474045 + }, + { + "epoch": 1.7423043095866315, + "grad_norm": 0.5138337717978827, + "learning_rate": 0.000298872236041704, + "loss": 3.2159461975097656, + "step": 2972, + "token_acc": 0.2749179423889925 + }, + { + "epoch": 1.7428906479038404, + "grad_norm": 0.45446442986291996, + "learning_rate": 0.00029887045596070985, + "loss": 3.2635390758514404, + "step": 2973, + "token_acc": 0.2695824953224163 + }, + { + "epoch": 1.7434769862210495, + "grad_norm": 0.44277223115423175, + "learning_rate": 0.0002988686744812783, + "loss": 3.1536710262298584, + "step": 2974, + "token_acc": 0.28362740158625455 + }, + { + "epoch": 1.7440633245382586, + "grad_norm": 0.4477785635117505, + "learning_rate": 0.00029886689160342624, + "loss": 3.149986505508423, + "step": 2975, + "token_acc": 0.28682943764216984 + }, + { + "epoch": 1.7446496628554677, + "grad_norm": 0.45705595430088247, + "learning_rate": 0.0002988651073271703, + "loss": 3.223367929458618, + "step": 2976, + "token_acc": 0.276728867452036 + }, + { + "epoch": 1.7452360011726766, + "grad_norm": 0.41713350095968776, + "learning_rate": 0.0002988633216525273, + "loss": 3.211010694503784, + "step": 2977, + "token_acc": 0.27826329750104706 + }, + { + "epoch": 1.7458223394898855, + "grad_norm": 0.39944145899523875, + "learning_rate": 0.0002988615345795139, + "loss": 3.209463119506836, + "step": 2978, + "token_acc": 0.2767651924025843 + }, + { + "epoch": 1.7464086778070946, + "grad_norm": 0.45146553948988505, + "learning_rate": 0.000298859746108147, + "loss": 3.2298130989074707, + "step": 2979, + "token_acc": 0.2757337238391604 + }, + { + "epoch": 1.7469950161243037, + "grad_norm": 0.36290085317086257, + "learning_rate": 0.00029885795623844344, + "loss": 3.2437000274658203, + "step": 2980, + "token_acc": 0.2720106261705073 + }, + { + "epoch": 1.7475813544415129, + "grad_norm": 0.3860731857418535, + "learning_rate": 0.00029885616497041993, + "loss": 3.235504388809204, + "step": 2981, + "token_acc": 0.27485309832549076 + }, + { + "epoch": 1.7481676927587217, + "grad_norm": 0.4111734952727702, + "learning_rate": 0.00029885437230409335, + "loss": 3.174206495285034, + "step": 2982, + "token_acc": 0.27966847700187897 + }, + { + "epoch": 1.7487540310759309, + "grad_norm": 0.38123763080776796, + "learning_rate": 0.0002988525782394805, + "loss": 3.1991724967956543, + "step": 2983, + "token_acc": 0.2780540644376381 + }, + { + "epoch": 1.7493403693931397, + "grad_norm": 0.357392931881769, + "learning_rate": 0.0002988507827765983, + "loss": 3.228595018386841, + "step": 2984, + "token_acc": 0.2724567164099995 + }, + { + "epoch": 1.7499267077103489, + "grad_norm": 0.41246354276358044, + "learning_rate": 0.0002988489859154635, + "loss": 3.197805404663086, + "step": 2985, + "token_acc": 0.279079239565822 + }, + { + "epoch": 1.750513046027558, + "grad_norm": 0.39625737378171655, + "learning_rate": 0.0002988471876560931, + "loss": 3.2019221782684326, + "step": 2986, + "token_acc": 0.2785406239815266 + }, + { + "epoch": 1.751099384344767, + "grad_norm": 0.393408487160217, + "learning_rate": 0.000298845387998504, + "loss": 3.1612119674682617, + "step": 2987, + "token_acc": 0.2818130645441298 + }, + { + "epoch": 1.751685722661976, + "grad_norm": 0.47261885390619, + "learning_rate": 0.000298843586942713, + "loss": 3.2373523712158203, + "step": 2988, + "token_acc": 0.27260510552464284 + }, + { + "epoch": 1.7522720609791849, + "grad_norm": 0.4836075771794946, + "learning_rate": 0.0002988417844887371, + "loss": 3.203369140625, + "step": 2989, + "token_acc": 0.27700130344444873 + }, + { + "epoch": 1.752858399296394, + "grad_norm": 0.40921260232614215, + "learning_rate": 0.0002988399806365931, + "loss": 3.1472361087799072, + "step": 2990, + "token_acc": 0.28747061256761075 + }, + { + "epoch": 1.753444737613603, + "grad_norm": 0.374756966045756, + "learning_rate": 0.00029883817538629815, + "loss": 3.188462495803833, + "step": 2991, + "token_acc": 0.2792496741037529 + }, + { + "epoch": 1.7540310759308122, + "grad_norm": 0.4606709485931363, + "learning_rate": 0.00029883636873786904, + "loss": 3.1872105598449707, + "step": 2992, + "token_acc": 0.2783044378113592 + }, + { + "epoch": 1.754617414248021, + "grad_norm": 0.4391831791058478, + "learning_rate": 0.00029883456069132284, + "loss": 3.2252631187438965, + "step": 2993, + "token_acc": 0.27628494904798434 + }, + { + "epoch": 1.75520375256523, + "grad_norm": 0.4033532181645478, + "learning_rate": 0.00029883275124667654, + "loss": 3.237276077270508, + "step": 2994, + "token_acc": 0.2730648604485676 + }, + { + "epoch": 1.755790090882439, + "grad_norm": 0.3774001762006789, + "learning_rate": 0.000298830940403947, + "loss": 3.160149574279785, + "step": 2995, + "token_acc": 0.28180051949140117 + }, + { + "epoch": 1.7563764291996482, + "grad_norm": 0.36910819264381955, + "learning_rate": 0.00029882912816315145, + "loss": 3.2079246044158936, + "step": 2996, + "token_acc": 0.27753857653359 + }, + { + "epoch": 1.7569627675168573, + "grad_norm": 0.3962695436774894, + "learning_rate": 0.0002988273145243067, + "loss": 3.204803466796875, + "step": 2997, + "token_acc": 0.27699695509079575 + }, + { + "epoch": 1.7575491058340662, + "grad_norm": 0.4853136571049276, + "learning_rate": 0.0002988254994874299, + "loss": 3.2736048698425293, + "step": 2998, + "token_acc": 0.26733801717408273 + }, + { + "epoch": 1.7581354441512753, + "grad_norm": 0.48233065037883466, + "learning_rate": 0.00029882368305253807, + "loss": 3.1931209564208984, + "step": 2999, + "token_acc": 0.27754757190955975 + }, + { + "epoch": 1.7587217824684842, + "grad_norm": 0.5568223567602844, + "learning_rate": 0.0002988218652196483, + "loss": 3.2376389503479004, + "step": 3000, + "token_acc": 0.2734502353783574 + }, + { + "epoch": 1.7593081207856933, + "grad_norm": 0.568473019553501, + "learning_rate": 0.0002988200459887776, + "loss": 3.232337474822998, + "step": 3001, + "token_acc": 0.27418963036520355 + }, + { + "epoch": 1.7598944591029024, + "grad_norm": 0.48928417508193744, + "learning_rate": 0.0002988182253599432, + "loss": 3.216559410095215, + "step": 3002, + "token_acc": 0.2777156523879335 + }, + { + "epoch": 1.7604807974201115, + "grad_norm": 0.44999607097181255, + "learning_rate": 0.000298816403333162, + "loss": 3.209160804748535, + "step": 3003, + "token_acc": 0.27667326341302306 + }, + { + "epoch": 1.7610671357373204, + "grad_norm": 0.4076155523978033, + "learning_rate": 0.00029881457990845123, + "loss": 3.2078170776367188, + "step": 3004, + "token_acc": 0.27667922201882494 + }, + { + "epoch": 1.7616534740545293, + "grad_norm": 0.4504196985302263, + "learning_rate": 0.0002988127550858281, + "loss": 3.2098276615142822, + "step": 3005, + "token_acc": 0.27558754118031203 + }, + { + "epoch": 1.7622398123717384, + "grad_norm": 0.4019528533199475, + "learning_rate": 0.0002988109288653096, + "loss": 3.184138774871826, + "step": 3006, + "token_acc": 0.2793607561353662 + }, + { + "epoch": 1.7628261506889475, + "grad_norm": 0.33582318540561285, + "learning_rate": 0.00029880910124691296, + "loss": 3.2205710411071777, + "step": 3007, + "token_acc": 0.2741717958840845 + }, + { + "epoch": 1.7634124890061567, + "grad_norm": 0.41831368159712556, + "learning_rate": 0.0002988072722306554, + "loss": 3.1818723678588867, + "step": 3008, + "token_acc": 0.28001851379461784 + }, + { + "epoch": 1.7639988273233655, + "grad_norm": 0.38351976240962415, + "learning_rate": 0.00029880544181655396, + "loss": 3.217712163925171, + "step": 3009, + "token_acc": 0.27636742671009773 + }, + { + "epoch": 1.7645851656405747, + "grad_norm": 0.40290249676749684, + "learning_rate": 0.0002988036100046259, + "loss": 3.2289366722106934, + "step": 3010, + "token_acc": 0.2744454179723781 + }, + { + "epoch": 1.7651715039577835, + "grad_norm": 0.44901830164436773, + "learning_rate": 0.00029880177679488846, + "loss": 3.2276575565338135, + "step": 3011, + "token_acc": 0.27476908782860576 + }, + { + "epoch": 1.7657578422749927, + "grad_norm": 0.3842510012972663, + "learning_rate": 0.0002987999421873589, + "loss": 3.196608781814575, + "step": 3012, + "token_acc": 0.27771141717080705 + }, + { + "epoch": 1.7663441805922018, + "grad_norm": 0.36832728374962936, + "learning_rate": 0.00029879810618205433, + "loss": 3.2251601219177246, + "step": 3013, + "token_acc": 0.27583368182403983 + }, + { + "epoch": 1.7669305189094109, + "grad_norm": 0.48137554259166715, + "learning_rate": 0.00029879626877899205, + "loss": 3.229835033416748, + "step": 3014, + "token_acc": 0.2743034136651158 + }, + { + "epoch": 1.7675168572266198, + "grad_norm": 0.5057034826244041, + "learning_rate": 0.00029879442997818935, + "loss": 3.205227851867676, + "step": 3015, + "token_acc": 0.2788544038221311 + }, + { + "epoch": 1.7681031955438287, + "grad_norm": 0.5257169779801291, + "learning_rate": 0.0002987925897796635, + "loss": 3.2185215950012207, + "step": 3016, + "token_acc": 0.2769872201577087 + }, + { + "epoch": 1.7686895338610378, + "grad_norm": 0.47184697729205993, + "learning_rate": 0.00029879074818343177, + "loss": 3.227670192718506, + "step": 3017, + "token_acc": 0.2734811350695923 + }, + { + "epoch": 1.7692758721782469, + "grad_norm": 0.3801213722968637, + "learning_rate": 0.0002987889051895114, + "loss": 3.2056775093078613, + "step": 3018, + "token_acc": 0.2781037714528801 + }, + { + "epoch": 1.769862210495456, + "grad_norm": 0.454100575991093, + "learning_rate": 0.0002987870607979198, + "loss": 3.20422101020813, + "step": 3019, + "token_acc": 0.2795593654526557 + }, + { + "epoch": 1.770448548812665, + "grad_norm": 0.4718796378675752, + "learning_rate": 0.00029878521500867426, + "loss": 3.25378680229187, + "step": 3020, + "token_acc": 0.2707251617422838 + }, + { + "epoch": 1.7710348871298738, + "grad_norm": 0.4445528067020484, + "learning_rate": 0.0002987833678217922, + "loss": 3.2453157901763916, + "step": 3021, + "token_acc": 0.2721935817441633 + }, + { + "epoch": 1.771621225447083, + "grad_norm": 0.4086999590296976, + "learning_rate": 0.0002987815192372907, + "loss": 3.191281318664551, + "step": 3022, + "token_acc": 0.27838477379414434 + }, + { + "epoch": 1.772207563764292, + "grad_norm": 0.4110438884012705, + "learning_rate": 0.00029877966925518745, + "loss": 3.2246382236480713, + "step": 3023, + "token_acc": 0.273426226978502 + }, + { + "epoch": 1.7727939020815011, + "grad_norm": 0.381426325772649, + "learning_rate": 0.00029877781787549966, + "loss": 3.2010087966918945, + "step": 3024, + "token_acc": 0.27750102082482647 + }, + { + "epoch": 1.77338024039871, + "grad_norm": 0.4412252322976521, + "learning_rate": 0.0002987759650982448, + "loss": 3.1967265605926514, + "step": 3025, + "token_acc": 0.27849687793362243 + }, + { + "epoch": 1.7739665787159191, + "grad_norm": 0.43051256948339794, + "learning_rate": 0.00029877411092344016, + "loss": 3.250744104385376, + "step": 3026, + "token_acc": 0.2709132503932788 + }, + { + "epoch": 1.774552917033128, + "grad_norm": 0.4485402817527516, + "learning_rate": 0.00029877225535110326, + "loss": 3.2088661193847656, + "step": 3027, + "token_acc": 0.2769502320270681 + }, + { + "epoch": 1.7751392553503371, + "grad_norm": 0.4580295659152123, + "learning_rate": 0.00029877039838125145, + "loss": 3.2266652584075928, + "step": 3028, + "token_acc": 0.2749788370637938 + }, + { + "epoch": 1.7757255936675462, + "grad_norm": 0.4158067344416709, + "learning_rate": 0.00029876854001390223, + "loss": 3.1609230041503906, + "step": 3029, + "token_acc": 0.2848933664791312 + }, + { + "epoch": 1.7763119319847553, + "grad_norm": 0.48798610508802487, + "learning_rate": 0.0002987666802490731, + "loss": 3.217484474182129, + "step": 3030, + "token_acc": 0.27469376841559073 + }, + { + "epoch": 1.7768982703019642, + "grad_norm": 0.42652496634360815, + "learning_rate": 0.0002987648190867814, + "loss": 3.2117695808410645, + "step": 3031, + "token_acc": 0.27565490689494715 + }, + { + "epoch": 1.7774846086191731, + "grad_norm": 0.3844513614065584, + "learning_rate": 0.0002987629565270447, + "loss": 3.2262282371520996, + "step": 3032, + "token_acc": 0.27447388005705664 + }, + { + "epoch": 1.7780709469363822, + "grad_norm": 0.41181120203171784, + "learning_rate": 0.00029876109256988056, + "loss": 3.176386833190918, + "step": 3033, + "token_acc": 0.2803974298441894 + }, + { + "epoch": 1.7786572852535913, + "grad_norm": 0.4113338658326975, + "learning_rate": 0.00029875922721530636, + "loss": 3.200834035873413, + "step": 3034, + "token_acc": 0.27865419479576997 + }, + { + "epoch": 1.7792436235708005, + "grad_norm": 0.3977930974238967, + "learning_rate": 0.00029875736046333965, + "loss": 3.240610122680664, + "step": 3035, + "token_acc": 0.27340778734864213 + }, + { + "epoch": 1.7798299618880093, + "grad_norm": 0.39834230440344687, + "learning_rate": 0.000298755492313998, + "loss": 3.20261287689209, + "step": 3036, + "token_acc": 0.27678968373281115 + }, + { + "epoch": 1.7804163002052185, + "grad_norm": 0.41900610497518376, + "learning_rate": 0.00029875362276729896, + "loss": 3.193077564239502, + "step": 3037, + "token_acc": 0.2809007872862043 + }, + { + "epoch": 1.7810026385224274, + "grad_norm": 0.399033025506431, + "learning_rate": 0.0002987517518232601, + "loss": 3.227677345275879, + "step": 3038, + "token_acc": 0.273995111843703 + }, + { + "epoch": 1.7815889768396365, + "grad_norm": 0.3344428067050935, + "learning_rate": 0.00029874987948189894, + "loss": 3.1884307861328125, + "step": 3039, + "token_acc": 0.27842918879067613 + }, + { + "epoch": 1.7821753151568456, + "grad_norm": 0.4359989969882841, + "learning_rate": 0.00029874800574323314, + "loss": 3.2151083946228027, + "step": 3040, + "token_acc": 0.27566703024444067 + }, + { + "epoch": 1.7827616534740547, + "grad_norm": 0.4230103452265189, + "learning_rate": 0.00029874613060728027, + "loss": 3.217792510986328, + "step": 3041, + "token_acc": 0.27678232095712496 + }, + { + "epoch": 1.7833479917912636, + "grad_norm": 0.37760816007101156, + "learning_rate": 0.00029874425407405795, + "loss": 3.196533679962158, + "step": 3042, + "token_acc": 0.2790759892876156 + }, + { + "epoch": 1.7839343301084725, + "grad_norm": 0.4072870011474391, + "learning_rate": 0.00029874237614358374, + "loss": 3.2531285285949707, + "step": 3043, + "token_acc": 0.274150277896537 + }, + { + "epoch": 1.7845206684256816, + "grad_norm": 0.4815636661061652, + "learning_rate": 0.00029874049681587536, + "loss": 3.203864574432373, + "step": 3044, + "token_acc": 0.2765805045157272 + }, + { + "epoch": 1.7851070067428907, + "grad_norm": 0.5592637574956612, + "learning_rate": 0.0002987386160909505, + "loss": 3.1920690536499023, + "step": 3045, + "token_acc": 0.27948762639598135 + }, + { + "epoch": 1.7856933450600998, + "grad_norm": 0.5116728882443193, + "learning_rate": 0.00029873673396882666, + "loss": 3.2219960689544678, + "step": 3046, + "token_acc": 0.2767154165678613 + }, + { + "epoch": 1.7862796833773087, + "grad_norm": 0.4766312764029502, + "learning_rate": 0.0002987348504495217, + "loss": 3.2557475566864014, + "step": 3047, + "token_acc": 0.2717916698635102 + }, + { + "epoch": 1.7868660216945176, + "grad_norm": 0.43202190142552827, + "learning_rate": 0.00029873296553305326, + "loss": 3.2439732551574707, + "step": 3048, + "token_acc": 0.27214608965074516 + }, + { + "epoch": 1.7874523600117267, + "grad_norm": 0.4994665538617656, + "learning_rate": 0.000298731079219439, + "loss": 3.1739494800567627, + "step": 3049, + "token_acc": 0.2806197167314858 + }, + { + "epoch": 1.7880386983289358, + "grad_norm": 0.527347381215796, + "learning_rate": 0.00029872919150869667, + "loss": 3.2109100818634033, + "step": 3050, + "token_acc": 0.27657140469188296 + }, + { + "epoch": 1.788625036646145, + "grad_norm": 0.46072518418200253, + "learning_rate": 0.00029872730240084405, + "loss": 3.201361656188965, + "step": 3051, + "token_acc": 0.27859193470575677 + }, + { + "epoch": 1.7892113749633538, + "grad_norm": 0.38981800606865175, + "learning_rate": 0.00029872541189589875, + "loss": 3.1890811920166016, + "step": 3052, + "token_acc": 0.27871792914699434 + }, + { + "epoch": 1.789797713280563, + "grad_norm": 0.5780406996167121, + "learning_rate": 0.00029872351999387866, + "loss": 3.183228015899658, + "step": 3053, + "token_acc": 0.2805518983547949 + }, + { + "epoch": 1.7903840515977718, + "grad_norm": 0.43280506158346155, + "learning_rate": 0.0002987216266948015, + "loss": 3.2463531494140625, + "step": 3054, + "token_acc": 0.2711178725543592 + }, + { + "epoch": 1.790970389914981, + "grad_norm": 0.4297012456614373, + "learning_rate": 0.0002987197319986851, + "loss": 3.169093132019043, + "step": 3055, + "token_acc": 0.2816073422464311 + }, + { + "epoch": 1.79155672823219, + "grad_norm": 0.4123924087524073, + "learning_rate": 0.0002987178359055472, + "loss": 3.23175048828125, + "step": 3056, + "token_acc": 0.27464615431451994 + }, + { + "epoch": 1.7921430665493991, + "grad_norm": 0.4720911078423588, + "learning_rate": 0.0002987159384154056, + "loss": 3.1703243255615234, + "step": 3057, + "token_acc": 0.2829324747234779 + }, + { + "epoch": 1.792729404866608, + "grad_norm": 0.38726885822971013, + "learning_rate": 0.00029871403952827817, + "loss": 3.1636850833892822, + "step": 3058, + "token_acc": 0.2844534835489784 + }, + { + "epoch": 1.793315743183817, + "grad_norm": 0.43873521591992914, + "learning_rate": 0.0002987121392441827, + "loss": 3.1982152462005615, + "step": 3059, + "token_acc": 0.2795582042355231 + }, + { + "epoch": 1.793902081501026, + "grad_norm": 0.43347233583267847, + "learning_rate": 0.0002987102375631371, + "loss": 3.258458137512207, + "step": 3060, + "token_acc": 0.26988021052295474 + }, + { + "epoch": 1.7944884198182351, + "grad_norm": 0.39534045915902943, + "learning_rate": 0.00029870833448515926, + "loss": 3.2241101264953613, + "step": 3061, + "token_acc": 0.27467076428063153 + }, + { + "epoch": 1.7950747581354443, + "grad_norm": 0.4193269975284209, + "learning_rate": 0.00029870643001026696, + "loss": 3.208967685699463, + "step": 3062, + "token_acc": 0.27696220120128684 + }, + { + "epoch": 1.7956610964526531, + "grad_norm": 0.5125322557948351, + "learning_rate": 0.0002987045241384782, + "loss": 3.2821810245513916, + "step": 3063, + "token_acc": 0.26538648101121354 + }, + { + "epoch": 1.7962474347698623, + "grad_norm": 0.5024411216156999, + "learning_rate": 0.0002987026168698107, + "loss": 3.2589094638824463, + "step": 3064, + "token_acc": 0.27228363606476536 + }, + { + "epoch": 1.7968337730870712, + "grad_norm": 0.44702953811579343, + "learning_rate": 0.0002987007082042826, + "loss": 3.162526845932007, + "step": 3065, + "token_acc": 0.2839980640182087 + }, + { + "epoch": 1.7974201114042803, + "grad_norm": 0.38903732352117193, + "learning_rate": 0.0002986987981419117, + "loss": 3.2290239334106445, + "step": 3066, + "token_acc": 0.2735682559436958 + }, + { + "epoch": 1.7980064497214894, + "grad_norm": 0.4401312455932557, + "learning_rate": 0.0002986968866827159, + "loss": 3.2162981033325195, + "step": 3067, + "token_acc": 0.2752204465482574 + }, + { + "epoch": 1.7985927880386985, + "grad_norm": 0.4802073775247277, + "learning_rate": 0.00029869497382671324, + "loss": 3.254286766052246, + "step": 3068, + "token_acc": 0.2712889913985752 + }, + { + "epoch": 1.7991791263559074, + "grad_norm": 0.4204199252379988, + "learning_rate": 0.0002986930595739217, + "loss": 3.2674221992492676, + "step": 3069, + "token_acc": 0.2700633600098783 + }, + { + "epoch": 1.7997654646731163, + "grad_norm": 0.42891477123289806, + "learning_rate": 0.0002986911439243592, + "loss": 3.1940321922302246, + "step": 3070, + "token_acc": 0.2777882518394717 + }, + { + "epoch": 1.8003518029903254, + "grad_norm": 0.49901467444925296, + "learning_rate": 0.0002986892268780438, + "loss": 3.2779791355133057, + "step": 3071, + "token_acc": 0.26902503369219644 + }, + { + "epoch": 1.8009381413075345, + "grad_norm": 0.41877885587213687, + "learning_rate": 0.00029868730843499343, + "loss": 3.2141828536987305, + "step": 3072, + "token_acc": 0.275964811576404 + }, + { + "epoch": 1.8015244796247436, + "grad_norm": 0.40499434211130053, + "learning_rate": 0.00029868538859522623, + "loss": 3.199528217315674, + "step": 3073, + "token_acc": 0.27853907715952725 + }, + { + "epoch": 1.8021108179419525, + "grad_norm": 0.4441401078885119, + "learning_rate": 0.0002986834673587601, + "loss": 3.211149215698242, + "step": 3074, + "token_acc": 0.27742192821825135 + }, + { + "epoch": 1.8026971562591614, + "grad_norm": 0.4298235793235902, + "learning_rate": 0.0002986815447256132, + "loss": 3.195619583129883, + "step": 3075, + "token_acc": 0.2783943002442033 + }, + { + "epoch": 1.8032834945763705, + "grad_norm": 0.5212437172382742, + "learning_rate": 0.00029867962069580345, + "loss": 3.1940438747406006, + "step": 3076, + "token_acc": 0.2778802347380819 + }, + { + "epoch": 1.8038698328935796, + "grad_norm": 0.5184726183245811, + "learning_rate": 0.0002986776952693491, + "loss": 3.2543487548828125, + "step": 3077, + "token_acc": 0.2707527491185204 + }, + { + "epoch": 1.8044561712107887, + "grad_norm": 0.43210728160586004, + "learning_rate": 0.0002986757684462681, + "loss": 3.1954867839813232, + "step": 3078, + "token_acc": 0.2773328628485806 + }, + { + "epoch": 1.8050425095279976, + "grad_norm": 0.510658200430899, + "learning_rate": 0.00029867384022657864, + "loss": 3.2134909629821777, + "step": 3079, + "token_acc": 0.2769135234098136 + }, + { + "epoch": 1.8056288478452067, + "grad_norm": 0.4431334149184413, + "learning_rate": 0.0002986719106102988, + "loss": 3.2375941276550293, + "step": 3080, + "token_acc": 0.2735058741605516 + }, + { + "epoch": 1.8062151861624156, + "grad_norm": 0.48954184459414696, + "learning_rate": 0.0002986699795974466, + "loss": 3.2246596813201904, + "step": 3081, + "token_acc": 0.27461348834378235 + }, + { + "epoch": 1.8068015244796247, + "grad_norm": 0.4262138406975625, + "learning_rate": 0.0002986680471880404, + "loss": 3.2108099460601807, + "step": 3082, + "token_acc": 0.27553052710434883 + }, + { + "epoch": 1.8073878627968338, + "grad_norm": 0.4189868749650725, + "learning_rate": 0.00029866611338209815, + "loss": 3.2112135887145996, + "step": 3083, + "token_acc": 0.2770698003227915 + }, + { + "epoch": 1.807974201114043, + "grad_norm": 0.41203927046986744, + "learning_rate": 0.0002986641781796381, + "loss": 3.1484875679016113, + "step": 3084, + "token_acc": 0.284946854052248 + }, + { + "epoch": 1.8085605394312518, + "grad_norm": 0.4975151478604645, + "learning_rate": 0.00029866224158067847, + "loss": 3.2264223098754883, + "step": 3085, + "token_acc": 0.2753575959933222 + }, + { + "epoch": 1.8091468777484607, + "grad_norm": 0.46300821375660994, + "learning_rate": 0.0002986603035852373, + "loss": 3.1928622722625732, + "step": 3086, + "token_acc": 0.27933179360663485 + }, + { + "epoch": 1.8097332160656698, + "grad_norm": 0.45176389425886143, + "learning_rate": 0.000298658364193333, + "loss": 3.169503688812256, + "step": 3087, + "token_acc": 0.28127640677536003 + }, + { + "epoch": 1.810319554382879, + "grad_norm": 0.4350557653181423, + "learning_rate": 0.0002986564234049837, + "loss": 3.243781328201294, + "step": 3088, + "token_acc": 0.27342597294186144 + }, + { + "epoch": 1.810905892700088, + "grad_norm": 0.41248706938235047, + "learning_rate": 0.00029865448122020754, + "loss": 3.1504716873168945, + "step": 3089, + "token_acc": 0.2841933661179435 + }, + { + "epoch": 1.811492231017297, + "grad_norm": 0.49268484197778906, + "learning_rate": 0.00029865253763902293, + "loss": 3.193650484085083, + "step": 3090, + "token_acc": 0.2789217966883076 + }, + { + "epoch": 1.812078569334506, + "grad_norm": 0.3841039358593599, + "learning_rate": 0.000298650592661448, + "loss": 3.2557573318481445, + "step": 3091, + "token_acc": 0.27061570188977807 + }, + { + "epoch": 1.812664907651715, + "grad_norm": 0.4499488876826179, + "learning_rate": 0.00029864864628750105, + "loss": 3.198819637298584, + "step": 3092, + "token_acc": 0.2796761359450157 + }, + { + "epoch": 1.813251245968924, + "grad_norm": 0.40753208160656057, + "learning_rate": 0.00029864669851720037, + "loss": 3.147763729095459, + "step": 3093, + "token_acc": 0.2839922499320726 + }, + { + "epoch": 1.8138375842861332, + "grad_norm": 0.35178713276639517, + "learning_rate": 0.0002986447493505643, + "loss": 3.1944730281829834, + "step": 3094, + "token_acc": 0.2804610957250102 + }, + { + "epoch": 1.8144239226033423, + "grad_norm": 0.33907632871602367, + "learning_rate": 0.00029864279878761104, + "loss": 3.204986810684204, + "step": 3095, + "token_acc": 0.27684321778800486 + }, + { + "epoch": 1.8150102609205512, + "grad_norm": 0.33653443604374145, + "learning_rate": 0.00029864084682835904, + "loss": 3.206244468688965, + "step": 3096, + "token_acc": 0.27764046619649285 + }, + { + "epoch": 1.81559659923776, + "grad_norm": 0.3298522977495309, + "learning_rate": 0.0002986388934728266, + "loss": 3.1717872619628906, + "step": 3097, + "token_acc": 0.2836129450956071 + }, + { + "epoch": 1.8161829375549692, + "grad_norm": 0.3294007155370809, + "learning_rate": 0.00029863693872103197, + "loss": 3.1714537143707275, + "step": 3098, + "token_acc": 0.28205546707382884 + }, + { + "epoch": 1.8167692758721783, + "grad_norm": 0.3685643171166607, + "learning_rate": 0.00029863498257299366, + "loss": 3.238262176513672, + "step": 3099, + "token_acc": 0.27239314017445554 + }, + { + "epoch": 1.8173556141893874, + "grad_norm": 0.4234196138726498, + "learning_rate": 0.00029863302502872993, + "loss": 3.2294039726257324, + "step": 3100, + "token_acc": 0.2751639519297316 + }, + { + "epoch": 1.8179419525065963, + "grad_norm": 0.5167174813397577, + "learning_rate": 0.00029863106608825926, + "loss": 3.2505478858947754, + "step": 3101, + "token_acc": 0.27222478362009345 + }, + { + "epoch": 1.8185282908238052, + "grad_norm": 0.46935948210100714, + "learning_rate": 0.0002986291057516, + "loss": 3.2138917446136475, + "step": 3102, + "token_acc": 0.2756795561913976 + }, + { + "epoch": 1.8191146291410143, + "grad_norm": 0.44551284679005143, + "learning_rate": 0.00029862714401877053, + "loss": 3.211258888244629, + "step": 3103, + "token_acc": 0.2769806843880918 + }, + { + "epoch": 1.8197009674582234, + "grad_norm": 0.4537234968057431, + "learning_rate": 0.0002986251808897894, + "loss": 3.2034716606140137, + "step": 3104, + "token_acc": 0.2770397584377291 + }, + { + "epoch": 1.8202873057754325, + "grad_norm": 0.39869028198677336, + "learning_rate": 0.00029862321636467485, + "loss": 3.167607545852661, + "step": 3105, + "token_acc": 0.28300831722730135 + }, + { + "epoch": 1.8208736440926414, + "grad_norm": 0.3686062727039798, + "learning_rate": 0.00029862125044344555, + "loss": 3.20642352104187, + "step": 3106, + "token_acc": 0.27738731165403935 + }, + { + "epoch": 1.8214599824098505, + "grad_norm": 0.37085357467841207, + "learning_rate": 0.00029861928312611985, + "loss": 3.216221809387207, + "step": 3107, + "token_acc": 0.27567667126222584 + }, + { + "epoch": 1.8220463207270594, + "grad_norm": 0.3767866787276598, + "learning_rate": 0.00029861731441271623, + "loss": 3.155963897705078, + "step": 3108, + "token_acc": 0.28427144715205216 + }, + { + "epoch": 1.8226326590442685, + "grad_norm": 0.41341285109018266, + "learning_rate": 0.00029861534430325324, + "loss": 3.1870346069335938, + "step": 3109, + "token_acc": 0.2800279275031197 + }, + { + "epoch": 1.8232189973614776, + "grad_norm": 0.41418805797569425, + "learning_rate": 0.00029861337279774936, + "loss": 3.229611873626709, + "step": 3110, + "token_acc": 0.2764427270620523 + }, + { + "epoch": 1.8238053356786867, + "grad_norm": 0.49703068165890885, + "learning_rate": 0.0002986113998962231, + "loss": 3.180020332336426, + "step": 3111, + "token_acc": 0.27924554580614747 + }, + { + "epoch": 1.8243916739958956, + "grad_norm": 0.46791056259104963, + "learning_rate": 0.0002986094255986929, + "loss": 3.216400146484375, + "step": 3112, + "token_acc": 0.27519471260794287 + }, + { + "epoch": 1.8249780123131045, + "grad_norm": 0.42541383570304797, + "learning_rate": 0.0002986074499051775, + "loss": 3.213284969329834, + "step": 3113, + "token_acc": 0.279238840305554 + }, + { + "epoch": 1.8255643506303136, + "grad_norm": 0.3748929424448023, + "learning_rate": 0.0002986054728156953, + "loss": 3.182849884033203, + "step": 3114, + "token_acc": 0.2797672325002789 + }, + { + "epoch": 1.8261506889475227, + "grad_norm": 0.42727198737075833, + "learning_rate": 0.000298603494330265, + "loss": 3.2009809017181396, + "step": 3115, + "token_acc": 0.27764275895701757 + }, + { + "epoch": 1.8267370272647319, + "grad_norm": 0.47546506951376843, + "learning_rate": 0.0002986015144489051, + "loss": 3.2002198696136475, + "step": 3116, + "token_acc": 0.2797896954922943 + }, + { + "epoch": 1.8273233655819408, + "grad_norm": 0.47828658624460335, + "learning_rate": 0.00029859953317163415, + "loss": 3.212883949279785, + "step": 3117, + "token_acc": 0.27705251469695474 + }, + { + "epoch": 1.8279097038991499, + "grad_norm": 0.4058148664822124, + "learning_rate": 0.00029859755049847087, + "loss": 3.1685757637023926, + "step": 3118, + "token_acc": 0.28074656652304236 + }, + { + "epoch": 1.8284960422163588, + "grad_norm": 0.3903058587257659, + "learning_rate": 0.00029859556642943387, + "loss": 3.153568744659424, + "step": 3119, + "token_acc": 0.2823968765914106 + }, + { + "epoch": 1.8290823805335679, + "grad_norm": 0.38492404378103423, + "learning_rate": 0.0002985935809645417, + "loss": 3.2294020652770996, + "step": 3120, + "token_acc": 0.2738543831469541 + }, + { + "epoch": 1.829668718850777, + "grad_norm": 0.44808067750202546, + "learning_rate": 0.0002985915941038131, + "loss": 3.1805953979492188, + "step": 3121, + "token_acc": 0.28085333220737957 + }, + { + "epoch": 1.830255057167986, + "grad_norm": 0.42568268337021614, + "learning_rate": 0.00029858960584726665, + "loss": 3.2394044399261475, + "step": 3122, + "token_acc": 0.2709040631904844 + }, + { + "epoch": 1.830841395485195, + "grad_norm": 0.4268552598205282, + "learning_rate": 0.0002985876161949212, + "loss": 3.2019171714782715, + "step": 3123, + "token_acc": 0.2778036052883134 + }, + { + "epoch": 1.8314277338024039, + "grad_norm": 0.4004359952887598, + "learning_rate": 0.00029858562514679525, + "loss": 3.1293020248413086, + "step": 3124, + "token_acc": 0.28673597552955254 + }, + { + "epoch": 1.832014072119613, + "grad_norm": 0.44890998005873095, + "learning_rate": 0.00029858363270290753, + "loss": 3.206660509109497, + "step": 3125, + "token_acc": 0.2764361216769021 + }, + { + "epoch": 1.832600410436822, + "grad_norm": 0.43258716369510586, + "learning_rate": 0.00029858163886327686, + "loss": 3.217843532562256, + "step": 3126, + "token_acc": 0.2758903879556892 + }, + { + "epoch": 1.8331867487540312, + "grad_norm": 0.450978071018672, + "learning_rate": 0.0002985796436279219, + "loss": 3.2305612564086914, + "step": 3127, + "token_acc": 0.2743687172832503 + }, + { + "epoch": 1.83377308707124, + "grad_norm": 0.5089404042219443, + "learning_rate": 0.00029857764699686137, + "loss": 3.2540650367736816, + "step": 3128, + "token_acc": 0.2690460084766364 + }, + { + "epoch": 1.834359425388449, + "grad_norm": 0.4082736116010722, + "learning_rate": 0.0002985756489701141, + "loss": 3.18744158744812, + "step": 3129, + "token_acc": 0.2794995274553737 + }, + { + "epoch": 1.834945763705658, + "grad_norm": 0.35075497128023786, + "learning_rate": 0.00029857364954769883, + "loss": 3.1709580421447754, + "step": 3130, + "token_acc": 0.2793715934924603 + }, + { + "epoch": 1.8355321020228672, + "grad_norm": 0.37835861697190143, + "learning_rate": 0.0002985716487296343, + "loss": 3.208159923553467, + "step": 3131, + "token_acc": 0.27684298421545805 + }, + { + "epoch": 1.8361184403400763, + "grad_norm": 0.4000573020570646, + "learning_rate": 0.0002985696465159393, + "loss": 3.2082881927490234, + "step": 3132, + "token_acc": 0.279523993289912 + }, + { + "epoch": 1.8367047786572852, + "grad_norm": 0.40340472367935787, + "learning_rate": 0.00029856764290663273, + "loss": 3.227461576461792, + "step": 3133, + "token_acc": 0.2739957220367279 + }, + { + "epoch": 1.8372911169744943, + "grad_norm": 0.4194641914519934, + "learning_rate": 0.0002985656379017333, + "loss": 3.2242798805236816, + "step": 3134, + "token_acc": 0.2766155821394363 + }, + { + "epoch": 1.8378774552917032, + "grad_norm": 0.43292567839822604, + "learning_rate": 0.00029856363150125993, + "loss": 3.2192955017089844, + "step": 3135, + "token_acc": 0.2745324278009651 + }, + { + "epoch": 1.8384637936089123, + "grad_norm": 0.4558226350646853, + "learning_rate": 0.0002985616237052314, + "loss": 3.202052593231201, + "step": 3136, + "token_acc": 0.27658421030995645 + }, + { + "epoch": 1.8390501319261214, + "grad_norm": 0.45727269258474135, + "learning_rate": 0.0002985596145136666, + "loss": 3.2248802185058594, + "step": 3137, + "token_acc": 0.27515891702600853 + }, + { + "epoch": 1.8396364702433305, + "grad_norm": 0.43041455822369623, + "learning_rate": 0.00029855760392658444, + "loss": 3.190176486968994, + "step": 3138, + "token_acc": 0.2786948637260654 + }, + { + "epoch": 1.8402228085605394, + "grad_norm": 0.441329278628718, + "learning_rate": 0.0002985555919440038, + "loss": 3.1970410346984863, + "step": 3139, + "token_acc": 0.27860734043742996 + }, + { + "epoch": 1.8408091468777483, + "grad_norm": 0.46537009334174434, + "learning_rate": 0.0002985535785659435, + "loss": 3.1814727783203125, + "step": 3140, + "token_acc": 0.28188284485526066 + }, + { + "epoch": 1.8413954851949574, + "grad_norm": 0.4605879301198367, + "learning_rate": 0.00029855156379242256, + "loss": 3.226712703704834, + "step": 3141, + "token_acc": 0.2744886509578078 + }, + { + "epoch": 1.8419818235121665, + "grad_norm": 0.4076116366859706, + "learning_rate": 0.0002985495476234598, + "loss": 3.2322587966918945, + "step": 3142, + "token_acc": 0.2748466663178159 + }, + { + "epoch": 1.8425681618293757, + "grad_norm": 0.38425798073138234, + "learning_rate": 0.0002985475300590743, + "loss": 3.1664044857025146, + "step": 3143, + "token_acc": 0.28218780377162855 + }, + { + "epoch": 1.8431545001465846, + "grad_norm": 0.41519430838295224, + "learning_rate": 0.00029854551109928485, + "loss": 3.2139058113098145, + "step": 3144, + "token_acc": 0.2755124461610549 + }, + { + "epoch": 1.8437408384637937, + "grad_norm": 0.3868168126568655, + "learning_rate": 0.0002985434907441105, + "loss": 3.210146427154541, + "step": 3145, + "token_acc": 0.2760524777275385 + }, + { + "epoch": 1.8443271767810026, + "grad_norm": 0.3985831011018103, + "learning_rate": 0.0002985414689935702, + "loss": 3.194047689437866, + "step": 3146, + "token_acc": 0.2787377444636553 + }, + { + "epoch": 1.8449135150982117, + "grad_norm": 0.40317171119465345, + "learning_rate": 0.000298539445847683, + "loss": 3.2293734550476074, + "step": 3147, + "token_acc": 0.27307719723430035 + }, + { + "epoch": 1.8454998534154208, + "grad_norm": 0.37243362005012787, + "learning_rate": 0.0002985374213064679, + "loss": 3.182706832885742, + "step": 3148, + "token_acc": 0.2819988234181803 + }, + { + "epoch": 1.8460861917326299, + "grad_norm": 0.44328076855296367, + "learning_rate": 0.0002985353953699438, + "loss": 3.2005558013916016, + "step": 3149, + "token_acc": 0.27830500538946956 + }, + { + "epoch": 1.8466725300498388, + "grad_norm": 0.43004968188085785, + "learning_rate": 0.00029853336803812983, + "loss": 3.1745519638061523, + "step": 3150, + "token_acc": 0.2813922412543455 + }, + { + "epoch": 1.8472588683670477, + "grad_norm": 0.3852882331914369, + "learning_rate": 0.0002985313393110451, + "loss": 3.202648162841797, + "step": 3151, + "token_acc": 0.2785962777285414 + }, + { + "epoch": 1.8478452066842568, + "grad_norm": 0.4870368865653353, + "learning_rate": 0.0002985293091887085, + "loss": 3.2058701515197754, + "step": 3152, + "token_acc": 0.2773094317814291 + }, + { + "epoch": 1.848431545001466, + "grad_norm": 0.45651623027631416, + "learning_rate": 0.0002985272776711392, + "loss": 3.227034330368042, + "step": 3153, + "token_acc": 0.27455820749595544 + }, + { + "epoch": 1.849017883318675, + "grad_norm": 0.5259379715010543, + "learning_rate": 0.0002985252447583563, + "loss": 3.211610794067383, + "step": 3154, + "token_acc": 0.275925784971693 + }, + { + "epoch": 1.849604221635884, + "grad_norm": 0.4809338588034194, + "learning_rate": 0.00029852321045037883, + "loss": 3.217149257659912, + "step": 3155, + "token_acc": 0.2742873101907402 + }, + { + "epoch": 1.8501905599530928, + "grad_norm": 0.4108569049849537, + "learning_rate": 0.0002985211747472259, + "loss": 3.201446056365967, + "step": 3156, + "token_acc": 0.27515519732694815 + }, + { + "epoch": 1.850776898270302, + "grad_norm": 0.37086706909781375, + "learning_rate": 0.00029851913764891675, + "loss": 3.210491418838501, + "step": 3157, + "token_acc": 0.27733549158618614 + }, + { + "epoch": 1.851363236587511, + "grad_norm": 0.41262517789485464, + "learning_rate": 0.00029851709915547044, + "loss": 3.17791748046875, + "step": 3158, + "token_acc": 0.2787053851222138 + }, + { + "epoch": 1.8519495749047201, + "grad_norm": 0.45673646088590075, + "learning_rate": 0.00029851505926690606, + "loss": 3.2235374450683594, + "step": 3159, + "token_acc": 0.2746550156998843 + }, + { + "epoch": 1.852535913221929, + "grad_norm": 0.35702179950292057, + "learning_rate": 0.00029851301798324286, + "loss": 3.1815433502197266, + "step": 3160, + "token_acc": 0.28194149199810553 + }, + { + "epoch": 1.8531222515391381, + "grad_norm": 0.39987948080732244, + "learning_rate": 0.0002985109753044999, + "loss": 3.229124069213867, + "step": 3161, + "token_acc": 0.2747942843988072 + }, + { + "epoch": 1.853708589856347, + "grad_norm": 0.39850377029089884, + "learning_rate": 0.00029850893123069657, + "loss": 3.171560287475586, + "step": 3162, + "token_acc": 0.281475852393504 + }, + { + "epoch": 1.8542949281735561, + "grad_norm": 0.4218096560658906, + "learning_rate": 0.00029850688576185186, + "loss": 3.2326807975769043, + "step": 3163, + "token_acc": 0.27570190989239185 + }, + { + "epoch": 1.8548812664907652, + "grad_norm": 0.4021593184716227, + "learning_rate": 0.0002985048388979851, + "loss": 3.254361391067505, + "step": 3164, + "token_acc": 0.27140013446353006 + }, + { + "epoch": 1.8554676048079743, + "grad_norm": 0.38612231061113916, + "learning_rate": 0.0002985027906391155, + "loss": 3.2154479026794434, + "step": 3165, + "token_acc": 0.2759643840285919 + }, + { + "epoch": 1.8560539431251832, + "grad_norm": 0.35717847703912836, + "learning_rate": 0.0002985007409852623, + "loss": 3.171966552734375, + "step": 3166, + "token_acc": 0.2812637467421016 + }, + { + "epoch": 1.8566402814423921, + "grad_norm": 0.3769056647241825, + "learning_rate": 0.0002984986899364447, + "loss": 3.1887423992156982, + "step": 3167, + "token_acc": 0.2794621554888916 + }, + { + "epoch": 1.8572266197596012, + "grad_norm": 0.3648182072034886, + "learning_rate": 0.00029849663749268205, + "loss": 3.178748607635498, + "step": 3168, + "token_acc": 0.281371768099718 + }, + { + "epoch": 1.8578129580768104, + "grad_norm": 0.3542305046949214, + "learning_rate": 0.0002984945836539936, + "loss": 3.182419538497925, + "step": 3169, + "token_acc": 0.2802276250305948 + }, + { + "epoch": 1.8583992963940195, + "grad_norm": 0.3378534815473059, + "learning_rate": 0.0002984925284203986, + "loss": 3.200014114379883, + "step": 3170, + "token_acc": 0.2782660668449055 + }, + { + "epoch": 1.8589856347112284, + "grad_norm": 0.371313051338446, + "learning_rate": 0.0002984904717919164, + "loss": 3.1628899574279785, + "step": 3171, + "token_acc": 0.28373754104088483 + }, + { + "epoch": 1.8595719730284375, + "grad_norm": 0.39976028829886423, + "learning_rate": 0.00029848841376856636, + "loss": 3.2044270038604736, + "step": 3172, + "token_acc": 0.2770537030870951 + }, + { + "epoch": 1.8601583113456464, + "grad_norm": 0.4250598892164116, + "learning_rate": 0.0002984863543503677, + "loss": 3.2030868530273438, + "step": 3173, + "token_acc": 0.27621551148468887 + }, + { + "epoch": 1.8607446496628555, + "grad_norm": 0.40323851635304736, + "learning_rate": 0.00029848429353733984, + "loss": 3.202575922012329, + "step": 3174, + "token_acc": 0.2771224144797048 + }, + { + "epoch": 1.8613309879800646, + "grad_norm": 0.377348552669163, + "learning_rate": 0.0002984822313295022, + "loss": 3.2006585597991943, + "step": 3175, + "token_acc": 0.27729184122534317 + }, + { + "epoch": 1.8619173262972737, + "grad_norm": 0.4181877463261301, + "learning_rate": 0.000298480167726874, + "loss": 3.151139497756958, + "step": 3176, + "token_acc": 0.2858931753600154 + }, + { + "epoch": 1.8625036646144826, + "grad_norm": 0.43541420658597085, + "learning_rate": 0.00029847810272947475, + "loss": 3.2313599586486816, + "step": 3177, + "token_acc": 0.27430587870327283 + }, + { + "epoch": 1.8630900029316915, + "grad_norm": 0.4079072090005843, + "learning_rate": 0.0002984760363373237, + "loss": 3.202117919921875, + "step": 3178, + "token_acc": 0.2774894244145681 + }, + { + "epoch": 1.8636763412489006, + "grad_norm": 0.47212875656883946, + "learning_rate": 0.0002984739685504405, + "loss": 3.239431142807007, + "step": 3179, + "token_acc": 0.27287224479054023 + }, + { + "epoch": 1.8642626795661097, + "grad_norm": 0.4514300879350858, + "learning_rate": 0.00029847189936884434, + "loss": 3.1829934120178223, + "step": 3180, + "token_acc": 0.2795262232811818 + }, + { + "epoch": 1.8648490178833188, + "grad_norm": 0.522484620590958, + "learning_rate": 0.0002984698287925548, + "loss": 3.2194814682006836, + "step": 3181, + "token_acc": 0.27591910993666374 + }, + { + "epoch": 1.8654353562005277, + "grad_norm": 0.45634783663542133, + "learning_rate": 0.0002984677568215913, + "loss": 3.2230944633483887, + "step": 3182, + "token_acc": 0.27489761868648566 + }, + { + "epoch": 1.8660216945177366, + "grad_norm": 0.4655294040153684, + "learning_rate": 0.0002984656834559733, + "loss": 3.2047390937805176, + "step": 3183, + "token_acc": 0.27477459234246776 + }, + { + "epoch": 1.8666080328349457, + "grad_norm": 0.4788033130084151, + "learning_rate": 0.0002984636086957202, + "loss": 3.1858067512512207, + "step": 3184, + "token_acc": 0.2798367928805218 + }, + { + "epoch": 1.8671943711521548, + "grad_norm": 0.44564861390293875, + "learning_rate": 0.00029846153254085156, + "loss": 3.1930346488952637, + "step": 3185, + "token_acc": 0.27878234463426804 + }, + { + "epoch": 1.867780709469364, + "grad_norm": 0.4176694295595294, + "learning_rate": 0.0002984594549913869, + "loss": 3.233114719390869, + "step": 3186, + "token_acc": 0.27319412298760914 + }, + { + "epoch": 1.8683670477865728, + "grad_norm": 0.43749726833104186, + "learning_rate": 0.00029845737604734573, + "loss": 3.225269317626953, + "step": 3187, + "token_acc": 0.2757619886814299 + }, + { + "epoch": 1.868953386103782, + "grad_norm": 0.5181997279425303, + "learning_rate": 0.0002984552957087475, + "loss": 3.172684907913208, + "step": 3188, + "token_acc": 0.28107840206428875 + }, + { + "epoch": 1.8695397244209908, + "grad_norm": 0.4033264073286499, + "learning_rate": 0.00029845321397561187, + "loss": 3.1759142875671387, + "step": 3189, + "token_acc": 0.2808328365342949 + }, + { + "epoch": 1.8701260627382, + "grad_norm": 0.4398765656377838, + "learning_rate": 0.0002984511308479583, + "loss": 3.190573215484619, + "step": 3190, + "token_acc": 0.2777804347005298 + }, + { + "epoch": 1.870712401055409, + "grad_norm": 0.4102816450588917, + "learning_rate": 0.0002984490463258064, + "loss": 3.199730396270752, + "step": 3191, + "token_acc": 0.27973053844646517 + }, + { + "epoch": 1.8712987393726181, + "grad_norm": 0.4130506687215074, + "learning_rate": 0.00029844696040917575, + "loss": 3.176208972930908, + "step": 3192, + "token_acc": 0.2803861599425547 + }, + { + "epoch": 1.871885077689827, + "grad_norm": 0.4296636872450154, + "learning_rate": 0.000298444873098086, + "loss": 3.2095255851745605, + "step": 3193, + "token_acc": 0.27644879583988996 + }, + { + "epoch": 1.872471416007036, + "grad_norm": 0.3965659237224688, + "learning_rate": 0.00029844278439255666, + "loss": 3.184441089630127, + "step": 3194, + "token_acc": 0.2814311673730448 + }, + { + "epoch": 1.873057754324245, + "grad_norm": 0.3580081571413274, + "learning_rate": 0.00029844069429260737, + "loss": 3.1753878593444824, + "step": 3195, + "token_acc": 0.2802337458105739 + }, + { + "epoch": 1.8736440926414542, + "grad_norm": 0.3706597560374224, + "learning_rate": 0.00029843860279825775, + "loss": 3.1752281188964844, + "step": 3196, + "token_acc": 0.27982441618640647 + }, + { + "epoch": 1.8742304309586633, + "grad_norm": 0.39021112388741136, + "learning_rate": 0.0002984365099095276, + "loss": 3.203425645828247, + "step": 3197, + "token_acc": 0.27845959324222536 + }, + { + "epoch": 1.8748167692758722, + "grad_norm": 0.41017745983710135, + "learning_rate": 0.00029843441562643635, + "loss": 3.2024641036987305, + "step": 3198, + "token_acc": 0.27575206846115224 + }, + { + "epoch": 1.875403107593081, + "grad_norm": 0.4012340903681659, + "learning_rate": 0.0002984323199490039, + "loss": 3.1705002784729004, + "step": 3199, + "token_acc": 0.2818638900985787 + }, + { + "epoch": 1.8759894459102902, + "grad_norm": 0.41982304482250743, + "learning_rate": 0.00029843022287724967, + "loss": 3.1912131309509277, + "step": 3200, + "token_acc": 0.2806412206643096 + }, + { + "epoch": 1.8765757842274993, + "grad_norm": 0.45467515897719585, + "learning_rate": 0.0002984281244111936, + "loss": 3.1630783081054688, + "step": 3201, + "token_acc": 0.28236466978261715 + }, + { + "epoch": 1.8771621225447084, + "grad_norm": 0.4035403465148831, + "learning_rate": 0.0002984260245508553, + "loss": 3.13374662399292, + "step": 3202, + "token_acc": 0.286162007090855 + }, + { + "epoch": 1.8777484608619175, + "grad_norm": 0.38405944037625445, + "learning_rate": 0.0002984239232962545, + "loss": 3.209691286087036, + "step": 3203, + "token_acc": 0.2763281266099285 + }, + { + "epoch": 1.8783347991791264, + "grad_norm": 0.3477970900455854, + "learning_rate": 0.0002984218206474109, + "loss": 3.2011451721191406, + "step": 3204, + "token_acc": 0.2797791485447703 + }, + { + "epoch": 1.8789211374963353, + "grad_norm": 0.3872956566422683, + "learning_rate": 0.00029841971660434435, + "loss": 3.20654296875, + "step": 3205, + "token_acc": 0.2753751594941813 + }, + { + "epoch": 1.8795074758135444, + "grad_norm": 0.34121031345541986, + "learning_rate": 0.00029841761116707456, + "loss": 3.226215362548828, + "step": 3206, + "token_acc": 0.2751305457009607 + }, + { + "epoch": 1.8800938141307535, + "grad_norm": 0.4070360101527037, + "learning_rate": 0.00029841550433562123, + "loss": 3.1702065467834473, + "step": 3207, + "token_acc": 0.2804094070746127 + }, + { + "epoch": 1.8806801524479626, + "grad_norm": 0.40992383035305885, + "learning_rate": 0.0002984133961100043, + "loss": 3.194882392883301, + "step": 3208, + "token_acc": 0.2787433295516618 + }, + { + "epoch": 1.8812664907651715, + "grad_norm": 0.40138179079917286, + "learning_rate": 0.00029841128649024353, + "loss": 3.190107822418213, + "step": 3209, + "token_acc": 0.27804613525743455 + }, + { + "epoch": 1.8818528290823804, + "grad_norm": 0.459921014553069, + "learning_rate": 0.00029840917547635867, + "loss": 3.213904619216919, + "step": 3210, + "token_acc": 0.27567705321886293 + }, + { + "epoch": 1.8824391673995895, + "grad_norm": 0.3573585452830387, + "learning_rate": 0.0002984070630683696, + "loss": 3.195622444152832, + "step": 3211, + "token_acc": 0.27764944418665644 + }, + { + "epoch": 1.8830255057167986, + "grad_norm": 0.42240784141541315, + "learning_rate": 0.00029840494926629615, + "loss": 3.2173094749450684, + "step": 3212, + "token_acc": 0.27477803852018473 + }, + { + "epoch": 1.8836118440340077, + "grad_norm": 0.4443676416793418, + "learning_rate": 0.0002984028340701582, + "loss": 3.215662717819214, + "step": 3213, + "token_acc": 0.2753571484782825 + }, + { + "epoch": 1.8841981823512166, + "grad_norm": 0.4263192329542528, + "learning_rate": 0.0002984007174799756, + "loss": 3.1895689964294434, + "step": 3214, + "token_acc": 0.27935678033104644 + }, + { + "epoch": 1.8847845206684257, + "grad_norm": 0.4117901506083151, + "learning_rate": 0.00029839859949576814, + "loss": 3.2455568313598633, + "step": 3215, + "token_acc": 0.2738851312925044 + }, + { + "epoch": 1.8853708589856346, + "grad_norm": 0.4236778417235489, + "learning_rate": 0.0002983964801175559, + "loss": 3.192448377609253, + "step": 3216, + "token_acc": 0.2767620898803224 + }, + { + "epoch": 1.8859571973028437, + "grad_norm": 0.4432769365229791, + "learning_rate": 0.0002983943593453587, + "loss": 3.2194581031799316, + "step": 3217, + "token_acc": 0.2751429991578438 + }, + { + "epoch": 1.8865435356200528, + "grad_norm": 0.37060157269203853, + "learning_rate": 0.0002983922371791964, + "loss": 3.1897478103637695, + "step": 3218, + "token_acc": 0.28095427372588455 + }, + { + "epoch": 1.887129873937262, + "grad_norm": 0.3764482518877029, + "learning_rate": 0.000298390113619089, + "loss": 3.2313661575317383, + "step": 3219, + "token_acc": 0.2750642805850756 + }, + { + "epoch": 1.8877162122544708, + "grad_norm": 0.45964119030685785, + "learning_rate": 0.0002983879886650565, + "loss": 3.2060577869415283, + "step": 3220, + "token_acc": 0.2760493527849222 + }, + { + "epoch": 1.8883025505716797, + "grad_norm": 0.43845234300249014, + "learning_rate": 0.0002983858623171188, + "loss": 3.2169058322906494, + "step": 3221, + "token_acc": 0.27592220878142154 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.3839137522523228, + "learning_rate": 0.0002983837345752958, + "loss": 3.2282400131225586, + "step": 3222, + "token_acc": 0.27429584581652205 + }, + { + "epoch": 1.889475227206098, + "grad_norm": 0.41613139085523676, + "learning_rate": 0.0002983816054396076, + "loss": 3.2257776260375977, + "step": 3223, + "token_acc": 0.27400575193698623 + }, + { + "epoch": 1.890061565523307, + "grad_norm": 0.4291768558383752, + "learning_rate": 0.0002983794749100742, + "loss": 3.201963424682617, + "step": 3224, + "token_acc": 0.27672577192721737 + }, + { + "epoch": 1.890647903840516, + "grad_norm": 0.418901959353053, + "learning_rate": 0.0002983773429867156, + "loss": 3.200531482696533, + "step": 3225, + "token_acc": 0.2782756387042808 + }, + { + "epoch": 1.8912342421577248, + "grad_norm": 0.4224010659989229, + "learning_rate": 0.0002983752096695517, + "loss": 3.241856336593628, + "step": 3226, + "token_acc": 0.2717264969464637 + }, + { + "epoch": 1.891820580474934, + "grad_norm": 0.5014097727150821, + "learning_rate": 0.0002983730749586027, + "loss": 3.242023229598999, + "step": 3227, + "token_acc": 0.2722166714058754 + }, + { + "epoch": 1.892406918792143, + "grad_norm": 0.3852111713370862, + "learning_rate": 0.00029837093885388857, + "loss": 3.174863815307617, + "step": 3228, + "token_acc": 0.28121074401266405 + }, + { + "epoch": 1.8929932571093522, + "grad_norm": 0.41313256438738555, + "learning_rate": 0.0002983688013554294, + "loss": 3.169738292694092, + "step": 3229, + "token_acc": 0.2818856958346787 + }, + { + "epoch": 1.8935795954265613, + "grad_norm": 0.40940744932653717, + "learning_rate": 0.00029836666246324533, + "loss": 3.1308212280273438, + "step": 3230, + "token_acc": 0.2847391165172855 + }, + { + "epoch": 1.8941659337437702, + "grad_norm": 0.4274488445251595, + "learning_rate": 0.00029836452217735633, + "loss": 3.199251651763916, + "step": 3231, + "token_acc": 0.2780745544152852 + }, + { + "epoch": 1.894752272060979, + "grad_norm": 0.3986837271736455, + "learning_rate": 0.0002983623804977826, + "loss": 3.236386775970459, + "step": 3232, + "token_acc": 0.2736496271837268 + }, + { + "epoch": 1.8953386103781882, + "grad_norm": 0.4123211530595274, + "learning_rate": 0.00029836023742454423, + "loss": 3.230705738067627, + "step": 3233, + "token_acc": 0.27349479567033014 + }, + { + "epoch": 1.8959249486953973, + "grad_norm": 0.44933503811263664, + "learning_rate": 0.0002983580929576613, + "loss": 3.2067837715148926, + "step": 3234, + "token_acc": 0.2774796978006722 + }, + { + "epoch": 1.8965112870126064, + "grad_norm": 0.37673431041986927, + "learning_rate": 0.0002983559470971541, + "loss": 3.1614632606506348, + "step": 3235, + "token_acc": 0.2838031051042028 + }, + { + "epoch": 1.8970976253298153, + "grad_norm": 0.380946580472696, + "learning_rate": 0.00029835379984304255, + "loss": 3.2016046047210693, + "step": 3236, + "token_acc": 0.27655908772508137 + }, + { + "epoch": 1.8976839636470242, + "grad_norm": 0.352380810400323, + "learning_rate": 0.0002983516511953471, + "loss": 3.205845355987549, + "step": 3237, + "token_acc": 0.27607971904330697 + }, + { + "epoch": 1.8982703019642333, + "grad_norm": 0.41320997110120156, + "learning_rate": 0.00029834950115408774, + "loss": 3.1926727294921875, + "step": 3238, + "token_acc": 0.27720375429390604 + }, + { + "epoch": 1.8988566402814424, + "grad_norm": 0.42468964170473467, + "learning_rate": 0.00029834734971928464, + "loss": 3.156770706176758, + "step": 3239, + "token_acc": 0.2836733639791333 + }, + { + "epoch": 1.8994429785986515, + "grad_norm": 0.45615514219722897, + "learning_rate": 0.00029834519689095817, + "loss": 3.1967973709106445, + "step": 3240, + "token_acc": 0.27857696525370784 + }, + { + "epoch": 1.9000293169158604, + "grad_norm": 0.4887867250369316, + "learning_rate": 0.0002983430426691285, + "loss": 3.176856517791748, + "step": 3241, + "token_acc": 0.2808887638122647 + }, + { + "epoch": 1.9006156552330695, + "grad_norm": 0.4165573955246559, + "learning_rate": 0.00029834088705381584, + "loss": 3.18300724029541, + "step": 3242, + "token_acc": 0.2794832604564001 + }, + { + "epoch": 1.9012019935502784, + "grad_norm": 0.4015630854465249, + "learning_rate": 0.00029833873004504036, + "loss": 3.20864200592041, + "step": 3243, + "token_acc": 0.2760727373444691 + }, + { + "epoch": 1.9017883318674875, + "grad_norm": 0.3792470085882102, + "learning_rate": 0.00029833657164282244, + "loss": 3.2033562660217285, + "step": 3244, + "token_acc": 0.27489906760826394 + }, + { + "epoch": 1.9023746701846966, + "grad_norm": 0.43503849658118826, + "learning_rate": 0.0002983344118471823, + "loss": 3.229607582092285, + "step": 3245, + "token_acc": 0.27428279803323197 + }, + { + "epoch": 1.9029610085019057, + "grad_norm": 0.3846147936325712, + "learning_rate": 0.0002983322506581403, + "loss": 3.168962001800537, + "step": 3246, + "token_acc": 0.2839145118568745 + }, + { + "epoch": 1.9035473468191146, + "grad_norm": 0.3613047043056451, + "learning_rate": 0.0002983300880757166, + "loss": 3.160580635070801, + "step": 3247, + "token_acc": 0.2820882833017561 + }, + { + "epoch": 1.9041336851363235, + "grad_norm": 0.3745111184181826, + "learning_rate": 0.00029832792409993165, + "loss": 3.190911293029785, + "step": 3248, + "token_acc": 0.27724068155904225 + }, + { + "epoch": 1.9047200234535326, + "grad_norm": 0.34041692139170465, + "learning_rate": 0.0002983257587308057, + "loss": 3.220590829849243, + "step": 3249, + "token_acc": 0.27406374102089615 + }, + { + "epoch": 1.9053063617707418, + "grad_norm": 0.39170381131087156, + "learning_rate": 0.0002983235919683592, + "loss": 3.2262701988220215, + "step": 3250, + "token_acc": 0.27329793741788777 + }, + { + "epoch": 1.9058927000879509, + "grad_norm": 0.5122507035629825, + "learning_rate": 0.00029832142381261233, + "loss": 3.1575300693511963, + "step": 3251, + "token_acc": 0.2835954877661265 + }, + { + "epoch": 1.9064790384051598, + "grad_norm": 0.365203698763899, + "learning_rate": 0.0002983192542635856, + "loss": 3.2405402660369873, + "step": 3252, + "token_acc": 0.2734824960212339 + }, + { + "epoch": 1.9070653767223686, + "grad_norm": 0.43648747033508006, + "learning_rate": 0.00029831708332129933, + "loss": 3.2181406021118164, + "step": 3253, + "token_acc": 0.2738248025369914 + }, + { + "epoch": 1.9076517150395778, + "grad_norm": 0.43688086757076067, + "learning_rate": 0.0002983149109857739, + "loss": 3.197798490524292, + "step": 3254, + "token_acc": 0.2774641029359965 + }, + { + "epoch": 1.9082380533567869, + "grad_norm": 0.4340239944059772, + "learning_rate": 0.00029831273725702974, + "loss": 3.1855835914611816, + "step": 3255, + "token_acc": 0.277578808280375 + }, + { + "epoch": 1.908824391673996, + "grad_norm": 0.3912222676932408, + "learning_rate": 0.0002983105621350873, + "loss": 3.2201390266418457, + "step": 3256, + "token_acc": 0.27630883410157736 + }, + { + "epoch": 1.909410729991205, + "grad_norm": 0.3854329019934525, + "learning_rate": 0.00029830838561996695, + "loss": 3.1903791427612305, + "step": 3257, + "token_acc": 0.2782542537003735 + }, + { + "epoch": 1.909997068308414, + "grad_norm": 0.3862478016982444, + "learning_rate": 0.0002983062077116892, + "loss": 3.1459033489227295, + "step": 3258, + "token_acc": 0.2854167289813654 + }, + { + "epoch": 1.9105834066256229, + "grad_norm": 0.41511038361437846, + "learning_rate": 0.0002983040284102744, + "loss": 3.1794991493225098, + "step": 3259, + "token_acc": 0.27992423165679936 + }, + { + "epoch": 1.911169744942832, + "grad_norm": 0.381718857994355, + "learning_rate": 0.00029830184771574314, + "loss": 3.1398983001708984, + "step": 3260, + "token_acc": 0.2852127377429291 + }, + { + "epoch": 1.911756083260041, + "grad_norm": 0.47046937749274026, + "learning_rate": 0.00029829966562811586, + "loss": 3.1875643730163574, + "step": 3261, + "token_acc": 0.27873890776606364 + }, + { + "epoch": 1.9123424215772502, + "grad_norm": 0.4506070821078623, + "learning_rate": 0.0002982974821474131, + "loss": 3.2244222164154053, + "step": 3262, + "token_acc": 0.2755318262752897 + }, + { + "epoch": 1.912928759894459, + "grad_norm": 0.40635206044609135, + "learning_rate": 0.00029829529727365525, + "loss": 3.1653754711151123, + "step": 3263, + "token_acc": 0.28184894405285754 + }, + { + "epoch": 1.913515098211668, + "grad_norm": 0.4785599601434365, + "learning_rate": 0.000298293111006863, + "loss": 3.1700034141540527, + "step": 3264, + "token_acc": 0.28323772013969234 + }, + { + "epoch": 1.914101436528877, + "grad_norm": 0.401628713067513, + "learning_rate": 0.0002982909233470567, + "loss": 3.1526448726654053, + "step": 3265, + "token_acc": 0.2845154719336768 + }, + { + "epoch": 1.9146877748460862, + "grad_norm": 0.430457793619935, + "learning_rate": 0.0002982887342942571, + "loss": 3.205681800842285, + "step": 3266, + "token_acc": 0.2762036042984751 + }, + { + "epoch": 1.9152741131632953, + "grad_norm": 0.45014472090374535, + "learning_rate": 0.00029828654384848457, + "loss": 3.204684257507324, + "step": 3267, + "token_acc": 0.2771506610911025 + }, + { + "epoch": 1.9158604514805042, + "grad_norm": 0.414036650147306, + "learning_rate": 0.00029828435200975983, + "loss": 3.194882392883301, + "step": 3268, + "token_acc": 0.27805809137455273 + }, + { + "epoch": 1.9164467897977133, + "grad_norm": 0.49028250393990896, + "learning_rate": 0.00029828215877810343, + "loss": 3.1921753883361816, + "step": 3269, + "token_acc": 0.27922428393292514 + }, + { + "epoch": 1.9170331281149222, + "grad_norm": 0.42596667605325345, + "learning_rate": 0.0002982799641535359, + "loss": 3.180551052093506, + "step": 3270, + "token_acc": 0.2797045888733604 + }, + { + "epoch": 1.9176194664321313, + "grad_norm": 0.43981858528141105, + "learning_rate": 0.00029827776813607797, + "loss": 3.2114505767822266, + "step": 3271, + "token_acc": 0.27596880255327566 + }, + { + "epoch": 1.9182058047493404, + "grad_norm": 0.42509479073889406, + "learning_rate": 0.0002982755707257502, + "loss": 3.183439254760742, + "step": 3272, + "token_acc": 0.2810664509209556 + }, + { + "epoch": 1.9187921430665495, + "grad_norm": 0.37565389269065763, + "learning_rate": 0.00029827337192257325, + "loss": 3.1612391471862793, + "step": 3273, + "token_acc": 0.28248648906282686 + }, + { + "epoch": 1.9193784813837584, + "grad_norm": 0.3929822899700079, + "learning_rate": 0.00029827117172656777, + "loss": 3.150998592376709, + "step": 3274, + "token_acc": 0.2846092262709618 + }, + { + "epoch": 1.9199648197009673, + "grad_norm": 0.41602259410780307, + "learning_rate": 0.0002982689701377544, + "loss": 3.157850980758667, + "step": 3275, + "token_acc": 0.2827557158464191 + }, + { + "epoch": 1.9205511580181764, + "grad_norm": 0.4254472187762964, + "learning_rate": 0.0002982667671561539, + "loss": 3.2182846069335938, + "step": 3276, + "token_acc": 0.27647127495362744 + }, + { + "epoch": 1.9211374963353856, + "grad_norm": 0.41361663878700644, + "learning_rate": 0.0002982645627817869, + "loss": 3.1519813537597656, + "step": 3277, + "token_acc": 0.2839578130731237 + }, + { + "epoch": 1.9217238346525947, + "grad_norm": 0.5189396805664954, + "learning_rate": 0.00029826235701467416, + "loss": 3.195946455001831, + "step": 3278, + "token_acc": 0.2759253848633836 + }, + { + "epoch": 1.9223101729698036, + "grad_norm": 0.38126211327871623, + "learning_rate": 0.0002982601498548363, + "loss": 3.2020576000213623, + "step": 3279, + "token_acc": 0.2766937999857728 + }, + { + "epoch": 1.9228965112870124, + "grad_norm": 0.40458316481430173, + "learning_rate": 0.0002982579413022941, + "loss": 3.201653480529785, + "step": 3280, + "token_acc": 0.2793728290017085 + }, + { + "epoch": 1.9234828496042216, + "grad_norm": 0.378524823196167, + "learning_rate": 0.00029825573135706837, + "loss": 3.1889257431030273, + "step": 3281, + "token_acc": 0.2774900176456413 + }, + { + "epoch": 1.9240691879214307, + "grad_norm": 0.4303950731629008, + "learning_rate": 0.00029825352001917985, + "loss": 3.214787006378174, + "step": 3282, + "token_acc": 0.2771549985207498 + }, + { + "epoch": 1.9246555262386398, + "grad_norm": 0.39262207122526765, + "learning_rate": 0.00029825130728864925, + "loss": 3.172614097595215, + "step": 3283, + "token_acc": 0.2796875248933899 + }, + { + "epoch": 1.9252418645558487, + "grad_norm": 0.4732405221061119, + "learning_rate": 0.00029824909316549734, + "loss": 3.2121779918670654, + "step": 3284, + "token_acc": 0.2748695713943693 + }, + { + "epoch": 1.9258282028730578, + "grad_norm": 0.4579406080508642, + "learning_rate": 0.00029824687764974504, + "loss": 3.209717273712158, + "step": 3285, + "token_acc": 0.27702217708704413 + }, + { + "epoch": 1.9264145411902667, + "grad_norm": 0.42690220719481053, + "learning_rate": 0.00029824466074141305, + "loss": 3.192824363708496, + "step": 3286, + "token_acc": 0.2788978995939854 + }, + { + "epoch": 1.9270008795074758, + "grad_norm": 0.4105712300770551, + "learning_rate": 0.0002982424424405222, + "loss": 3.1622745990753174, + "step": 3287, + "token_acc": 0.28333783971796156 + }, + { + "epoch": 1.927587217824685, + "grad_norm": 0.33173920117456757, + "learning_rate": 0.0002982402227470934, + "loss": 3.164241313934326, + "step": 3288, + "token_acc": 0.2813170897063204 + }, + { + "epoch": 1.928173556141894, + "grad_norm": 0.36749657710773265, + "learning_rate": 0.0002982380016611475, + "loss": 3.133445978164673, + "step": 3289, + "token_acc": 0.28680531841661655 + }, + { + "epoch": 1.928759894459103, + "grad_norm": 0.4050672066283259, + "learning_rate": 0.0002982357791827053, + "loss": 3.232879638671875, + "step": 3290, + "token_acc": 0.27073077705451587 + }, + { + "epoch": 1.9293462327763118, + "grad_norm": 0.35628364754238656, + "learning_rate": 0.00029823355531178767, + "loss": 3.2116737365722656, + "step": 3291, + "token_acc": 0.2766563589369053 + }, + { + "epoch": 1.929932571093521, + "grad_norm": 0.3604384097230929, + "learning_rate": 0.00029823133004841556, + "loss": 3.2077221870422363, + "step": 3292, + "token_acc": 0.2761558487958786 + }, + { + "epoch": 1.93051890941073, + "grad_norm": 0.39439111226140755, + "learning_rate": 0.00029822910339260986, + "loss": 3.2032601833343506, + "step": 3293, + "token_acc": 0.27645328948820375 + }, + { + "epoch": 1.9311052477279391, + "grad_norm": 0.3750406128769394, + "learning_rate": 0.0002982268753443915, + "loss": 3.2258524894714355, + "step": 3294, + "token_acc": 0.2716833026456052 + }, + { + "epoch": 1.931691586045148, + "grad_norm": 0.412236860923718, + "learning_rate": 0.00029822464590378134, + "loss": 3.203927516937256, + "step": 3295, + "token_acc": 0.27702681059611134 + }, + { + "epoch": 1.9322779243623571, + "grad_norm": 0.49576705318650954, + "learning_rate": 0.0002982224150708004, + "loss": 3.2049429416656494, + "step": 3296, + "token_acc": 0.2765579618230108 + }, + { + "epoch": 1.932864262679566, + "grad_norm": 0.43590032710430804, + "learning_rate": 0.00029822018284546953, + "loss": 3.1903162002563477, + "step": 3297, + "token_acc": 0.2808540628854255 + }, + { + "epoch": 1.9334506009967751, + "grad_norm": 0.3856189410273088, + "learning_rate": 0.00029821794922780983, + "loss": 3.1884617805480957, + "step": 3298, + "token_acc": 0.27852309027823724 + }, + { + "epoch": 1.9340369393139842, + "grad_norm": 0.41304800711767214, + "learning_rate": 0.00029821571421784226, + "loss": 3.172560214996338, + "step": 3299, + "token_acc": 0.2810791453440056 + }, + { + "epoch": 1.9346232776311933, + "grad_norm": 0.4501092591489766, + "learning_rate": 0.00029821347781558774, + "loss": 3.182562828063965, + "step": 3300, + "token_acc": 0.280446725178703 + }, + { + "epoch": 1.9352096159484022, + "grad_norm": 0.3578776335923242, + "learning_rate": 0.00029821124002106725, + "loss": 3.213266134262085, + "step": 3301, + "token_acc": 0.27627538666034684 + }, + { + "epoch": 1.9357959542656111, + "grad_norm": 0.3337353788765908, + "learning_rate": 0.000298209000834302, + "loss": 3.1792633533477783, + "step": 3302, + "token_acc": 0.28133187481657373 + }, + { + "epoch": 1.9363822925828202, + "grad_norm": 0.34260121730734133, + "learning_rate": 0.00029820676025531283, + "loss": 3.2000203132629395, + "step": 3303, + "token_acc": 0.276902518307977 + }, + { + "epoch": 1.9369686309000294, + "grad_norm": 0.3356792226973069, + "learning_rate": 0.00029820451828412085, + "loss": 3.188465118408203, + "step": 3304, + "token_acc": 0.28093685957892356 + }, + { + "epoch": 1.9375549692172385, + "grad_norm": 0.33502387990846544, + "learning_rate": 0.00029820227492074715, + "loss": 3.1891541481018066, + "step": 3305, + "token_acc": 0.2787599008177714 + }, + { + "epoch": 1.9381413075344474, + "grad_norm": 0.3449171200822794, + "learning_rate": 0.00029820003016521276, + "loss": 3.185636043548584, + "step": 3306, + "token_acc": 0.2802411221333775 + }, + { + "epoch": 1.9387276458516562, + "grad_norm": 0.4029365558771742, + "learning_rate": 0.00029819778401753887, + "loss": 3.1883487701416016, + "step": 3307, + "token_acc": 0.2789084969547925 + }, + { + "epoch": 1.9393139841688654, + "grad_norm": 0.43275664429039784, + "learning_rate": 0.0002981955364777464, + "loss": 3.215132236480713, + "step": 3308, + "token_acc": 0.2744035546747999 + }, + { + "epoch": 1.9399003224860745, + "grad_norm": 0.4037274148022834, + "learning_rate": 0.0002981932875458566, + "loss": 3.2274036407470703, + "step": 3309, + "token_acc": 0.2732151339350046 + }, + { + "epoch": 1.9404866608032836, + "grad_norm": 0.381948067075118, + "learning_rate": 0.00029819103722189056, + "loss": 3.1984851360321045, + "step": 3310, + "token_acc": 0.2773605794588723 + }, + { + "epoch": 1.9410729991204925, + "grad_norm": 0.3524375031515061, + "learning_rate": 0.0002981887855058694, + "loss": 3.1581273078918457, + "step": 3311, + "token_acc": 0.28273383203352964 + }, + { + "epoch": 1.9416593374377016, + "grad_norm": 0.4071483241788557, + "learning_rate": 0.0002981865323978143, + "loss": 3.1918249130249023, + "step": 3312, + "token_acc": 0.27839754289911195 + }, + { + "epoch": 1.9422456757549105, + "grad_norm": 0.47921824901010524, + "learning_rate": 0.00029818427789774643, + "loss": 3.2381887435913086, + "step": 3313, + "token_acc": 0.2725154293709497 + }, + { + "epoch": 1.9428320140721196, + "grad_norm": 0.48081525741191644, + "learning_rate": 0.0002981820220056869, + "loss": 3.2318496704101562, + "step": 3314, + "token_acc": 0.2724779054071263 + }, + { + "epoch": 1.9434183523893287, + "grad_norm": 0.42311287483682924, + "learning_rate": 0.00029817976472165696, + "loss": 3.214770555496216, + "step": 3315, + "token_acc": 0.275542615841495 + }, + { + "epoch": 1.9440046907065378, + "grad_norm": 0.4478272748059361, + "learning_rate": 0.00029817750604567786, + "loss": 3.183302402496338, + "step": 3316, + "token_acc": 0.278842094018181 + }, + { + "epoch": 1.9445910290237467, + "grad_norm": 0.4786799140179171, + "learning_rate": 0.0002981752459777707, + "loss": 3.2055699825286865, + "step": 3317, + "token_acc": 0.2741945618196384 + }, + { + "epoch": 1.9451773673409556, + "grad_norm": 0.4286479259847441, + "learning_rate": 0.00029817298451795683, + "loss": 3.2112278938293457, + "step": 3318, + "token_acc": 0.27629442897303574 + }, + { + "epoch": 1.9457637056581647, + "grad_norm": 0.5145411571997006, + "learning_rate": 0.0002981707216662574, + "loss": 3.1898062229156494, + "step": 3319, + "token_acc": 0.28030057494900085 + }, + { + "epoch": 1.9463500439753738, + "grad_norm": 0.4235195803927868, + "learning_rate": 0.0002981684574226937, + "loss": 3.198345422744751, + "step": 3320, + "token_acc": 0.27767898798970925 + }, + { + "epoch": 1.946936382292583, + "grad_norm": 0.3775682745066809, + "learning_rate": 0.000298166191787287, + "loss": 3.1314826011657715, + "step": 3321, + "token_acc": 0.2866647184882135 + }, + { + "epoch": 1.9475227206097918, + "grad_norm": 0.36721432278091465, + "learning_rate": 0.00029816392476005857, + "loss": 3.194535493850708, + "step": 3322, + "token_acc": 0.2770400516899781 + }, + { + "epoch": 1.948109058927001, + "grad_norm": 0.38883516635987747, + "learning_rate": 0.0002981616563410298, + "loss": 3.1548545360565186, + "step": 3323, + "token_acc": 0.2839111224947173 + }, + { + "epoch": 1.9486953972442098, + "grad_norm": 0.37379266958737695, + "learning_rate": 0.0002981593865302218, + "loss": 3.2424850463867188, + "step": 3324, + "token_acc": 0.2710454380263664 + }, + { + "epoch": 1.949281735561419, + "grad_norm": 0.39466643921567207, + "learning_rate": 0.00029815711532765613, + "loss": 3.14503812789917, + "step": 3325, + "token_acc": 0.2857516894685202 + }, + { + "epoch": 1.949868073878628, + "grad_norm": 0.4410454586777076, + "learning_rate": 0.00029815484273335393, + "loss": 3.2125816345214844, + "step": 3326, + "token_acc": 0.2750875752525975 + }, + { + "epoch": 1.9504544121958372, + "grad_norm": 0.3818890148236516, + "learning_rate": 0.0002981525687473366, + "loss": 3.1598093509674072, + "step": 3327, + "token_acc": 0.28131851296166177 + }, + { + "epoch": 1.951040750513046, + "grad_norm": 0.37253793323580653, + "learning_rate": 0.0002981502933696256, + "loss": 3.196945905685425, + "step": 3328, + "token_acc": 0.27836811183144244 + }, + { + "epoch": 1.951627088830255, + "grad_norm": 0.4533250080495823, + "learning_rate": 0.0002981480166002422, + "loss": 3.200519561767578, + "step": 3329, + "token_acc": 0.27938840140563515 + }, + { + "epoch": 1.952213427147464, + "grad_norm": 0.41884550768654216, + "learning_rate": 0.00029814573843920777, + "loss": 3.2075729370117188, + "step": 3330, + "token_acc": 0.2754437620473002 + }, + { + "epoch": 1.9527997654646732, + "grad_norm": 0.4321080278107562, + "learning_rate": 0.00029814345888654384, + "loss": 3.2020134925842285, + "step": 3331, + "token_acc": 0.2760553389562849 + }, + { + "epoch": 1.9533861037818823, + "grad_norm": 0.44580260643052616, + "learning_rate": 0.0002981411779422717, + "loss": 3.144413709640503, + "step": 3332, + "token_acc": 0.28522076418474246 + }, + { + "epoch": 1.9539724420990912, + "grad_norm": 0.37774152281232387, + "learning_rate": 0.0002981388956064128, + "loss": 3.1753480434417725, + "step": 3333, + "token_acc": 0.28009258042990526 + }, + { + "epoch": 1.9545587804163, + "grad_norm": 0.4390593345767438, + "learning_rate": 0.0002981366118789886, + "loss": 3.1932919025421143, + "step": 3334, + "token_acc": 0.27945905074199767 + }, + { + "epoch": 1.9551451187335092, + "grad_norm": 0.44904778864841616, + "learning_rate": 0.0002981343267600206, + "loss": 3.2144229412078857, + "step": 3335, + "token_acc": 0.27534977680416634 + }, + { + "epoch": 1.9557314570507183, + "grad_norm": 0.38132758654801113, + "learning_rate": 0.00029813204024953016, + "loss": 3.2166664600372314, + "step": 3336, + "token_acc": 0.27556653145833065 + }, + { + "epoch": 1.9563177953679274, + "grad_norm": 0.3791002976992549, + "learning_rate": 0.0002981297523475388, + "loss": 3.217221736907959, + "step": 3337, + "token_acc": 0.27372611964489246 + }, + { + "epoch": 1.9569041336851363, + "grad_norm": 0.4458679749297398, + "learning_rate": 0.00029812746305406804, + "loss": 3.2028467655181885, + "step": 3338, + "token_acc": 0.27645650481219974 + }, + { + "epoch": 1.9574904720023454, + "grad_norm": 0.42027871517592674, + "learning_rate": 0.0002981251723691394, + "loss": 3.1748695373535156, + "step": 3339, + "token_acc": 0.2797402847867314 + }, + { + "epoch": 1.9580768103195543, + "grad_norm": 0.4033260705541706, + "learning_rate": 0.00029812288029277433, + "loss": 3.2126588821411133, + "step": 3340, + "token_acc": 0.2739816513761468 + }, + { + "epoch": 1.9586631486367634, + "grad_norm": 0.4118000596195956, + "learning_rate": 0.00029812058682499444, + "loss": 3.216691732406616, + "step": 3341, + "token_acc": 0.27443571379635634 + }, + { + "epoch": 1.9592494869539725, + "grad_norm": 0.38530467116516487, + "learning_rate": 0.0002981182919658212, + "loss": 3.2303221225738525, + "step": 3342, + "token_acc": 0.2746282109058134 + }, + { + "epoch": 1.9598358252711816, + "grad_norm": 0.3900969240197085, + "learning_rate": 0.0002981159957152762, + "loss": 3.206285238265991, + "step": 3343, + "token_acc": 0.27562706518474017 + }, + { + "epoch": 1.9604221635883905, + "grad_norm": 0.45661259972418955, + "learning_rate": 0.000298113698073381, + "loss": 3.2184529304504395, + "step": 3344, + "token_acc": 0.2744780586493138 + }, + { + "epoch": 1.9610085019055994, + "grad_norm": 0.44203255455665885, + "learning_rate": 0.0002981113990401572, + "loss": 3.217252731323242, + "step": 3345, + "token_acc": 0.2757837846330293 + }, + { + "epoch": 1.9615948402228085, + "grad_norm": 0.4051769473174531, + "learning_rate": 0.0002981090986156264, + "loss": 3.192514419555664, + "step": 3346, + "token_acc": 0.2765101901817991 + }, + { + "epoch": 1.9621811785400176, + "grad_norm": 0.3931895936801137, + "learning_rate": 0.0002981067967998102, + "loss": 3.1989264488220215, + "step": 3347, + "token_acc": 0.27841162161236166 + }, + { + "epoch": 1.9627675168572267, + "grad_norm": 0.394766761969608, + "learning_rate": 0.0002981044935927302, + "loss": 3.155118465423584, + "step": 3348, + "token_acc": 0.2828657712787094 + }, + { + "epoch": 1.9633538551744356, + "grad_norm": 0.3704907512293779, + "learning_rate": 0.00029810218899440803, + "loss": 3.176358222961426, + "step": 3349, + "token_acc": 0.28128635270214075 + }, + { + "epoch": 1.9639401934916447, + "grad_norm": 0.31508603859403145, + "learning_rate": 0.0002980998830048654, + "loss": 3.235736846923828, + "step": 3350, + "token_acc": 0.2717218084407978 + }, + { + "epoch": 1.9645265318088536, + "grad_norm": 0.38380912812083057, + "learning_rate": 0.000298097575624124, + "loss": 3.1732888221740723, + "step": 3351, + "token_acc": 0.2807628209185073 + }, + { + "epoch": 1.9651128701260627, + "grad_norm": 0.3662293069653066, + "learning_rate": 0.00029809526685220533, + "loss": 3.195889472961426, + "step": 3352, + "token_acc": 0.27930063121942655 + }, + { + "epoch": 1.9656992084432718, + "grad_norm": 0.38151433491931575, + "learning_rate": 0.00029809295668913125, + "loss": 3.1938557624816895, + "step": 3353, + "token_acc": 0.2784071738855384 + }, + { + "epoch": 1.966285546760481, + "grad_norm": 0.3427815865251826, + "learning_rate": 0.0002980906451349234, + "loss": 3.2070651054382324, + "step": 3354, + "token_acc": 0.27608612760394374 + }, + { + "epoch": 1.9668718850776898, + "grad_norm": 0.3800055243023331, + "learning_rate": 0.00029808833218960347, + "loss": 3.1367244720458984, + "step": 3355, + "token_acc": 0.28626813563312237 + }, + { + "epoch": 1.9674582233948987, + "grad_norm": 0.36934742439101065, + "learning_rate": 0.00029808601785319324, + "loss": 3.1776466369628906, + "step": 3356, + "token_acc": 0.28181682147094345 + }, + { + "epoch": 1.9680445617121078, + "grad_norm": 0.3369324288371547, + "learning_rate": 0.0002980837021257144, + "loss": 3.193371534347534, + "step": 3357, + "token_acc": 0.27857620231507474 + }, + { + "epoch": 1.968630900029317, + "grad_norm": 0.3857452494345077, + "learning_rate": 0.00029808138500718874, + "loss": 3.1710965633392334, + "step": 3358, + "token_acc": 0.2818963260884776 + }, + { + "epoch": 1.969217238346526, + "grad_norm": 0.3884919844665951, + "learning_rate": 0.000298079066497638, + "loss": 3.1391916275024414, + "step": 3359, + "token_acc": 0.2854416893704085 + }, + { + "epoch": 1.969803576663735, + "grad_norm": 0.3407201310779719, + "learning_rate": 0.0002980767465970839, + "loss": 3.1727020740509033, + "step": 3360, + "token_acc": 0.2810303719748796 + }, + { + "epoch": 1.9703899149809438, + "grad_norm": 0.38151157714908196, + "learning_rate": 0.0002980744253055484, + "loss": 3.2263922691345215, + "step": 3361, + "token_acc": 0.27155195908881435 + }, + { + "epoch": 1.970976253298153, + "grad_norm": 0.3834518965681551, + "learning_rate": 0.0002980721026230532, + "loss": 3.222538709640503, + "step": 3362, + "token_acc": 0.27440405680593083 + }, + { + "epoch": 1.971562591615362, + "grad_norm": 0.371486825197991, + "learning_rate": 0.0002980697785496201, + "loss": 3.2115392684936523, + "step": 3363, + "token_acc": 0.27605826360477786 + }, + { + "epoch": 1.9721489299325712, + "grad_norm": 0.38852705969961343, + "learning_rate": 0.0002980674530852709, + "loss": 3.1961581707000732, + "step": 3364, + "token_acc": 0.27628800733276226 + }, + { + "epoch": 1.97273526824978, + "grad_norm": 0.3547550976883598, + "learning_rate": 0.0002980651262300276, + "loss": 3.155595064163208, + "step": 3365, + "token_acc": 0.28347993827160495 + }, + { + "epoch": 1.9733216065669892, + "grad_norm": 0.38286935170102565, + "learning_rate": 0.0002980627979839119, + "loss": 3.1769320964813232, + "step": 3366, + "token_acc": 0.2826406381192275 + }, + { + "epoch": 1.973907944884198, + "grad_norm": 0.4275581205098955, + "learning_rate": 0.00029806046834694575, + "loss": 3.2086310386657715, + "step": 3367, + "token_acc": 0.2752268404311433 + }, + { + "epoch": 1.9744942832014072, + "grad_norm": 0.41196820554876673, + "learning_rate": 0.00029805813731915103, + "loss": 3.240220308303833, + "step": 3368, + "token_acc": 0.2713572482766086 + }, + { + "epoch": 1.9750806215186163, + "grad_norm": 0.410749692762219, + "learning_rate": 0.00029805580490054956, + "loss": 3.1351304054260254, + "step": 3369, + "token_acc": 0.28883281792764554 + }, + { + "epoch": 1.9756669598358254, + "grad_norm": 0.36825549015755243, + "learning_rate": 0.00029805347109116337, + "loss": 3.2013583183288574, + "step": 3370, + "token_acc": 0.2772010468839261 + }, + { + "epoch": 1.9762532981530343, + "grad_norm": 0.36624203603685684, + "learning_rate": 0.0002980511358910143, + "loss": 3.210986614227295, + "step": 3371, + "token_acc": 0.27531358071786016 + }, + { + "epoch": 1.9768396364702432, + "grad_norm": 0.3912545071492538, + "learning_rate": 0.00029804879930012433, + "loss": 3.1605324745178223, + "step": 3372, + "token_acc": 0.2837726091231095 + }, + { + "epoch": 1.9774259747874523, + "grad_norm": 0.4385274472201182, + "learning_rate": 0.0002980464613185154, + "loss": 3.217862606048584, + "step": 3373, + "token_acc": 0.2753342941058431 + }, + { + "epoch": 1.9780123131046614, + "grad_norm": 0.4710715067931232, + "learning_rate": 0.0002980441219462094, + "loss": 3.2124388217926025, + "step": 3374, + "token_acc": 0.27557038687568974 + }, + { + "epoch": 1.9785986514218705, + "grad_norm": 0.5229571340675347, + "learning_rate": 0.00029804178118322843, + "loss": 3.205665111541748, + "step": 3375, + "token_acc": 0.2775821269135077 + }, + { + "epoch": 1.9791849897390794, + "grad_norm": 0.47597492712186795, + "learning_rate": 0.0002980394390295944, + "loss": 3.217750310897827, + "step": 3376, + "token_acc": 0.27510263450522 + }, + { + "epoch": 1.9797713280562885, + "grad_norm": 0.3815455478629293, + "learning_rate": 0.0002980370954853293, + "loss": 3.226161241531372, + "step": 3377, + "token_acc": 0.2727554870432166 + }, + { + "epoch": 1.9803576663734974, + "grad_norm": 0.42996653121034245, + "learning_rate": 0.00029803475055045515, + "loss": 3.20371150970459, + "step": 3378, + "token_acc": 0.2760148559852479 + }, + { + "epoch": 1.9809440046907065, + "grad_norm": 0.4348522933684916, + "learning_rate": 0.000298032404224994, + "loss": 3.2051711082458496, + "step": 3379, + "token_acc": 0.2777285308537005 + }, + { + "epoch": 1.9815303430079156, + "grad_norm": 0.3832950545575956, + "learning_rate": 0.000298030056508968, + "loss": 3.1846365928649902, + "step": 3380, + "token_acc": 0.27835558753200584 + }, + { + "epoch": 1.9821166813251248, + "grad_norm": 0.4334540998076748, + "learning_rate": 0.00029802770740239894, + "loss": 3.17218017578125, + "step": 3381, + "token_acc": 0.2805139464150395 + }, + { + "epoch": 1.9827030196423336, + "grad_norm": 0.4369274971382508, + "learning_rate": 0.0002980253569053091, + "loss": 3.1959147453308105, + "step": 3382, + "token_acc": 0.2765149886842072 + }, + { + "epoch": 1.9832893579595425, + "grad_norm": 0.39196166238801156, + "learning_rate": 0.0002980230050177206, + "loss": 3.157712697982788, + "step": 3383, + "token_acc": 0.28273083462704224 + }, + { + "epoch": 1.9838756962767516, + "grad_norm": 0.3906006710629176, + "learning_rate": 0.0002980206517396553, + "loss": 3.207461357116699, + "step": 3384, + "token_acc": 0.27782520805167105 + }, + { + "epoch": 1.9844620345939608, + "grad_norm": 0.4663236834899523, + "learning_rate": 0.0002980182970711355, + "loss": 3.160243511199951, + "step": 3385, + "token_acc": 0.2823008713794796 + }, + { + "epoch": 1.9850483729111699, + "grad_norm": 0.4135120842634517, + "learning_rate": 0.0002980159410121832, + "loss": 3.204855442047119, + "step": 3386, + "token_acc": 0.2759180727239917 + }, + { + "epoch": 1.9856347112283788, + "grad_norm": 0.3723443810245823, + "learning_rate": 0.0002980135835628206, + "loss": 3.185324192047119, + "step": 3387, + "token_acc": 0.27974397014514474 + }, + { + "epoch": 1.9862210495455876, + "grad_norm": 0.3804048246392445, + "learning_rate": 0.0002980112247230699, + "loss": 3.16774845123291, + "step": 3388, + "token_acc": 0.2826880053713947 + }, + { + "epoch": 1.9868073878627968, + "grad_norm": 0.3726620750949641, + "learning_rate": 0.00029800886449295313, + "loss": 3.1935040950775146, + "step": 3389, + "token_acc": 0.2784573073233131 + }, + { + "epoch": 1.9873937261800059, + "grad_norm": 0.40202464782262426, + "learning_rate": 0.00029800650287249253, + "loss": 3.1757564544677734, + "step": 3390, + "token_acc": 0.2800574037834312 + }, + { + "epoch": 1.987980064497215, + "grad_norm": 0.3771190893082877, + "learning_rate": 0.0002980041398617103, + "loss": 3.186380386352539, + "step": 3391, + "token_acc": 0.27781333812957254 + }, + { + "epoch": 1.9885664028144239, + "grad_norm": 0.36412645512417874, + "learning_rate": 0.0002980017754606286, + "loss": 3.173776626586914, + "step": 3392, + "token_acc": 0.2796649521813379 + }, + { + "epoch": 1.989152741131633, + "grad_norm": 0.4199477875595848, + "learning_rate": 0.0002979994096692696, + "loss": 3.2383148670196533, + "step": 3393, + "token_acc": 0.2728222961241958 + }, + { + "epoch": 1.9897390794488419, + "grad_norm": 0.41224534750998254, + "learning_rate": 0.0002979970424876557, + "loss": 3.222076892852783, + "step": 3394, + "token_acc": 0.2737492030468776 + }, + { + "epoch": 1.990325417766051, + "grad_norm": 0.39633262701575256, + "learning_rate": 0.0002979946739158089, + "loss": 3.1951208114624023, + "step": 3395, + "token_acc": 0.27740279567303383 + }, + { + "epoch": 1.99091175608326, + "grad_norm": 0.38435878069984414, + "learning_rate": 0.00029799230395375167, + "loss": 3.2123942375183105, + "step": 3396, + "token_acc": 0.27582848045582137 + }, + { + "epoch": 1.9914980944004692, + "grad_norm": 0.36975804109065546, + "learning_rate": 0.0002979899326015061, + "loss": 3.1646666526794434, + "step": 3397, + "token_acc": 0.28235180768068996 + }, + { + "epoch": 1.992084432717678, + "grad_norm": 0.3654960051140184, + "learning_rate": 0.0002979875598590945, + "loss": 3.1928539276123047, + "step": 3398, + "token_acc": 0.28084030710775465 + }, + { + "epoch": 1.992670771034887, + "grad_norm": 0.3178800950094833, + "learning_rate": 0.00029798518572653925, + "loss": 3.229656457901001, + "step": 3399, + "token_acc": 0.2739833651086296 + }, + { + "epoch": 1.993257109352096, + "grad_norm": 0.4168841050810552, + "learning_rate": 0.0002979828102038625, + "loss": 3.230128765106201, + "step": 3400, + "token_acc": 0.27231527295976665 + }, + { + "epoch": 1.9938434476693052, + "grad_norm": 0.4060076991545318, + "learning_rate": 0.0002979804332910868, + "loss": 3.209616184234619, + "step": 3401, + "token_acc": 0.27532426918061975 + }, + { + "epoch": 1.9944297859865143, + "grad_norm": 0.3474952930150825, + "learning_rate": 0.0002979780549882343, + "loss": 3.225494861602783, + "step": 3402, + "token_acc": 0.2751629178873053 + }, + { + "epoch": 1.9950161243037232, + "grad_norm": 0.37390638891861583, + "learning_rate": 0.00029797567529532727, + "loss": 3.2129006385803223, + "step": 3403, + "token_acc": 0.273763058202273 + }, + { + "epoch": 1.9956024626209323, + "grad_norm": 0.39394712142879085, + "learning_rate": 0.00029797329421238827, + "loss": 3.193532943725586, + "step": 3404, + "token_acc": 0.27796589233879954 + }, + { + "epoch": 1.9961888009381412, + "grad_norm": 0.35348550773198245, + "learning_rate": 0.00029797091173943953, + "loss": 3.210447311401367, + "step": 3405, + "token_acc": 0.27487725174403116 + }, + { + "epoch": 1.9967751392553503, + "grad_norm": 0.3317223774491301, + "learning_rate": 0.0002979685278765035, + "loss": 3.192009687423706, + "step": 3406, + "token_acc": 0.2767474772497872 + }, + { + "epoch": 1.9973614775725594, + "grad_norm": 0.4037863277658947, + "learning_rate": 0.0002979661426236025, + "loss": 3.1939737796783447, + "step": 3407, + "token_acc": 0.27676198337982183 + }, + { + "epoch": 1.9979478158897686, + "grad_norm": 0.43014307105734145, + "learning_rate": 0.000297963755980759, + "loss": 3.196213960647583, + "step": 3408, + "token_acc": 0.27807366427959423 + }, + { + "epoch": 1.9985341542069774, + "grad_norm": 0.43607282692039373, + "learning_rate": 0.0002979613679479954, + "loss": 3.1638436317443848, + "step": 3409, + "token_acc": 0.28304777413816656 + }, + { + "epoch": 1.9991204925241863, + "grad_norm": 0.40025565446754036, + "learning_rate": 0.00029795897852533413, + "loss": 3.179595947265625, + "step": 3410, + "token_acc": 0.28001762029342087 + }, + { + "epoch": 1.9997068308413954, + "grad_norm": 0.38469475673173525, + "learning_rate": 0.0002979565877127976, + "loss": 3.179605484008789, + "step": 3411, + "token_acc": 0.2783551831672339 + }, + { + "epoch": 2.0, + "grad_norm": 0.3772237471439729, + "learning_rate": 0.00029795419551040833, + "loss": 3.1722822189331055, + "step": 3412, + "token_acc": 0.2822865944549339 + }, + { + "epoch": 2.0, + "eval_loss": 3.1689116954803467, + "eval_runtime": 8.8282, + "eval_samples_per_second": 28.998, + "eval_steps_per_second": 3.625, + "eval_token_acc": 0.2808737831853162, + "step": 3412 + }, + { + "epoch": 2.000586338317209, + "grad_norm": 0.4693268402300224, + "learning_rate": 0.0002979518019181888, + "loss": 3.128814935684204, + "step": 3413, + "token_acc": 0.2850745210087439 + }, + { + "epoch": 2.0011726766344182, + "grad_norm": 0.4494293147080739, + "learning_rate": 0.00029794940693616135, + "loss": 3.1643800735473633, + "step": 3414, + "token_acc": 0.280364862164351 + }, + { + "epoch": 2.001759014951627, + "grad_norm": 0.4999891350411283, + "learning_rate": 0.00029794701056434867, + "loss": 3.212587356567383, + "step": 3415, + "token_acc": 0.2754552227610109 + }, + { + "epoch": 2.002345353268836, + "grad_norm": 0.42985230431226545, + "learning_rate": 0.00029794461280277317, + "loss": 3.1644279956817627, + "step": 3416, + "token_acc": 0.27951792664306757 + }, + { + "epoch": 2.002931691586045, + "grad_norm": 0.4936059455030206, + "learning_rate": 0.0002979422136514574, + "loss": 3.149993419647217, + "step": 3417, + "token_acc": 0.28243212911779597 + }, + { + "epoch": 2.0035180299032542, + "grad_norm": 0.4265801067684844, + "learning_rate": 0.0002979398131104239, + "loss": 3.1089510917663574, + "step": 3418, + "token_acc": 0.2888276955438235 + }, + { + "epoch": 2.0041043682204633, + "grad_norm": 0.4357110941176869, + "learning_rate": 0.0002979374111796952, + "loss": 3.1907286643981934, + "step": 3419, + "token_acc": 0.27735494094595753 + }, + { + "epoch": 2.0046907065376725, + "grad_norm": 0.39912122899714053, + "learning_rate": 0.0002979350078592938, + "loss": 3.149362802505493, + "step": 3420, + "token_acc": 0.2815598010441765 + }, + { + "epoch": 2.005277044854881, + "grad_norm": 0.36255386799156597, + "learning_rate": 0.00029793260314924246, + "loss": 3.1551661491394043, + "step": 3421, + "token_acc": 0.2817579707217503 + }, + { + "epoch": 2.0058633831720902, + "grad_norm": 0.36632928426957834, + "learning_rate": 0.0002979301970495636, + "loss": 3.165836811065674, + "step": 3422, + "token_acc": 0.2825132746155687 + }, + { + "epoch": 2.0064497214892993, + "grad_norm": 0.41413484301211717, + "learning_rate": 0.00029792778956027986, + "loss": 3.0863258838653564, + "step": 3423, + "token_acc": 0.29070250648135487 + }, + { + "epoch": 2.0070360598065085, + "grad_norm": 0.3964880212843554, + "learning_rate": 0.00029792538068141385, + "loss": 3.1333117485046387, + "step": 3424, + "token_acc": 0.2848141383755909 + }, + { + "epoch": 2.0076223981237176, + "grad_norm": 0.4106567877032098, + "learning_rate": 0.00029792297041298825, + "loss": 3.1283042430877686, + "step": 3425, + "token_acc": 0.28598961725149824 + }, + { + "epoch": 2.0082087364409262, + "grad_norm": 0.4678643064897086, + "learning_rate": 0.0002979205587550257, + "loss": 3.137134075164795, + "step": 3426, + "token_acc": 0.2844349198595474 + }, + { + "epoch": 2.0087950747581353, + "grad_norm": 0.4084393550899194, + "learning_rate": 0.0002979181457075488, + "loss": 3.150017738342285, + "step": 3427, + "token_acc": 0.2829810991537732 + }, + { + "epoch": 2.0093814130753445, + "grad_norm": 0.4209590751654001, + "learning_rate": 0.0002979157312705803, + "loss": 3.137946367263794, + "step": 3428, + "token_acc": 0.2841458576804726 + }, + { + "epoch": 2.0099677513925536, + "grad_norm": 0.4500839660224752, + "learning_rate": 0.0002979133154441427, + "loss": 3.1342806816101074, + "step": 3429, + "token_acc": 0.28405961149247816 + }, + { + "epoch": 2.0105540897097627, + "grad_norm": 0.40353803071576827, + "learning_rate": 0.0002979108982282589, + "loss": 3.104767322540283, + "step": 3430, + "token_acc": 0.28898804174838966 + }, + { + "epoch": 2.0111404280269713, + "grad_norm": 0.3597226941894105, + "learning_rate": 0.00029790847962295154, + "loss": 3.098072052001953, + "step": 3431, + "token_acc": 0.2889789353695289 + }, + { + "epoch": 2.0117267663441805, + "grad_norm": 0.3730829068510743, + "learning_rate": 0.0002979060596282433, + "loss": 3.1085596084594727, + "step": 3432, + "token_acc": 0.2870292568382306 + }, + { + "epoch": 2.0123131046613896, + "grad_norm": 0.38421643536287337, + "learning_rate": 0.00029790363824415693, + "loss": 3.1333045959472656, + "step": 3433, + "token_acc": 0.28487105971253973 + }, + { + "epoch": 2.0128994429785987, + "grad_norm": 0.4015381665042725, + "learning_rate": 0.00029790121547071516, + "loss": 3.1151533126831055, + "step": 3434, + "token_acc": 0.28771270951505284 + }, + { + "epoch": 2.013485781295808, + "grad_norm": 0.3908685556457136, + "learning_rate": 0.0002978987913079408, + "loss": 3.1296937465667725, + "step": 3435, + "token_acc": 0.2865242176290379 + }, + { + "epoch": 2.014072119613017, + "grad_norm": 0.362248717817106, + "learning_rate": 0.00029789636575585656, + "loss": 3.1299726963043213, + "step": 3436, + "token_acc": 0.2862445403203012 + }, + { + "epoch": 2.0146584579302256, + "grad_norm": 0.4121208915781683, + "learning_rate": 0.00029789393881448533, + "loss": 3.107898712158203, + "step": 3437, + "token_acc": 0.28749310300921016 + }, + { + "epoch": 2.0152447962474347, + "grad_norm": 0.3651512223553721, + "learning_rate": 0.0002978915104838498, + "loss": 3.1158671379089355, + "step": 3438, + "token_acc": 0.2862957242495729 + }, + { + "epoch": 2.015831134564644, + "grad_norm": 0.3667427697512673, + "learning_rate": 0.0002978890807639728, + "loss": 3.1370677947998047, + "step": 3439, + "token_acc": 0.282250969728476 + }, + { + "epoch": 2.016417472881853, + "grad_norm": 0.42623514841764887, + "learning_rate": 0.0002978866496548771, + "loss": 3.1452929973602295, + "step": 3440, + "token_acc": 0.28459368827702103 + }, + { + "epoch": 2.017003811199062, + "grad_norm": 0.36351875555823093, + "learning_rate": 0.00029788421715658573, + "loss": 3.132627010345459, + "step": 3441, + "token_acc": 0.2848018845451417 + }, + { + "epoch": 2.0175901495162707, + "grad_norm": 0.37883952206581556, + "learning_rate": 0.00029788178326912133, + "loss": 3.1542673110961914, + "step": 3442, + "token_acc": 0.2834036474504359 + }, + { + "epoch": 2.01817648783348, + "grad_norm": 0.39772390047638145, + "learning_rate": 0.00029787934799250685, + "loss": 3.2037856578826904, + "step": 3443, + "token_acc": 0.2734826375180308 + }, + { + "epoch": 2.018762826150689, + "grad_norm": 0.369383127275441, + "learning_rate": 0.0002978769113267652, + "loss": 3.159280776977539, + "step": 3444, + "token_acc": 0.28142865057898525 + }, + { + "epoch": 2.019349164467898, + "grad_norm": 0.3853129161257599, + "learning_rate": 0.00029787447327191927, + "loss": 3.1549551486968994, + "step": 3445, + "token_acc": 0.28148429513571116 + }, + { + "epoch": 2.019935502785107, + "grad_norm": 0.35432230549810184, + "learning_rate": 0.0002978720338279919, + "loss": 3.148341655731201, + "step": 3446, + "token_acc": 0.2831590088572573 + }, + { + "epoch": 2.0205218411023163, + "grad_norm": 0.3794908010657576, + "learning_rate": 0.00029786959299500605, + "loss": 3.121584892272949, + "step": 3447, + "token_acc": 0.28644777914064323 + }, + { + "epoch": 2.021108179419525, + "grad_norm": 0.38514720598065894, + "learning_rate": 0.00029786715077298454, + "loss": 3.09490966796875, + "step": 3448, + "token_acc": 0.2911864795625741 + }, + { + "epoch": 2.021694517736734, + "grad_norm": 0.33695839204147454, + "learning_rate": 0.0002978647071619505, + "loss": 3.096996545791626, + "step": 3449, + "token_acc": 0.2910687147342017 + }, + { + "epoch": 2.022280856053943, + "grad_norm": 0.34468055695152805, + "learning_rate": 0.00029786226216192675, + "loss": 3.132293462753296, + "step": 3450, + "token_acc": 0.28298106527484007 + }, + { + "epoch": 2.0228671943711523, + "grad_norm": 0.38124693863745074, + "learning_rate": 0.00029785981577293627, + "loss": 3.1362264156341553, + "step": 3451, + "token_acc": 0.2824507392337959 + }, + { + "epoch": 2.0234535326883614, + "grad_norm": 0.382586837702508, + "learning_rate": 0.00029785736799500215, + "loss": 3.1886556148529053, + "step": 3452, + "token_acc": 0.2787853441319425 + }, + { + "epoch": 2.02403987100557, + "grad_norm": 0.38038669886498655, + "learning_rate": 0.0002978549188281472, + "loss": 3.0679826736450195, + "step": 3453, + "token_acc": 0.29393952216480396 + }, + { + "epoch": 2.024626209322779, + "grad_norm": 0.38004811148090367, + "learning_rate": 0.00029785246827239453, + "loss": 3.194671154022217, + "step": 3454, + "token_acc": 0.27638870389312076 + }, + { + "epoch": 2.0252125476399883, + "grad_norm": 0.3711478235635344, + "learning_rate": 0.0002978500163277672, + "loss": 3.1409754753112793, + "step": 3455, + "token_acc": 0.28372774923452176 + }, + { + "epoch": 2.0257988859571974, + "grad_norm": 0.36493214590936185, + "learning_rate": 0.0002978475629942882, + "loss": 3.098780870437622, + "step": 3456, + "token_acc": 0.29087998555834005 + }, + { + "epoch": 2.0263852242744065, + "grad_norm": 0.36195177990017, + "learning_rate": 0.00029784510827198055, + "loss": 3.1051435470581055, + "step": 3457, + "token_acc": 0.2887861780565643 + }, + { + "epoch": 2.026971562591615, + "grad_norm": 0.34780890327243913, + "learning_rate": 0.00029784265216086734, + "loss": 3.148003339767456, + "step": 3458, + "token_acc": 0.2812387883422328 + }, + { + "epoch": 2.0275579009088243, + "grad_norm": 0.3727664772732618, + "learning_rate": 0.0002978401946609716, + "loss": 3.0691351890563965, + "step": 3459, + "token_acc": 0.29365884240222745 + }, + { + "epoch": 2.0281442392260334, + "grad_norm": 0.3855725913060223, + "learning_rate": 0.0002978377357723165, + "loss": 3.159719467163086, + "step": 3460, + "token_acc": 0.2826157324153522 + }, + { + "epoch": 2.0287305775432425, + "grad_norm": 0.36752659830781986, + "learning_rate": 0.00029783527549492503, + "loss": 3.137112855911255, + "step": 3461, + "token_acc": 0.2822810288137503 + }, + { + "epoch": 2.0293169158604516, + "grad_norm": 0.39319522138372287, + "learning_rate": 0.0002978328138288204, + "loss": 3.1231744289398193, + "step": 3462, + "token_acc": 0.2861779978009375 + }, + { + "epoch": 2.0299032541776607, + "grad_norm": 0.3241839013696892, + "learning_rate": 0.0002978303507740257, + "loss": 3.1348609924316406, + "step": 3463, + "token_acc": 0.28654076199146755 + }, + { + "epoch": 2.0304895924948694, + "grad_norm": 0.329587503257926, + "learning_rate": 0.0002978278863305641, + "loss": 3.122725009918213, + "step": 3464, + "token_acc": 0.2869753751611894 + }, + { + "epoch": 2.0310759308120785, + "grad_norm": 0.3552415486900838, + "learning_rate": 0.0002978254204984586, + "loss": 3.135404109954834, + "step": 3465, + "token_acc": 0.2851585716503981 + }, + { + "epoch": 2.0316622691292876, + "grad_norm": 0.3374796323262722, + "learning_rate": 0.0002978229532777325, + "loss": 3.1368470191955566, + "step": 3466, + "token_acc": 0.2850211399999471 + }, + { + "epoch": 2.0322486074464967, + "grad_norm": 0.42808961517240873, + "learning_rate": 0.000297820484668409, + "loss": 3.1344170570373535, + "step": 3467, + "token_acc": 0.2839436894625125 + }, + { + "epoch": 2.032834945763706, + "grad_norm": 0.4285950097165144, + "learning_rate": 0.0002978180146705112, + "loss": 3.162283420562744, + "step": 3468, + "token_acc": 0.27864230359341535 + }, + { + "epoch": 2.0334212840809145, + "grad_norm": 0.3942104733731069, + "learning_rate": 0.00029781554328406237, + "loss": 3.1488261222839355, + "step": 3469, + "token_acc": 0.2823741909588079 + }, + { + "epoch": 2.0340076223981236, + "grad_norm": 0.4275392574220518, + "learning_rate": 0.0002978130705090857, + "loss": 3.1416211128234863, + "step": 3470, + "token_acc": 0.28273339300307426 + }, + { + "epoch": 2.0345939607153327, + "grad_norm": 0.48542663274783737, + "learning_rate": 0.0002978105963456043, + "loss": 3.1221628189086914, + "step": 3471, + "token_acc": 0.28567592650524515 + }, + { + "epoch": 2.035180299032542, + "grad_norm": 0.4540814159435083, + "learning_rate": 0.00029780812079364163, + "loss": 3.1432619094848633, + "step": 3472, + "token_acc": 0.2830303683133356 + }, + { + "epoch": 2.035766637349751, + "grad_norm": 0.3712601559118844, + "learning_rate": 0.00029780564385322085, + "loss": 3.1556954383850098, + "step": 3473, + "token_acc": 0.2823031671396317 + }, + { + "epoch": 2.03635297566696, + "grad_norm": 0.33314191723912395, + "learning_rate": 0.0002978031655243652, + "loss": 3.1179394721984863, + "step": 3474, + "token_acc": 0.2865865855043005 + }, + { + "epoch": 2.0369393139841687, + "grad_norm": 0.32220550490807864, + "learning_rate": 0.00029780068580709793, + "loss": 3.1002423763275146, + "step": 3475, + "token_acc": 0.2900517581377644 + }, + { + "epoch": 2.037525652301378, + "grad_norm": 0.41199791774491223, + "learning_rate": 0.0002977982047014424, + "loss": 3.135874032974243, + "step": 3476, + "token_acc": 0.28354010423945053 + }, + { + "epoch": 2.038111990618587, + "grad_norm": 0.40860395631955454, + "learning_rate": 0.0002977957222074219, + "loss": 3.0974645614624023, + "step": 3477, + "token_acc": 0.2870893157585079 + }, + { + "epoch": 2.038698328935796, + "grad_norm": 0.2997445321027915, + "learning_rate": 0.0002977932383250598, + "loss": 3.133939027786255, + "step": 3478, + "token_acc": 0.285202293724116 + }, + { + "epoch": 2.039284667253005, + "grad_norm": 0.3522548715001856, + "learning_rate": 0.00029779075305437936, + "loss": 3.1448988914489746, + "step": 3479, + "token_acc": 0.2847898246931877 + }, + { + "epoch": 2.039871005570214, + "grad_norm": 0.35811174498889337, + "learning_rate": 0.0002977882663954039, + "loss": 3.164663076400757, + "step": 3480, + "token_acc": 0.2799471861765558 + }, + { + "epoch": 2.040457343887423, + "grad_norm": 0.3523736077082178, + "learning_rate": 0.0002977857783481568, + "loss": 3.1280064582824707, + "step": 3481, + "token_acc": 0.2871072249430628 + }, + { + "epoch": 2.041043682204632, + "grad_norm": 0.3389376874865217, + "learning_rate": 0.0002977832889126615, + "loss": 3.1250505447387695, + "step": 3482, + "token_acc": 0.2853470233041602 + }, + { + "epoch": 2.041630020521841, + "grad_norm": 0.39684627094864483, + "learning_rate": 0.00029778079808894133, + "loss": 3.101452350616455, + "step": 3483, + "token_acc": 0.28957974284321536 + }, + { + "epoch": 2.0422163588390503, + "grad_norm": 0.3557693879707045, + "learning_rate": 0.00029777830587701974, + "loss": 3.16340708732605, + "step": 3484, + "token_acc": 0.28090264409642446 + }, + { + "epoch": 2.042802697156259, + "grad_norm": 0.33116527751910174, + "learning_rate": 0.0002977758122769201, + "loss": 3.155015468597412, + "step": 3485, + "token_acc": 0.281952505003973 + }, + { + "epoch": 2.043389035473468, + "grad_norm": 0.3669881335006332, + "learning_rate": 0.00029777331728866576, + "loss": 3.126927375793457, + "step": 3486, + "token_acc": 0.2837334104566342 + }, + { + "epoch": 2.043975373790677, + "grad_norm": 0.3732859662559385, + "learning_rate": 0.0002977708209122803, + "loss": 3.1357572078704834, + "step": 3487, + "token_acc": 0.28539349843830186 + }, + { + "epoch": 2.0445617121078863, + "grad_norm": 0.4382736906957138, + "learning_rate": 0.0002977683231477871, + "loss": 3.1393203735351562, + "step": 3488, + "token_acc": 0.2842107145046278 + }, + { + "epoch": 2.0451480504250954, + "grad_norm": 0.43775367165838047, + "learning_rate": 0.0002977658239952096, + "loss": 3.1039083003997803, + "step": 3489, + "token_acc": 0.28923629968707165 + }, + { + "epoch": 2.0457343887423045, + "grad_norm": 0.3287333893533817, + "learning_rate": 0.0002977633234545713, + "loss": 3.0934486389160156, + "step": 3490, + "token_acc": 0.2911684104051372 + }, + { + "epoch": 2.046320727059513, + "grad_norm": 0.40452705999500393, + "learning_rate": 0.0002977608215258957, + "loss": 3.134270668029785, + "step": 3491, + "token_acc": 0.28572189097103917 + }, + { + "epoch": 2.0469070653767223, + "grad_norm": 0.43003384312600423, + "learning_rate": 0.0002977583182092063, + "loss": 3.1502599716186523, + "step": 3492, + "token_acc": 0.2832874867549496 + }, + { + "epoch": 2.0474934036939314, + "grad_norm": 0.474634651217974, + "learning_rate": 0.00029775581350452657, + "loss": 3.1211743354797363, + "step": 3493, + "token_acc": 0.2877612312228905 + }, + { + "epoch": 2.0480797420111405, + "grad_norm": 0.4547410178087421, + "learning_rate": 0.0002977533074118801, + "loss": 3.1612777709960938, + "step": 3494, + "token_acc": 0.2815385720958643 + }, + { + "epoch": 2.0486660803283496, + "grad_norm": 0.37815061197681005, + "learning_rate": 0.0002977507999312904, + "loss": 3.127793312072754, + "step": 3495, + "token_acc": 0.28566074700894023 + }, + { + "epoch": 2.0492524186455583, + "grad_norm": 0.33109502420966236, + "learning_rate": 0.00029774829106278105, + "loss": 3.1137139797210693, + "step": 3496, + "token_acc": 0.2864732358141778 + }, + { + "epoch": 2.0498387569627674, + "grad_norm": 0.3488454773249142, + "learning_rate": 0.0002977457808063756, + "loss": 3.157222270965576, + "step": 3497, + "token_acc": 0.2803369130778737 + }, + { + "epoch": 2.0504250952799765, + "grad_norm": 0.3572317651350675, + "learning_rate": 0.0002977432691620976, + "loss": 3.148104190826416, + "step": 3498, + "token_acc": 0.2833471869804882 + }, + { + "epoch": 2.0510114335971856, + "grad_norm": 0.31564968542581406, + "learning_rate": 0.0002977407561299707, + "loss": 3.116899013519287, + "step": 3499, + "token_acc": 0.28815661147057897 + }, + { + "epoch": 2.0515977719143947, + "grad_norm": 0.3664372042406818, + "learning_rate": 0.00029773824171001846, + "loss": 3.1276073455810547, + "step": 3500, + "token_acc": 0.2850510213481638 + }, + { + "epoch": 2.052184110231604, + "grad_norm": 0.3481291757459716, + "learning_rate": 0.00029773572590226455, + "loss": 3.124985694885254, + "step": 3501, + "token_acc": 0.2849069095685324 + }, + { + "epoch": 2.0527704485488125, + "grad_norm": 0.3865212812170118, + "learning_rate": 0.00029773320870673256, + "loss": 3.0939674377441406, + "step": 3502, + "token_acc": 0.2909795897746792 + }, + { + "epoch": 2.0533567868660216, + "grad_norm": 0.3337478467015942, + "learning_rate": 0.0002977306901234461, + "loss": 3.1493101119995117, + "step": 3503, + "token_acc": 0.2830482033119354 + }, + { + "epoch": 2.0539431251832307, + "grad_norm": 0.35569292053366425, + "learning_rate": 0.0002977281701524289, + "loss": 3.130641460418701, + "step": 3504, + "token_acc": 0.2848616270406106 + }, + { + "epoch": 2.05452946350044, + "grad_norm": 0.3883722275928381, + "learning_rate": 0.0002977256487937046, + "loss": 3.157576560974121, + "step": 3505, + "token_acc": 0.27892406366382333 + }, + { + "epoch": 2.055115801817649, + "grad_norm": 0.33984671228836855, + "learning_rate": 0.00029772312604729696, + "loss": 3.113192319869995, + "step": 3506, + "token_acc": 0.286729158007569 + }, + { + "epoch": 2.0557021401348576, + "grad_norm": 0.33295295626915616, + "learning_rate": 0.00029772060191322956, + "loss": 3.12691593170166, + "step": 3507, + "token_acc": 0.2872470360880705 + }, + { + "epoch": 2.0562884784520667, + "grad_norm": 0.3637716501314176, + "learning_rate": 0.0002977180763915262, + "loss": 3.150609016418457, + "step": 3508, + "token_acc": 0.2836260346671929 + }, + { + "epoch": 2.056874816769276, + "grad_norm": 0.4138224822856884, + "learning_rate": 0.0002977155494822105, + "loss": 3.1472744941711426, + "step": 3509, + "token_acc": 0.2803072253327591 + }, + { + "epoch": 2.057461155086485, + "grad_norm": 0.41272097146259584, + "learning_rate": 0.00029771302118530624, + "loss": 3.1490025520324707, + "step": 3510, + "token_acc": 0.2815108445299942 + }, + { + "epoch": 2.058047493403694, + "grad_norm": 0.38247622432235545, + "learning_rate": 0.00029771049150083723, + "loss": 3.1066107749938965, + "step": 3511, + "token_acc": 0.28804639728970416 + }, + { + "epoch": 2.0586338317209028, + "grad_norm": 0.44756299158841223, + "learning_rate": 0.00029770796042882726, + "loss": 3.1254422664642334, + "step": 3512, + "token_acc": 0.28580617661874685 + }, + { + "epoch": 2.059220170038112, + "grad_norm": 0.38781975131153684, + "learning_rate": 0.00029770542796929997, + "loss": 3.1050002574920654, + "step": 3513, + "token_acc": 0.2876175238215935 + }, + { + "epoch": 2.059806508355321, + "grad_norm": 0.38283329468430505, + "learning_rate": 0.0002977028941222792, + "loss": 3.1218655109405518, + "step": 3514, + "token_acc": 0.28573012763305894 + }, + { + "epoch": 2.06039284667253, + "grad_norm": 0.37080950960535186, + "learning_rate": 0.00029770035888778887, + "loss": 3.1293225288391113, + "step": 3515, + "token_acc": 0.28524457256103053 + }, + { + "epoch": 2.060979184989739, + "grad_norm": 0.3460738813636834, + "learning_rate": 0.0002976978222658526, + "loss": 3.156442642211914, + "step": 3516, + "token_acc": 0.280388287514918 + }, + { + "epoch": 2.0615655233069483, + "grad_norm": 0.38894253341154644, + "learning_rate": 0.0002976952842564943, + "loss": 3.1228482723236084, + "step": 3517, + "token_acc": 0.2874927142275018 + }, + { + "epoch": 2.062151861624157, + "grad_norm": 0.3918812413865541, + "learning_rate": 0.0002976927448597379, + "loss": 3.1407010555267334, + "step": 3518, + "token_acc": 0.28330341932343633 + }, + { + "epoch": 2.062738199941366, + "grad_norm": 0.3603479048629899, + "learning_rate": 0.0002976902040756072, + "loss": 3.1232118606567383, + "step": 3519, + "token_acc": 0.2859059458427339 + }, + { + "epoch": 2.063324538258575, + "grad_norm": 0.3959649666699924, + "learning_rate": 0.000297687661904126, + "loss": 3.1359477043151855, + "step": 3520, + "token_acc": 0.28413763530880143 + }, + { + "epoch": 2.0639108765757843, + "grad_norm": 0.3869707058558757, + "learning_rate": 0.0002976851183453182, + "loss": 3.131862163543701, + "step": 3521, + "token_acc": 0.2860237456094662 + }, + { + "epoch": 2.0644972148929934, + "grad_norm": 0.3714431022261122, + "learning_rate": 0.00029768257339920774, + "loss": 3.1441493034362793, + "step": 3522, + "token_acc": 0.2838706294164689 + }, + { + "epoch": 2.065083553210202, + "grad_norm": 0.3335036041680849, + "learning_rate": 0.00029768002706581854, + "loss": 3.153364419937134, + "step": 3523, + "token_acc": 0.2819435287521609 + }, + { + "epoch": 2.065669891527411, + "grad_norm": 0.3824269434467475, + "learning_rate": 0.0002976774793451745, + "loss": 3.1012821197509766, + "step": 3524, + "token_acc": 0.2883134212248136 + }, + { + "epoch": 2.0662562298446203, + "grad_norm": 0.35969115990196454, + "learning_rate": 0.0002976749302372995, + "loss": 3.1937437057495117, + "step": 3525, + "token_acc": 0.27583544752707534 + }, + { + "epoch": 2.0668425681618294, + "grad_norm": 0.36222974563201477, + "learning_rate": 0.0002976723797422175, + "loss": 3.146131992340088, + "step": 3526, + "token_acc": 0.2818005960161029 + }, + { + "epoch": 2.0674289064790385, + "grad_norm": 0.35350449822194085, + "learning_rate": 0.00029766982785995255, + "loss": 3.113847017288208, + "step": 3527, + "token_acc": 0.287634684994635 + }, + { + "epoch": 2.068015244796247, + "grad_norm": 0.3340319690663173, + "learning_rate": 0.00029766727459052853, + "loss": 3.1147704124450684, + "step": 3528, + "token_acc": 0.28748429129340874 + }, + { + "epoch": 2.0686015831134563, + "grad_norm": 0.3993938391747189, + "learning_rate": 0.00029766471993396943, + "loss": 3.145965576171875, + "step": 3529, + "token_acc": 0.28364981068530104 + }, + { + "epoch": 2.0691879214306654, + "grad_norm": 0.3615959001463227, + "learning_rate": 0.00029766216389029925, + "loss": 3.121049165725708, + "step": 3530, + "token_acc": 0.2873849523436305 + }, + { + "epoch": 2.0697742597478745, + "grad_norm": 0.364255840245574, + "learning_rate": 0.00029765960645954207, + "loss": 3.146820068359375, + "step": 3531, + "token_acc": 0.2837215838286996 + }, + { + "epoch": 2.0703605980650837, + "grad_norm": 0.36466553204277075, + "learning_rate": 0.00029765704764172184, + "loss": 3.1751701831817627, + "step": 3532, + "token_acc": 0.2791495648028582 + }, + { + "epoch": 2.0709469363822928, + "grad_norm": 0.37937564535364704, + "learning_rate": 0.0002976544874368626, + "loss": 3.100560188293457, + "step": 3533, + "token_acc": 0.288098886505261 + }, + { + "epoch": 2.0715332746995014, + "grad_norm": 0.38360413351165645, + "learning_rate": 0.00029765192584498847, + "loss": 3.1499667167663574, + "step": 3534, + "token_acc": 0.28279552371390876 + }, + { + "epoch": 2.0721196130167105, + "grad_norm": 0.3765750315115907, + "learning_rate": 0.00029764936286612336, + "loss": 3.128371477127075, + "step": 3535, + "token_acc": 0.28464078793437586 + }, + { + "epoch": 2.0727059513339197, + "grad_norm": 0.44904992265709776, + "learning_rate": 0.00029764679850029154, + "loss": 3.1282753944396973, + "step": 3536, + "token_acc": 0.2861553857162116 + }, + { + "epoch": 2.0732922896511288, + "grad_norm": 0.38263740998766227, + "learning_rate": 0.00029764423274751696, + "loss": 3.113499164581299, + "step": 3537, + "token_acc": 0.2890378946736946 + }, + { + "epoch": 2.073878627968338, + "grad_norm": 0.3281635353850766, + "learning_rate": 0.0002976416656078238, + "loss": 3.13016676902771, + "step": 3538, + "token_acc": 0.285801624803117 + }, + { + "epoch": 2.0744649662855466, + "grad_norm": 0.3433197943314979, + "learning_rate": 0.0002976390970812361, + "loss": 3.11934494972229, + "step": 3539, + "token_acc": 0.2851191679757776 + }, + { + "epoch": 2.0750513046027557, + "grad_norm": 0.3888432848829322, + "learning_rate": 0.000297636527167778, + "loss": 3.140639305114746, + "step": 3540, + "token_acc": 0.2839151551452493 + }, + { + "epoch": 2.0756376429199648, + "grad_norm": 0.41079850319974714, + "learning_rate": 0.00029763395586747377, + "loss": 3.1287364959716797, + "step": 3541, + "token_acc": 0.28452843383934745 + }, + { + "epoch": 2.076223981237174, + "grad_norm": 0.34197633708891684, + "learning_rate": 0.00029763138318034745, + "loss": 3.1278767585754395, + "step": 3542, + "token_acc": 0.2860553423135104 + }, + { + "epoch": 2.076810319554383, + "grad_norm": 0.3798433352248671, + "learning_rate": 0.00029762880910642317, + "loss": 3.1371684074401855, + "step": 3543, + "token_acc": 0.2858989351880347 + }, + { + "epoch": 2.077396657871592, + "grad_norm": 0.3841485055669031, + "learning_rate": 0.00029762623364572516, + "loss": 3.1701271533966064, + "step": 3544, + "token_acc": 0.27996077253652324 + }, + { + "epoch": 2.077982996188801, + "grad_norm": 0.38086084025145295, + "learning_rate": 0.0002976236567982776, + "loss": 3.1060070991516113, + "step": 3545, + "token_acc": 0.28773669995858625 + }, + { + "epoch": 2.07856933450601, + "grad_norm": 0.3255364386142253, + "learning_rate": 0.00029762107856410474, + "loss": 3.1586480140686035, + "step": 3546, + "token_acc": 0.280195677163609 + }, + { + "epoch": 2.079155672823219, + "grad_norm": 0.3557486075825875, + "learning_rate": 0.0002976184989432308, + "loss": 3.151869297027588, + "step": 3547, + "token_acc": 0.28266084279116394 + }, + { + "epoch": 2.079742011140428, + "grad_norm": 0.4177411514690902, + "learning_rate": 0.00029761591793567993, + "loss": 3.154684066772461, + "step": 3548, + "token_acc": 0.2822419283261836 + }, + { + "epoch": 2.0803283494576372, + "grad_norm": 0.4010392912952029, + "learning_rate": 0.00029761333554147645, + "loss": 3.168264150619507, + "step": 3549, + "token_acc": 0.27807405110035827 + }, + { + "epoch": 2.080914687774846, + "grad_norm": 0.3937215626325022, + "learning_rate": 0.0002976107517606446, + "loss": 3.133364677429199, + "step": 3550, + "token_acc": 0.2847749062911299 + }, + { + "epoch": 2.081501026092055, + "grad_norm": 0.3836105065696574, + "learning_rate": 0.0002976081665932086, + "loss": 3.112882614135742, + "step": 3551, + "token_acc": 0.2858181552838897 + }, + { + "epoch": 2.082087364409264, + "grad_norm": 0.3602429426971644, + "learning_rate": 0.00029760558003919283, + "loss": 3.14243221282959, + "step": 3552, + "token_acc": 0.284724178734434 + }, + { + "epoch": 2.0826737027264732, + "grad_norm": 0.3610106469486345, + "learning_rate": 0.0002976029920986215, + "loss": 3.1269149780273438, + "step": 3553, + "token_acc": 0.28660333666649124 + }, + { + "epoch": 2.0832600410436823, + "grad_norm": 0.41613010757234326, + "learning_rate": 0.00029760040277151896, + "loss": 3.1113481521606445, + "step": 3554, + "token_acc": 0.28706547171387986 + }, + { + "epoch": 2.0838463793608915, + "grad_norm": 0.3325729284686215, + "learning_rate": 0.0002975978120579096, + "loss": 3.1228528022766113, + "step": 3555, + "token_acc": 0.285179826105093 + }, + { + "epoch": 2.0844327176781, + "grad_norm": 0.3515770367479981, + "learning_rate": 0.00029759521995781764, + "loss": 3.131241798400879, + "step": 3556, + "token_acc": 0.28476430683130927 + }, + { + "epoch": 2.0850190559953092, + "grad_norm": 0.39192313867772643, + "learning_rate": 0.00029759262647126745, + "loss": 3.1419215202331543, + "step": 3557, + "token_acc": 0.2818572023192494 + }, + { + "epoch": 2.0856053943125183, + "grad_norm": 0.35513370214930834, + "learning_rate": 0.0002975900315982834, + "loss": 3.128060817718506, + "step": 3558, + "token_acc": 0.2859430495938014 + }, + { + "epoch": 2.0861917326297275, + "grad_norm": 0.3611433351560402, + "learning_rate": 0.00029758743533889, + "loss": 3.12465500831604, + "step": 3559, + "token_acc": 0.2854470622905293 + }, + { + "epoch": 2.0867780709469366, + "grad_norm": 0.3880295163940839, + "learning_rate": 0.00029758483769311137, + "loss": 3.143515110015869, + "step": 3560, + "token_acc": 0.28390329579502016 + }, + { + "epoch": 2.0873644092641452, + "grad_norm": 0.3972605105443471, + "learning_rate": 0.0002975822386609722, + "loss": 3.1253089904785156, + "step": 3561, + "token_acc": 0.2852068726200874 + }, + { + "epoch": 2.0879507475813543, + "grad_norm": 0.3317098536815664, + "learning_rate": 0.00029757963824249663, + "loss": 3.1522445678710938, + "step": 3562, + "token_acc": 0.2828404790909777 + }, + { + "epoch": 2.0885370858985635, + "grad_norm": 0.32468246299298087, + "learning_rate": 0.0002975770364377093, + "loss": 3.1402926445007324, + "step": 3563, + "token_acc": 0.28204549129105855 + }, + { + "epoch": 2.0891234242157726, + "grad_norm": 0.406059457917977, + "learning_rate": 0.0002975744332466346, + "loss": 3.1296956539154053, + "step": 3564, + "token_acc": 0.2857222428518381 + }, + { + "epoch": 2.0897097625329817, + "grad_norm": 0.3690906744241124, + "learning_rate": 0.0002975718286692969, + "loss": 3.2085421085357666, + "step": 3565, + "token_acc": 0.2740298239310097 + }, + { + "epoch": 2.0902961008501904, + "grad_norm": 0.3529771055414883, + "learning_rate": 0.00029756922270572075, + "loss": 3.0420050621032715, + "step": 3566, + "token_acc": 0.2981452663179039 + }, + { + "epoch": 2.0908824391673995, + "grad_norm": 0.39812031092471895, + "learning_rate": 0.00029756661535593063, + "loss": 3.150214195251465, + "step": 3567, + "token_acc": 0.28322985495980196 + }, + { + "epoch": 2.0914687774846086, + "grad_norm": 0.4796722219156357, + "learning_rate": 0.000297564006619951, + "loss": 3.1549415588378906, + "step": 3568, + "token_acc": 0.28169084218441115 + }, + { + "epoch": 2.0920551158018177, + "grad_norm": 0.49373391490108404, + "learning_rate": 0.00029756139649780633, + "loss": 3.11130428314209, + "step": 3569, + "token_acc": 0.2873060906380779 + }, + { + "epoch": 2.092641454119027, + "grad_norm": 0.35317175253818717, + "learning_rate": 0.0002975587849895212, + "loss": 3.1161997318267822, + "step": 3570, + "token_acc": 0.28728083678916694 + }, + { + "epoch": 2.093227792436236, + "grad_norm": 0.4683002052064412, + "learning_rate": 0.00029755617209512015, + "loss": 3.119175910949707, + "step": 3571, + "token_acc": 0.28698096206033047 + }, + { + "epoch": 2.0938141307534446, + "grad_norm": 0.6186168394449949, + "learning_rate": 0.0002975535578146277, + "loss": 3.1231987476348877, + "step": 3572, + "token_acc": 0.2875237426972732 + }, + { + "epoch": 2.0944004690706537, + "grad_norm": 0.4705795273417421, + "learning_rate": 0.0002975509421480684, + "loss": 3.107603073120117, + "step": 3573, + "token_acc": 0.28907461424187547 + }, + { + "epoch": 2.094986807387863, + "grad_norm": 0.48348225012270607, + "learning_rate": 0.0002975483250954668, + "loss": 3.131746768951416, + "step": 3574, + "token_acc": 0.28382917975715377 + }, + { + "epoch": 2.095573145705072, + "grad_norm": 0.515562667266918, + "learning_rate": 0.00029754570665684754, + "loss": 3.1096596717834473, + "step": 3575, + "token_acc": 0.2879959405899859 + }, + { + "epoch": 2.096159484022281, + "grad_norm": 0.48368499286266764, + "learning_rate": 0.00029754308683223514, + "loss": 3.1457672119140625, + "step": 3576, + "token_acc": 0.2827620890538602 + }, + { + "epoch": 2.0967458223394897, + "grad_norm": 0.4182187474375285, + "learning_rate": 0.0002975404656216543, + "loss": 3.1215929985046387, + "step": 3577, + "token_acc": 0.288066705551726 + }, + { + "epoch": 2.097332160656699, + "grad_norm": 0.4080862006835993, + "learning_rate": 0.00029753784302512953, + "loss": 3.098050832748413, + "step": 3578, + "token_acc": 0.2904650996496174 + }, + { + "epoch": 2.097918498973908, + "grad_norm": 0.4153475572668984, + "learning_rate": 0.0002975352190426856, + "loss": 3.196120262145996, + "step": 3579, + "token_acc": 0.2744038105150249 + }, + { + "epoch": 2.098504837291117, + "grad_norm": 0.40481276513796816, + "learning_rate": 0.0002975325936743471, + "loss": 3.076892614364624, + "step": 3580, + "token_acc": 0.2929127112579418 + }, + { + "epoch": 2.099091175608326, + "grad_norm": 0.37725165741294653, + "learning_rate": 0.0002975299669201387, + "loss": 3.1096906661987305, + "step": 3581, + "token_acc": 0.28758836746932487 + }, + { + "epoch": 2.099677513925535, + "grad_norm": 0.40330931200002346, + "learning_rate": 0.000297527338780085, + "loss": 3.145169973373413, + "step": 3582, + "token_acc": 0.28202253247803355 + }, + { + "epoch": 2.100263852242744, + "grad_norm": 0.37089091837489785, + "learning_rate": 0.0002975247092542108, + "loss": 3.1549324989318848, + "step": 3583, + "token_acc": 0.28181232405710455 + }, + { + "epoch": 2.100850190559953, + "grad_norm": 0.3572884964490882, + "learning_rate": 0.00029752207834254067, + "loss": 3.1102874279022217, + "step": 3584, + "token_acc": 0.28770217336365933 + }, + { + "epoch": 2.101436528877162, + "grad_norm": 0.33487077283584704, + "learning_rate": 0.00029751944604509945, + "loss": 3.071106195449829, + "step": 3585, + "token_acc": 0.2922787519616859 + }, + { + "epoch": 2.1020228671943713, + "grad_norm": 0.32623614118179456, + "learning_rate": 0.00029751681236191185, + "loss": 3.1419413089752197, + "step": 3586, + "token_acc": 0.28314423749847156 + }, + { + "epoch": 2.1026092055115804, + "grad_norm": 0.3352223218145237, + "learning_rate": 0.00029751417729300257, + "loss": 3.1588094234466553, + "step": 3587, + "token_acc": 0.2824614879412302 + }, + { + "epoch": 2.103195543828789, + "grad_norm": 0.35047350883467737, + "learning_rate": 0.00029751154083839634, + "loss": 3.160095691680908, + "step": 3588, + "token_acc": 0.2810374607274833 + }, + { + "epoch": 2.103781882145998, + "grad_norm": 0.33249898386672827, + "learning_rate": 0.00029750890299811794, + "loss": 3.1361794471740723, + "step": 3589, + "token_acc": 0.28384104176486735 + }, + { + "epoch": 2.1043682204632073, + "grad_norm": 0.3197873264980517, + "learning_rate": 0.0002975062637721922, + "loss": 3.146862506866455, + "step": 3590, + "token_acc": 0.2827296727994848 + }, + { + "epoch": 2.1049545587804164, + "grad_norm": 0.3635327029849855, + "learning_rate": 0.00029750362316064387, + "loss": 3.1684470176696777, + "step": 3591, + "token_acc": 0.28133028005895977 + }, + { + "epoch": 2.1055408970976255, + "grad_norm": 0.329738225742666, + "learning_rate": 0.0002975009811634978, + "loss": 3.1855082511901855, + "step": 3592, + "token_acc": 0.2780068159662828 + }, + { + "epoch": 2.106127235414834, + "grad_norm": 0.3367470358602128, + "learning_rate": 0.0002974983377807787, + "loss": 3.105827808380127, + "step": 3593, + "token_acc": 0.2894561419947261 + }, + { + "epoch": 2.1067135737320433, + "grad_norm": 0.35161801930545405, + "learning_rate": 0.0002974956930125116, + "loss": 3.1370208263397217, + "step": 3594, + "token_acc": 0.2831918151270352 + }, + { + "epoch": 2.1072999120492524, + "grad_norm": 0.3532362738760793, + "learning_rate": 0.00029749304685872113, + "loss": 3.1760120391845703, + "step": 3595, + "token_acc": 0.2798521375935876 + }, + { + "epoch": 2.1078862503664615, + "grad_norm": 0.3052014277711385, + "learning_rate": 0.0002974903993194322, + "loss": 3.149266481399536, + "step": 3596, + "token_acc": 0.2831579132065162 + }, + { + "epoch": 2.1084725886836706, + "grad_norm": 0.34982371711926846, + "learning_rate": 0.0002974877503946698, + "loss": 3.1322388648986816, + "step": 3597, + "token_acc": 0.2843834965551831 + }, + { + "epoch": 2.1090589270008797, + "grad_norm": 0.3784844717049778, + "learning_rate": 0.0002974851000844586, + "loss": 3.128718376159668, + "step": 3598, + "token_acc": 0.2859041148001775 + }, + { + "epoch": 2.1096452653180884, + "grad_norm": 0.37386227007663875, + "learning_rate": 0.00029748244838882375, + "loss": 3.182392120361328, + "step": 3599, + "token_acc": 0.2768258033742236 + }, + { + "epoch": 2.1102316036352975, + "grad_norm": 0.33080129152623244, + "learning_rate": 0.00029747979530778996, + "loss": 3.1474149227142334, + "step": 3600, + "token_acc": 0.28321249508778684 + }, + { + "epoch": 2.1108179419525066, + "grad_norm": 0.3753553148563084, + "learning_rate": 0.00029747714084138227, + "loss": 3.1178596019744873, + "step": 3601, + "token_acc": 0.28559688917692805 + }, + { + "epoch": 2.1114042802697157, + "grad_norm": 0.3731691525259236, + "learning_rate": 0.00029747448498962555, + "loss": 3.0865046977996826, + "step": 3602, + "token_acc": 0.29061540862508456 + }, + { + "epoch": 2.111990618586925, + "grad_norm": 0.3781002074743101, + "learning_rate": 0.0002974718277525448, + "loss": 3.1164581775665283, + "step": 3603, + "token_acc": 0.28721874263088204 + }, + { + "epoch": 2.1125769569041335, + "grad_norm": 0.3453019683483662, + "learning_rate": 0.00029746916913016486, + "loss": 3.070502519607544, + "step": 3604, + "token_acc": 0.29477374123645633 + }, + { + "epoch": 2.1131632952213426, + "grad_norm": 0.38588116351145907, + "learning_rate": 0.0002974665091225109, + "loss": 3.1674442291259766, + "step": 3605, + "token_acc": 0.28288178960057775 + }, + { + "epoch": 2.1137496335385517, + "grad_norm": 0.34391724128141, + "learning_rate": 0.00029746384772960774, + "loss": 3.1040329933166504, + "step": 3606, + "token_acc": 0.28940249147702085 + }, + { + "epoch": 2.114335971855761, + "grad_norm": 0.39691114757066587, + "learning_rate": 0.00029746118495148046, + "loss": 3.1164209842681885, + "step": 3607, + "token_acc": 0.2855855949407837 + }, + { + "epoch": 2.11492231017297, + "grad_norm": 0.3785613378593729, + "learning_rate": 0.00029745852078815404, + "loss": 3.1203668117523193, + "step": 3608, + "token_acc": 0.2866854267667885 + }, + { + "epoch": 2.115508648490179, + "grad_norm": 0.3535489550677346, + "learning_rate": 0.00029745585523965353, + "loss": 3.1209752559661865, + "step": 3609, + "token_acc": 0.2857183884038598 + }, + { + "epoch": 2.1160949868073877, + "grad_norm": 0.3497399312883876, + "learning_rate": 0.0002974531883060039, + "loss": 3.1599011421203613, + "step": 3610, + "token_acc": 0.2814807521906075 + }, + { + "epoch": 2.116681325124597, + "grad_norm": 0.3091547294333782, + "learning_rate": 0.00029745051998723035, + "loss": 3.1567039489746094, + "step": 3611, + "token_acc": 0.2806494305366563 + }, + { + "epoch": 2.117267663441806, + "grad_norm": 0.3351889044790218, + "learning_rate": 0.00029744785028335783, + "loss": 3.1179704666137695, + "step": 3612, + "token_acc": 0.2853116836098153 + }, + { + "epoch": 2.117854001759015, + "grad_norm": 0.31912916059871516, + "learning_rate": 0.00029744517919441145, + "loss": 3.087202548980713, + "step": 3613, + "token_acc": 0.29077147733439596 + }, + { + "epoch": 2.118440340076224, + "grad_norm": 0.3028282332306614, + "learning_rate": 0.00029744250672041625, + "loss": 3.143599033355713, + "step": 3614, + "token_acc": 0.28466444484905096 + }, + { + "epoch": 2.119026678393433, + "grad_norm": 0.31557314686677157, + "learning_rate": 0.00029743983286139745, + "loss": 3.0971221923828125, + "step": 3615, + "token_acc": 0.2904314588763716 + }, + { + "epoch": 2.119613016710642, + "grad_norm": 0.3553347787494989, + "learning_rate": 0.00029743715761738004, + "loss": 3.0950396060943604, + "step": 3616, + "token_acc": 0.28926570102945653 + }, + { + "epoch": 2.120199355027851, + "grad_norm": 0.40497298504576884, + "learning_rate": 0.0002974344809883892, + "loss": 3.1668670177459717, + "step": 3617, + "token_acc": 0.2792664213087144 + }, + { + "epoch": 2.12078569334506, + "grad_norm": 0.36363457777852265, + "learning_rate": 0.00029743180297445013, + "loss": 3.1283974647521973, + "step": 3618, + "token_acc": 0.2871843760344257 + }, + { + "epoch": 2.1213720316622693, + "grad_norm": 0.37108617335266614, + "learning_rate": 0.00029742912357558796, + "loss": 3.1360888481140137, + "step": 3619, + "token_acc": 0.28266875463179975 + }, + { + "epoch": 2.121958369979478, + "grad_norm": 0.3743372212156133, + "learning_rate": 0.0002974264427918278, + "loss": 3.1293435096740723, + "step": 3620, + "token_acc": 0.28594275382240364 + }, + { + "epoch": 2.122544708296687, + "grad_norm": 0.3686261478387375, + "learning_rate": 0.00029742376062319486, + "loss": 3.160668134689331, + "step": 3621, + "token_acc": 0.281427115837738 + }, + { + "epoch": 2.123131046613896, + "grad_norm": 0.34355449323084775, + "learning_rate": 0.00029742107706971435, + "loss": 3.103022575378418, + "step": 3622, + "token_acc": 0.288292641929655 + }, + { + "epoch": 2.1237173849311053, + "grad_norm": 0.3747066978294427, + "learning_rate": 0.00029741839213141147, + "loss": 3.133305072784424, + "step": 3623, + "token_acc": 0.2850711970230641 + }, + { + "epoch": 2.1243037232483144, + "grad_norm": 0.4094916292632341, + "learning_rate": 0.0002974157058083114, + "loss": 3.126133918762207, + "step": 3624, + "token_acc": 0.2851364600943327 + }, + { + "epoch": 2.1248900615655235, + "grad_norm": 0.3861210919909546, + "learning_rate": 0.0002974130181004395, + "loss": 3.1097095012664795, + "step": 3625, + "token_acc": 0.28936690374229995 + }, + { + "epoch": 2.125476399882732, + "grad_norm": 0.3866438357201555, + "learning_rate": 0.0002974103290078209, + "loss": 3.110128402709961, + "step": 3626, + "token_acc": 0.28882484051956037 + }, + { + "epoch": 2.1260627381999413, + "grad_norm": 0.4280062320951143, + "learning_rate": 0.00029740763853048095, + "loss": 3.067365884780884, + "step": 3627, + "token_acc": 0.29454783867124723 + }, + { + "epoch": 2.1266490765171504, + "grad_norm": 0.3756356268079855, + "learning_rate": 0.0002974049466684448, + "loss": 3.1206445693969727, + "step": 3628, + "token_acc": 0.2875451709790596 + }, + { + "epoch": 2.1272354148343595, + "grad_norm": 0.38587802126662996, + "learning_rate": 0.00029740225342173786, + "loss": 3.118781328201294, + "step": 3629, + "token_acc": 0.2865087603245281 + }, + { + "epoch": 2.1278217531515686, + "grad_norm": 0.3401795810505323, + "learning_rate": 0.00029739955879038533, + "loss": 3.142815113067627, + "step": 3630, + "token_acc": 0.2833537220434422 + }, + { + "epoch": 2.1284080914687773, + "grad_norm": 0.3824268935167012, + "learning_rate": 0.0002973968627744125, + "loss": 3.150843381881714, + "step": 3631, + "token_acc": 0.2818112990810359 + }, + { + "epoch": 2.1289944297859864, + "grad_norm": 0.36166420470483296, + "learning_rate": 0.0002973941653738449, + "loss": 3.124642848968506, + "step": 3632, + "token_acc": 0.2845077951918085 + }, + { + "epoch": 2.1295807681031955, + "grad_norm": 0.3768012167825204, + "learning_rate": 0.0002973914665887077, + "loss": 3.1119298934936523, + "step": 3633, + "token_acc": 0.2856994582552782 + }, + { + "epoch": 2.1301671064204046, + "grad_norm": 0.3178086023299626, + "learning_rate": 0.00029738876641902627, + "loss": 3.1113576889038086, + "step": 3634, + "token_acc": 0.28883917278016696 + }, + { + "epoch": 2.1307534447376137, + "grad_norm": 0.32007537882220766, + "learning_rate": 0.000297386064864826, + "loss": 3.1324214935302734, + "step": 3635, + "token_acc": 0.28540138509174856 + }, + { + "epoch": 2.1313397830548224, + "grad_norm": 0.35061623679753595, + "learning_rate": 0.0002973833619261322, + "loss": 3.1358771324157715, + "step": 3636, + "token_acc": 0.2850238470394126 + }, + { + "epoch": 2.1319261213720315, + "grad_norm": 0.3727979365920853, + "learning_rate": 0.00029738065760297037, + "loss": 3.1353282928466797, + "step": 3637, + "token_acc": 0.28409883168191324 + }, + { + "epoch": 2.1325124596892406, + "grad_norm": 0.33173411988510243, + "learning_rate": 0.00029737795189536584, + "loss": 3.1528213024139404, + "step": 3638, + "token_acc": 0.2827891809443663 + }, + { + "epoch": 2.1330987980064497, + "grad_norm": 0.34613418929036993, + "learning_rate": 0.00029737524480334405, + "loss": 3.145735740661621, + "step": 3639, + "token_acc": 0.2821236595445828 + }, + { + "epoch": 2.133685136323659, + "grad_norm": 0.37945271611362374, + "learning_rate": 0.00029737253632693047, + "loss": 3.177666187286377, + "step": 3640, + "token_acc": 0.2779716804872037 + }, + { + "epoch": 2.134271474640868, + "grad_norm": 0.38928774215867945, + "learning_rate": 0.0002973698264661504, + "loss": 3.143641948699951, + "step": 3641, + "token_acc": 0.28483405792664257 + }, + { + "epoch": 2.1348578129580766, + "grad_norm": 0.3525326574678476, + "learning_rate": 0.0002973671152210295, + "loss": 3.1402170658111572, + "step": 3642, + "token_acc": 0.2835931132172523 + }, + { + "epoch": 2.1354441512752858, + "grad_norm": 0.3690826914844778, + "learning_rate": 0.00029736440259159303, + "loss": 3.1585018634796143, + "step": 3643, + "token_acc": 0.2830827978849169 + }, + { + "epoch": 2.136030489592495, + "grad_norm": 0.4272356535361869, + "learning_rate": 0.00029736168857786666, + "loss": 3.161320209503174, + "step": 3644, + "token_acc": 0.2819222608975323 + }, + { + "epoch": 2.136616827909704, + "grad_norm": 0.37547532349016904, + "learning_rate": 0.0002973589731798757, + "loss": 3.1558077335357666, + "step": 3645, + "token_acc": 0.2822075186581045 + }, + { + "epoch": 2.137203166226913, + "grad_norm": 0.3848749721475837, + "learning_rate": 0.0002973562563976459, + "loss": 3.0737271308898926, + "step": 3646, + "token_acc": 0.2905421434375962 + }, + { + "epoch": 2.1377895045441218, + "grad_norm": 0.37523437087585715, + "learning_rate": 0.00029735353823120254, + "loss": 3.1658897399902344, + "step": 3647, + "token_acc": 0.28095983111601663 + }, + { + "epoch": 2.138375842861331, + "grad_norm": 0.3591564976510279, + "learning_rate": 0.00029735081868057124, + "loss": 3.091909646987915, + "step": 3648, + "token_acc": 0.2905892443292761 + }, + { + "epoch": 2.13896218117854, + "grad_norm": 0.39226765137752523, + "learning_rate": 0.0002973480977457776, + "loss": 3.172088384628296, + "step": 3649, + "token_acc": 0.2801198168695981 + }, + { + "epoch": 2.139548519495749, + "grad_norm": 0.38308088062879925, + "learning_rate": 0.00029734537542684713, + "loss": 3.1077122688293457, + "step": 3650, + "token_acc": 0.2891672103524625 + }, + { + "epoch": 2.140134857812958, + "grad_norm": 0.3679522544072703, + "learning_rate": 0.0002973426517238054, + "loss": 3.128175735473633, + "step": 3651, + "token_acc": 0.28663294493753416 + }, + { + "epoch": 2.1407211961301673, + "grad_norm": 0.3779372267794978, + "learning_rate": 0.00029733992663667796, + "loss": 3.1356520652770996, + "step": 3652, + "token_acc": 0.28427416172532266 + }, + { + "epoch": 2.141307534447376, + "grad_norm": 0.37868653216076603, + "learning_rate": 0.0002973372001654905, + "loss": 3.191227674484253, + "step": 3653, + "token_acc": 0.2760653738584097 + }, + { + "epoch": 2.141893872764585, + "grad_norm": 0.3907757891161318, + "learning_rate": 0.0002973344723102686, + "loss": 3.1168456077575684, + "step": 3654, + "token_acc": 0.2876745402599493 + }, + { + "epoch": 2.142480211081794, + "grad_norm": 0.35936585638240326, + "learning_rate": 0.0002973317430710378, + "loss": 3.1460936069488525, + "step": 3655, + "token_acc": 0.2831868843245196 + }, + { + "epoch": 2.1430665493990033, + "grad_norm": 0.4348895267124142, + "learning_rate": 0.00029732901244782384, + "loss": 3.0864439010620117, + "step": 3656, + "token_acc": 0.2912484606750144 + }, + { + "epoch": 2.1436528877162124, + "grad_norm": 0.41434188903484775, + "learning_rate": 0.00029732628044065235, + "loss": 3.0974295139312744, + "step": 3657, + "token_acc": 0.29001516807623834 + }, + { + "epoch": 2.144239226033421, + "grad_norm": 0.35540701313535933, + "learning_rate": 0.000297323547049549, + "loss": 3.1467647552490234, + "step": 3658, + "token_acc": 0.28208001242107444 + }, + { + "epoch": 2.14482556435063, + "grad_norm": 0.39805944269643206, + "learning_rate": 0.0002973208122745394, + "loss": 3.126558780670166, + "step": 3659, + "token_acc": 0.2856025520697101 + }, + { + "epoch": 2.1454119026678393, + "grad_norm": 0.39415621137394274, + "learning_rate": 0.00029731807611564935, + "loss": 3.1468863487243652, + "step": 3660, + "token_acc": 0.2817636849664858 + }, + { + "epoch": 2.1459982409850484, + "grad_norm": 0.39463116534085596, + "learning_rate": 0.0002973153385729044, + "loss": 3.1094295978546143, + "step": 3661, + "token_acc": 0.28611949157410166 + }, + { + "epoch": 2.1465845793022575, + "grad_norm": 0.4082472157834538, + "learning_rate": 0.0002973125996463304, + "loss": 3.1418848037719727, + "step": 3662, + "token_acc": 0.28449804429430414 + }, + { + "epoch": 2.1471709176194667, + "grad_norm": 0.39048125810586015, + "learning_rate": 0.0002973098593359531, + "loss": 3.158926486968994, + "step": 3663, + "token_acc": 0.28208582177905234 + }, + { + "epoch": 2.1477572559366753, + "grad_norm": 0.3622529469682711, + "learning_rate": 0.00029730711764179807, + "loss": 3.102562427520752, + "step": 3664, + "token_acc": 0.28952291489209264 + }, + { + "epoch": 2.1483435942538844, + "grad_norm": 0.36877861549427204, + "learning_rate": 0.0002973043745638912, + "loss": 3.1528053283691406, + "step": 3665, + "token_acc": 0.2829999352625105 + }, + { + "epoch": 2.1489299325710935, + "grad_norm": 0.40602504675755247, + "learning_rate": 0.00029730163010225827, + "loss": 3.09246826171875, + "step": 3666, + "token_acc": 0.2910243568448947 + }, + { + "epoch": 2.1495162708883027, + "grad_norm": 0.3625122608283235, + "learning_rate": 0.00029729888425692494, + "loss": 3.1218814849853516, + "step": 3667, + "token_acc": 0.28450962705713845 + }, + { + "epoch": 2.1501026092055118, + "grad_norm": 0.37925539397051744, + "learning_rate": 0.0002972961370279171, + "loss": 3.1087779998779297, + "step": 3668, + "token_acc": 0.2884590130543964 + }, + { + "epoch": 2.1506889475227204, + "grad_norm": 0.35095677442654594, + "learning_rate": 0.0002972933884152606, + "loss": 3.1027166843414307, + "step": 3669, + "token_acc": 0.28937091183765745 + }, + { + "epoch": 2.1512752858399296, + "grad_norm": 0.3144874769407481, + "learning_rate": 0.00029729063841898117, + "loss": 3.1827893257141113, + "step": 3670, + "token_acc": 0.2768148129018929 + }, + { + "epoch": 2.1518616241571387, + "grad_norm": 0.34357852056190324, + "learning_rate": 0.00029728788703910463, + "loss": 3.1004798412323, + "step": 3671, + "token_acc": 0.287819975094957 + }, + { + "epoch": 2.1524479624743478, + "grad_norm": 0.35489894241709186, + "learning_rate": 0.0002972851342756569, + "loss": 3.13132905960083, + "step": 3672, + "token_acc": 0.2841390413623201 + }, + { + "epoch": 2.153034300791557, + "grad_norm": 0.2890847081218417, + "learning_rate": 0.0002972823801286638, + "loss": 3.118220567703247, + "step": 3673, + "token_acc": 0.28704080214536826 + }, + { + "epoch": 2.1536206391087656, + "grad_norm": 0.3737304874616641, + "learning_rate": 0.00029727962459815115, + "loss": 3.1819117069244385, + "step": 3674, + "token_acc": 0.27841109620950094 + }, + { + "epoch": 2.1542069774259747, + "grad_norm": 0.34913409484649843, + "learning_rate": 0.00029727686768414493, + "loss": 3.0963265895843506, + "step": 3675, + "token_acc": 0.2915285936807653 + }, + { + "epoch": 2.154793315743184, + "grad_norm": 0.32281029421913626, + "learning_rate": 0.000297274109386671, + "loss": 3.119079113006592, + "step": 3676, + "token_acc": 0.2857142857142857 + }, + { + "epoch": 2.155379654060393, + "grad_norm": 0.3959722050397268, + "learning_rate": 0.00029727134970575523, + "loss": 3.1270384788513184, + "step": 3677, + "token_acc": 0.2850111632473174 + }, + { + "epoch": 2.155965992377602, + "grad_norm": 0.31602764833827146, + "learning_rate": 0.00029726858864142364, + "loss": 3.079014778137207, + "step": 3678, + "token_acc": 0.29039350058180613 + }, + { + "epoch": 2.1565523306948107, + "grad_norm": 0.32875718990960107, + "learning_rate": 0.0002972658261937021, + "loss": 3.1619462966918945, + "step": 3679, + "token_acc": 0.2820438783781636 + }, + { + "epoch": 2.15713866901202, + "grad_norm": 0.37718074180093003, + "learning_rate": 0.0002972630623626165, + "loss": 3.139472246170044, + "step": 3680, + "token_acc": 0.2851135743655937 + }, + { + "epoch": 2.157725007329229, + "grad_norm": 0.37040363958150374, + "learning_rate": 0.0002972602971481929, + "loss": 3.1133675575256348, + "step": 3681, + "token_acc": 0.28805701061758837 + }, + { + "epoch": 2.158311345646438, + "grad_norm": 0.33948286677427697, + "learning_rate": 0.0002972575305504573, + "loss": 3.1386313438415527, + "step": 3682, + "token_acc": 0.28480566666928625 + }, + { + "epoch": 2.158897683963647, + "grad_norm": 0.3612626653013946, + "learning_rate": 0.00029725476256943555, + "loss": 3.210702419281006, + "step": 3683, + "token_acc": 0.2749471183246283 + }, + { + "epoch": 2.1594840222808562, + "grad_norm": 0.3903099635280311, + "learning_rate": 0.0002972519932051538, + "loss": 3.138247013092041, + "step": 3684, + "token_acc": 0.28619558704496 + }, + { + "epoch": 2.160070360598065, + "grad_norm": 0.3615692010727741, + "learning_rate": 0.000297249222457638, + "loss": 3.1362054347991943, + "step": 3685, + "token_acc": 0.28323117856242247 + }, + { + "epoch": 2.160656698915274, + "grad_norm": 0.3606492339244674, + "learning_rate": 0.00029724645032691414, + "loss": 3.114980697631836, + "step": 3686, + "token_acc": 0.2869639012099697 + }, + { + "epoch": 2.161243037232483, + "grad_norm": 0.4016473151811356, + "learning_rate": 0.0002972436768130083, + "loss": 3.1188037395477295, + "step": 3687, + "token_acc": 0.2857533526442812 + }, + { + "epoch": 2.1618293755496922, + "grad_norm": 0.413287236823868, + "learning_rate": 0.00029724090191594654, + "loss": 3.1648106575012207, + "step": 3688, + "token_acc": 0.281669070898114 + }, + { + "epoch": 2.1624157138669013, + "grad_norm": 0.3948354021205157, + "learning_rate": 0.0002972381256357549, + "loss": 3.132230043411255, + "step": 3689, + "token_acc": 0.2838120414496001 + }, + { + "epoch": 2.16300205218411, + "grad_norm": 0.345112297981206, + "learning_rate": 0.0002972353479724595, + "loss": 3.0848844051361084, + "step": 3690, + "token_acc": 0.29048484880241876 + }, + { + "epoch": 2.163588390501319, + "grad_norm": 0.351916771334899, + "learning_rate": 0.0002972325689260864, + "loss": 3.125626564025879, + "step": 3691, + "token_acc": 0.2874450808969182 + }, + { + "epoch": 2.1641747288185282, + "grad_norm": 0.3164030938334927, + "learning_rate": 0.0002972297884966617, + "loss": 3.1505215167999268, + "step": 3692, + "token_acc": 0.28120564348299376 + }, + { + "epoch": 2.1647610671357373, + "grad_norm": 0.34315390849571514, + "learning_rate": 0.0002972270066842115, + "loss": 3.128664970397949, + "step": 3693, + "token_acc": 0.2848959903527284 + }, + { + "epoch": 2.1653474054529465, + "grad_norm": 0.3449984384364031, + "learning_rate": 0.000297224223488762, + "loss": 3.1447925567626953, + "step": 3694, + "token_acc": 0.2818267651357399 + }, + { + "epoch": 2.1659337437701556, + "grad_norm": 0.3681370129097049, + "learning_rate": 0.00029722143891033935, + "loss": 3.1524910926818848, + "step": 3695, + "token_acc": 0.28267057795403555 + }, + { + "epoch": 2.1665200820873642, + "grad_norm": 0.3977140577950616, + "learning_rate": 0.0002972186529489696, + "loss": 3.123647689819336, + "step": 3696, + "token_acc": 0.28472184559919217 + }, + { + "epoch": 2.1671064204045734, + "grad_norm": 0.33435729189688934, + "learning_rate": 0.000297215865604679, + "loss": 3.1324071884155273, + "step": 3697, + "token_acc": 0.28364051370177484 + }, + { + "epoch": 2.1676927587217825, + "grad_norm": 0.3519433101085958, + "learning_rate": 0.00029721307687749374, + "loss": 3.1004347801208496, + "step": 3698, + "token_acc": 0.2887127786727641 + }, + { + "epoch": 2.1682790970389916, + "grad_norm": 0.31005432810732253, + "learning_rate": 0.00029721028676744, + "loss": 3.0965301990509033, + "step": 3699, + "token_acc": 0.29104107676296315 + }, + { + "epoch": 2.1688654353562007, + "grad_norm": 0.37171401958548694, + "learning_rate": 0.000297207495274544, + "loss": 3.158102512359619, + "step": 3700, + "token_acc": 0.28180967567638804 + }, + { + "epoch": 2.1694517736734094, + "grad_norm": 0.38371287955406946, + "learning_rate": 0.0002972047023988319, + "loss": 3.093679428100586, + "step": 3701, + "token_acc": 0.29066419851491376 + }, + { + "epoch": 2.1700381119906185, + "grad_norm": 0.33021700239126456, + "learning_rate": 0.00029720190814032995, + "loss": 3.080838203430176, + "step": 3702, + "token_acc": 0.29000745712155107 + }, + { + "epoch": 2.1706244503078276, + "grad_norm": 0.3212098235047024, + "learning_rate": 0.0002971991124990645, + "loss": 3.1033544540405273, + "step": 3703, + "token_acc": 0.2890720926153306 + }, + { + "epoch": 2.1712107886250367, + "grad_norm": 0.37538442273914396, + "learning_rate": 0.0002971963154750617, + "loss": 3.1580235958099365, + "step": 3704, + "token_acc": 0.28001626477339 + }, + { + "epoch": 2.171797126942246, + "grad_norm": 0.40590662521371323, + "learning_rate": 0.0002971935170683479, + "loss": 3.148336887359619, + "step": 3705, + "token_acc": 0.2819193430493214 + }, + { + "epoch": 2.172383465259455, + "grad_norm": 0.33690911595167705, + "learning_rate": 0.0002971907172789493, + "loss": 3.1577134132385254, + "step": 3706, + "token_acc": 0.2818503103766038 + }, + { + "epoch": 2.1729698035766636, + "grad_norm": 0.3460556106496662, + "learning_rate": 0.0002971879161068923, + "loss": 3.108748435974121, + "step": 3707, + "token_acc": 0.2883463248083489 + }, + { + "epoch": 2.1735561418938727, + "grad_norm": 0.3864134249865432, + "learning_rate": 0.00029718511355220317, + "loss": 3.130164623260498, + "step": 3708, + "token_acc": 0.2834747295830476 + }, + { + "epoch": 2.174142480211082, + "grad_norm": 0.32881384185865764, + "learning_rate": 0.0002971823096149082, + "loss": 3.134215831756592, + "step": 3709, + "token_acc": 0.2856028797339242 + }, + { + "epoch": 2.174728818528291, + "grad_norm": 0.3351313090045613, + "learning_rate": 0.0002971795042950338, + "loss": 3.1081809997558594, + "step": 3710, + "token_acc": 0.2869916354196712 + }, + { + "epoch": 2.1753151568455, + "grad_norm": 0.3549544996527675, + "learning_rate": 0.00029717669759260625, + "loss": 3.1474461555480957, + "step": 3711, + "token_acc": 0.2831281243364676 + }, + { + "epoch": 2.1759014951627087, + "grad_norm": 0.35026042871518703, + "learning_rate": 0.00029717388950765197, + "loss": 3.177511215209961, + "step": 3712, + "token_acc": 0.2781876613673855 + }, + { + "epoch": 2.176487833479918, + "grad_norm": 0.2974110737850865, + "learning_rate": 0.0002971710800401973, + "loss": 3.1171035766601562, + "step": 3713, + "token_acc": 0.28907784707193074 + }, + { + "epoch": 2.177074171797127, + "grad_norm": 0.3738621478159767, + "learning_rate": 0.0002971682691902687, + "loss": 3.1230955123901367, + "step": 3714, + "token_acc": 0.2877539375745363 + }, + { + "epoch": 2.177660510114336, + "grad_norm": 0.3528096398357954, + "learning_rate": 0.00029716545695789243, + "loss": 3.124410629272461, + "step": 3715, + "token_acc": 0.2841609858682732 + }, + { + "epoch": 2.178246848431545, + "grad_norm": 0.33569778548569945, + "learning_rate": 0.00029716264334309506, + "loss": 3.1467747688293457, + "step": 3716, + "token_acc": 0.28410485850885125 + }, + { + "epoch": 2.1788331867487543, + "grad_norm": 0.38710390616394685, + "learning_rate": 0.00029715982834590296, + "loss": 3.132063865661621, + "step": 3717, + "token_acc": 0.28413639871708934 + }, + { + "epoch": 2.179419525065963, + "grad_norm": 0.3089674366816096, + "learning_rate": 0.00029715701196634256, + "loss": 3.0990710258483887, + "step": 3718, + "token_acc": 0.28976225630193236 + }, + { + "epoch": 2.180005863383172, + "grad_norm": 0.3150721522407789, + "learning_rate": 0.00029715419420444034, + "loss": 3.114352226257324, + "step": 3719, + "token_acc": 0.289362573652204 + }, + { + "epoch": 2.180592201700381, + "grad_norm": 0.34724289088430854, + "learning_rate": 0.00029715137506022274, + "loss": 3.130066394805908, + "step": 3720, + "token_acc": 0.2841681618624524 + }, + { + "epoch": 2.1811785400175903, + "grad_norm": 0.38270116759199097, + "learning_rate": 0.00029714855453371626, + "loss": 3.123567581176758, + "step": 3721, + "token_acc": 0.28573445350237364 + }, + { + "epoch": 2.1817648783347994, + "grad_norm": 0.3628778286849395, + "learning_rate": 0.00029714573262494743, + "loss": 3.157637357711792, + "step": 3722, + "token_acc": 0.28230468956058447 + }, + { + "epoch": 2.182351216652008, + "grad_norm": 0.3282955382901595, + "learning_rate": 0.0002971429093339427, + "loss": 3.196810245513916, + "step": 3723, + "token_acc": 0.2766472221389707 + }, + { + "epoch": 2.182937554969217, + "grad_norm": 0.31388366259867345, + "learning_rate": 0.0002971400846607286, + "loss": 3.1468496322631836, + "step": 3724, + "token_acc": 0.28312359756977706 + }, + { + "epoch": 2.1835238932864263, + "grad_norm": 0.3296758979553433, + "learning_rate": 0.0002971372586053317, + "loss": 3.12540602684021, + "step": 3725, + "token_acc": 0.28753877945785394 + }, + { + "epoch": 2.1841102316036354, + "grad_norm": 0.33613772749076, + "learning_rate": 0.0002971344311677785, + "loss": 3.158262252807617, + "step": 3726, + "token_acc": 0.2813853039676992 + }, + { + "epoch": 2.1846965699208445, + "grad_norm": 0.30841193010965795, + "learning_rate": 0.0002971316023480956, + "loss": 3.153303623199463, + "step": 3727, + "token_acc": 0.28097400329324884 + }, + { + "epoch": 2.185282908238053, + "grad_norm": 0.38920712240591765, + "learning_rate": 0.0002971287721463096, + "loss": 3.1404852867126465, + "step": 3728, + "token_acc": 0.2840865204062336 + }, + { + "epoch": 2.1858692465552623, + "grad_norm": 0.3116747108469472, + "learning_rate": 0.00029712594056244696, + "loss": 3.1646945476531982, + "step": 3729, + "token_acc": 0.2791000745444068 + }, + { + "epoch": 2.1864555848724714, + "grad_norm": 0.3579300577012172, + "learning_rate": 0.0002971231075965345, + "loss": 3.145068407058716, + "step": 3730, + "token_acc": 0.2828042334800524 + }, + { + "epoch": 2.1870419231896805, + "grad_norm": 0.31820979443196135, + "learning_rate": 0.00029712027324859855, + "loss": 3.1453914642333984, + "step": 3731, + "token_acc": 0.28512502870138534 + }, + { + "epoch": 2.1876282615068896, + "grad_norm": 0.3595393785855766, + "learning_rate": 0.00029711743751866594, + "loss": 3.1465158462524414, + "step": 3732, + "token_acc": 0.2823496005464187 + }, + { + "epoch": 2.1882145998240983, + "grad_norm": 0.30877011368973845, + "learning_rate": 0.0002971146004067632, + "loss": 3.129978656768799, + "step": 3733, + "token_acc": 0.2860482610666876 + }, + { + "epoch": 2.1888009381413074, + "grad_norm": 0.33223886640453354, + "learning_rate": 0.0002971117619129171, + "loss": 3.126527786254883, + "step": 3734, + "token_acc": 0.28645519237727657 + }, + { + "epoch": 2.1893872764585165, + "grad_norm": 0.3501422888772264, + "learning_rate": 0.00029710892203715423, + "loss": 3.1880886554718018, + "step": 3735, + "token_acc": 0.27684443280823495 + }, + { + "epoch": 2.1899736147757256, + "grad_norm": 0.3848066531155169, + "learning_rate": 0.0002971060807795012, + "loss": 3.159013271331787, + "step": 3736, + "token_acc": 0.2830311398762082 + }, + { + "epoch": 2.1905599530929347, + "grad_norm": 0.3941500679060397, + "learning_rate": 0.00029710323813998484, + "loss": 3.1417603492736816, + "step": 3737, + "token_acc": 0.2833038656584666 + }, + { + "epoch": 2.191146291410144, + "grad_norm": 0.4005480122039298, + "learning_rate": 0.00029710039411863173, + "loss": 3.1206064224243164, + "step": 3738, + "token_acc": 0.28731070052743846 + }, + { + "epoch": 2.1917326297273525, + "grad_norm": 0.35424637769856315, + "learning_rate": 0.00029709754871546864, + "loss": 3.186551332473755, + "step": 3739, + "token_acc": 0.2771676199164883 + }, + { + "epoch": 2.1923189680445616, + "grad_norm": 0.38891189503844975, + "learning_rate": 0.00029709470193052236, + "loss": 3.1638402938842773, + "step": 3740, + "token_acc": 0.2814092730961453 + }, + { + "epoch": 2.1929053063617707, + "grad_norm": 0.3017733795871617, + "learning_rate": 0.0002970918537638195, + "loss": 3.1486992835998535, + "step": 3741, + "token_acc": 0.2837440336712176 + }, + { + "epoch": 2.19349164467898, + "grad_norm": 0.34582891863757653, + "learning_rate": 0.00029708900421538694, + "loss": 3.141735553741455, + "step": 3742, + "token_acc": 0.2828959128523811 + }, + { + "epoch": 2.194077982996189, + "grad_norm": 0.3822139579348903, + "learning_rate": 0.0002970861532852513, + "loss": 3.1588408946990967, + "step": 3743, + "token_acc": 0.281739848305041 + }, + { + "epoch": 2.1946643213133976, + "grad_norm": 0.39104487183229963, + "learning_rate": 0.00029708330097343955, + "loss": 3.1367104053497314, + "step": 3744, + "token_acc": 0.28399802400729285 + }, + { + "epoch": 2.1952506596306067, + "grad_norm": 0.4008412132518758, + "learning_rate": 0.0002970804472799784, + "loss": 3.153278112411499, + "step": 3745, + "token_acc": 0.28172001340651054 + }, + { + "epoch": 2.195836997947816, + "grad_norm": 0.3837046494631987, + "learning_rate": 0.0002970775922048945, + "loss": 3.176415205001831, + "step": 3746, + "token_acc": 0.27976056482144684 + }, + { + "epoch": 2.196423336265025, + "grad_norm": 0.3167368828147193, + "learning_rate": 0.0002970747357482149, + "loss": 3.163569927215576, + "step": 3747, + "token_acc": 0.28121139452074706 + }, + { + "epoch": 2.197009674582234, + "grad_norm": 0.33338572636707026, + "learning_rate": 0.00029707187790996634, + "loss": 3.173992156982422, + "step": 3748, + "token_acc": 0.278925153425765 + }, + { + "epoch": 2.197596012899443, + "grad_norm": 0.33325046619417104, + "learning_rate": 0.0002970690186901757, + "loss": 3.1533126831054688, + "step": 3749, + "token_acc": 0.28241743548602755 + }, + { + "epoch": 2.198182351216652, + "grad_norm": 0.38337292217801866, + "learning_rate": 0.00029706615808886976, + "loss": 3.1665897369384766, + "step": 3750, + "token_acc": 0.2806721886204581 + }, + { + "epoch": 2.198768689533861, + "grad_norm": 0.4057576269254285, + "learning_rate": 0.00029706329610607545, + "loss": 3.1035351753234863, + "step": 3751, + "token_acc": 0.28971972532179735 + }, + { + "epoch": 2.19935502785107, + "grad_norm": 0.3286326306188834, + "learning_rate": 0.0002970604327418196, + "loss": 3.0740952491760254, + "step": 3752, + "token_acc": 0.29236206818718946 + }, + { + "epoch": 2.199941366168279, + "grad_norm": 0.4137232096991444, + "learning_rate": 0.00029705756799612923, + "loss": 3.168051242828369, + "step": 3753, + "token_acc": 0.2792734466324971 + }, + { + "epoch": 2.2005277044854883, + "grad_norm": 0.34514452079048275, + "learning_rate": 0.00029705470186903115, + "loss": 3.163088321685791, + "step": 3754, + "token_acc": 0.28117876248807927 + }, + { + "epoch": 2.201114042802697, + "grad_norm": 0.35387527189584134, + "learning_rate": 0.0002970518343605523, + "loss": 3.0887796878814697, + "step": 3755, + "token_acc": 0.29122049259795857 + }, + { + "epoch": 2.201700381119906, + "grad_norm": 0.3308009359952547, + "learning_rate": 0.0002970489654707196, + "loss": 3.1101760864257812, + "step": 3756, + "token_acc": 0.28565986061084 + }, + { + "epoch": 2.202286719437115, + "grad_norm": 0.38555611921801497, + "learning_rate": 0.00029704609519956, + "loss": 3.135230541229248, + "step": 3757, + "token_acc": 0.28450695623185235 + }, + { + "epoch": 2.2028730577543243, + "grad_norm": 0.3634144851836268, + "learning_rate": 0.00029704322354710057, + "loss": 3.1189966201782227, + "step": 3758, + "token_acc": 0.2863978334352279 + }, + { + "epoch": 2.2034593960715334, + "grad_norm": 0.3156287722560124, + "learning_rate": 0.00029704035051336816, + "loss": 3.1024208068847656, + "step": 3759, + "token_acc": 0.2903743518127591 + }, + { + "epoch": 2.2040457343887425, + "grad_norm": 0.3211094660933389, + "learning_rate": 0.00029703747609838977, + "loss": 3.126427412033081, + "step": 3760, + "token_acc": 0.28755061275171406 + }, + { + "epoch": 2.204632072705951, + "grad_norm": 0.3222561464711346, + "learning_rate": 0.0002970346003021924, + "loss": 3.099661350250244, + "step": 3761, + "token_acc": 0.2903913637459484 + }, + { + "epoch": 2.2052184110231603, + "grad_norm": 0.3678023808414285, + "learning_rate": 0.00029703172312480316, + "loss": 3.1137800216674805, + "step": 3762, + "token_acc": 0.2870190790378748 + }, + { + "epoch": 2.2058047493403694, + "grad_norm": 0.3787740942796549, + "learning_rate": 0.000297028844566249, + "loss": 3.1343183517456055, + "step": 3763, + "token_acc": 0.28731045490822027 + }, + { + "epoch": 2.2063910876575785, + "grad_norm": 0.34356797269212563, + "learning_rate": 0.00029702596462655693, + "loss": 3.1562204360961914, + "step": 3764, + "token_acc": 0.2820999463910506 + }, + { + "epoch": 2.2069774259747876, + "grad_norm": 0.356217598313379, + "learning_rate": 0.0002970230833057541, + "loss": 3.1309165954589844, + "step": 3765, + "token_acc": 0.2846722137717118 + }, + { + "epoch": 2.2075637642919963, + "grad_norm": 0.37165901593333567, + "learning_rate": 0.00029702020060386745, + "loss": 3.1627721786499023, + "step": 3766, + "token_acc": 0.2815375217079306 + }, + { + "epoch": 2.2081501026092054, + "grad_norm": 0.4192899048015179, + "learning_rate": 0.00029701731652092417, + "loss": 3.1466424465179443, + "step": 3767, + "token_acc": 0.28414412384737936 + }, + { + "epoch": 2.2087364409264145, + "grad_norm": 0.411215745526209, + "learning_rate": 0.00029701443105695127, + "loss": 3.1085567474365234, + "step": 3768, + "token_acc": 0.2883136988770717 + }, + { + "epoch": 2.2093227792436236, + "grad_norm": 0.319641749025169, + "learning_rate": 0.0002970115442119759, + "loss": 3.118044376373291, + "step": 3769, + "token_acc": 0.2889292667760777 + }, + { + "epoch": 2.2099091175608327, + "grad_norm": 0.3427863814687333, + "learning_rate": 0.00029700865598602524, + "loss": 3.1661291122436523, + "step": 3770, + "token_acc": 0.28011693457630843 + }, + { + "epoch": 2.210495455878042, + "grad_norm": 0.43331515616171407, + "learning_rate": 0.0002970057663791263, + "loss": 3.1126625537872314, + "step": 3771, + "token_acc": 0.28659843581516276 + }, + { + "epoch": 2.2110817941952505, + "grad_norm": 0.37408488330845346, + "learning_rate": 0.0002970028753913063, + "loss": 3.1556777954101562, + "step": 3772, + "token_acc": 0.2819533670570152 + }, + { + "epoch": 2.2116681325124596, + "grad_norm": 0.3325957105824189, + "learning_rate": 0.0002969999830225923, + "loss": 3.136723518371582, + "step": 3773, + "token_acc": 0.28456658061697887 + }, + { + "epoch": 2.2122544708296688, + "grad_norm": 0.34109516243354243, + "learning_rate": 0.00029699708927301163, + "loss": 3.1388840675354004, + "step": 3774, + "token_acc": 0.2860615560367158 + }, + { + "epoch": 2.212840809146878, + "grad_norm": 0.3702297725464668, + "learning_rate": 0.00029699419414259135, + "loss": 3.139317512512207, + "step": 3775, + "token_acc": 0.2832988599116062 + }, + { + "epoch": 2.213427147464087, + "grad_norm": 0.3508604838928925, + "learning_rate": 0.0002969912976313587, + "loss": 3.1461031436920166, + "step": 3776, + "token_acc": 0.2828179572365619 + }, + { + "epoch": 2.2140134857812956, + "grad_norm": 0.34468504295810776, + "learning_rate": 0.0002969883997393409, + "loss": 3.154512882232666, + "step": 3777, + "token_acc": 0.2808072670349814 + }, + { + "epoch": 2.2145998240985048, + "grad_norm": 0.352324993808345, + "learning_rate": 0.00029698550046656515, + "loss": 3.1246447563171387, + "step": 3778, + "token_acc": 0.2866449924106317 + }, + { + "epoch": 2.215186162415714, + "grad_norm": 0.3460664851058068, + "learning_rate": 0.0002969825998130587, + "loss": 3.1506452560424805, + "step": 3779, + "token_acc": 0.2831327539801774 + }, + { + "epoch": 2.215772500732923, + "grad_norm": 0.30803323480517325, + "learning_rate": 0.00029697969777884876, + "loss": 3.1673409938812256, + "step": 3780, + "token_acc": 0.27995166361150653 + }, + { + "epoch": 2.216358839050132, + "grad_norm": 0.32156367067864855, + "learning_rate": 0.00029697679436396264, + "loss": 3.164299964904785, + "step": 3781, + "token_acc": 0.27988870620968764 + }, + { + "epoch": 2.2169451773673408, + "grad_norm": 0.3179234427896199, + "learning_rate": 0.00029697388956842756, + "loss": 3.100285053253174, + "step": 3782, + "token_acc": 0.287210271802568 + }, + { + "epoch": 2.21753151568455, + "grad_norm": 0.32121304989483795, + "learning_rate": 0.00029697098339227085, + "loss": 3.1462411880493164, + "step": 3783, + "token_acc": 0.2836473048724547 + }, + { + "epoch": 2.218117854001759, + "grad_norm": 0.2971499383749068, + "learning_rate": 0.00029696807583551977, + "loss": 3.0867717266082764, + "step": 3784, + "token_acc": 0.2906005395420537 + }, + { + "epoch": 2.218704192318968, + "grad_norm": 0.30428698891306205, + "learning_rate": 0.0002969651668982017, + "loss": 3.116027355194092, + "step": 3785, + "token_acc": 0.2875592401926344 + }, + { + "epoch": 2.219290530636177, + "grad_norm": 0.3437803202371795, + "learning_rate": 0.0002969622565803439, + "loss": 3.1645591259002686, + "step": 3786, + "token_acc": 0.2798665201600026 + }, + { + "epoch": 2.219876868953386, + "grad_norm": 0.35301056642093914, + "learning_rate": 0.00029695934488197374, + "loss": 3.1671109199523926, + "step": 3787, + "token_acc": 0.2778507970332579 + }, + { + "epoch": 2.220463207270595, + "grad_norm": 0.3385506536449666, + "learning_rate": 0.0002969564318031186, + "loss": 3.124004364013672, + "step": 3788, + "token_acc": 0.28593985334016603 + }, + { + "epoch": 2.221049545587804, + "grad_norm": 0.35997547002333613, + "learning_rate": 0.0002969535173438058, + "loss": 3.0796189308166504, + "step": 3789, + "token_acc": 0.29089671957395025 + }, + { + "epoch": 2.221635883905013, + "grad_norm": 0.3529830675098698, + "learning_rate": 0.0002969506015040627, + "loss": 3.1322474479675293, + "step": 3790, + "token_acc": 0.28471278675898276 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.3258650669055917, + "learning_rate": 0.0002969476842839167, + "loss": 3.131068468093872, + "step": 3791, + "token_acc": 0.286188213054302 + }, + { + "epoch": 2.2228085605394314, + "grad_norm": 0.3515607586600924, + "learning_rate": 0.0002969447656833952, + "loss": 3.135874032974243, + "step": 3792, + "token_acc": 0.2841311746898877 + }, + { + "epoch": 2.22339489885664, + "grad_norm": 0.3731176472732044, + "learning_rate": 0.00029694184570252575, + "loss": 3.113550901412964, + "step": 3793, + "token_acc": 0.28656675777316537 + }, + { + "epoch": 2.223981237173849, + "grad_norm": 0.36658313486687794, + "learning_rate": 0.0002969389243413356, + "loss": 3.10867977142334, + "step": 3794, + "token_acc": 0.28809046104002667 + }, + { + "epoch": 2.2245675754910583, + "grad_norm": 0.33458899675408305, + "learning_rate": 0.0002969360015998522, + "loss": 3.146430492401123, + "step": 3795, + "token_acc": 0.2833598571594923 + }, + { + "epoch": 2.2251539138082674, + "grad_norm": 0.3363372338360508, + "learning_rate": 0.00029693307747810313, + "loss": 3.1588902473449707, + "step": 3796, + "token_acc": 0.2818576332664033 + }, + { + "epoch": 2.2257402521254765, + "grad_norm": 0.3807862388558553, + "learning_rate": 0.0002969301519761158, + "loss": 3.166027069091797, + "step": 3797, + "token_acc": 0.2793695011620186 + }, + { + "epoch": 2.226326590442685, + "grad_norm": 0.36408872665974407, + "learning_rate": 0.0002969272250939177, + "loss": 3.179474115371704, + "step": 3798, + "token_acc": 0.27683282595988795 + }, + { + "epoch": 2.2269129287598943, + "grad_norm": 0.35326380619169234, + "learning_rate": 0.00029692429683153624, + "loss": 3.157876968383789, + "step": 3799, + "token_acc": 0.28264938844128135 + }, + { + "epoch": 2.2274992670771034, + "grad_norm": 0.312563084899839, + "learning_rate": 0.000296921367188999, + "loss": 3.124584197998047, + "step": 3800, + "token_acc": 0.28520868113522535 + }, + { + "epoch": 2.2280856053943126, + "grad_norm": 0.368973298726103, + "learning_rate": 0.00029691843616633354, + "loss": 3.0984749794006348, + "step": 3801, + "token_acc": 0.28978142728862916 + }, + { + "epoch": 2.2286719437115217, + "grad_norm": 0.33291453259596576, + "learning_rate": 0.00029691550376356735, + "loss": 3.155500650405884, + "step": 3802, + "token_acc": 0.2828012883392936 + }, + { + "epoch": 2.2292582820287308, + "grad_norm": 0.31029813734226924, + "learning_rate": 0.00029691256998072796, + "loss": 3.1208577156066895, + "step": 3803, + "token_acc": 0.2851888616059767 + }, + { + "epoch": 2.2298446203459394, + "grad_norm": 0.35265120750819473, + "learning_rate": 0.0002969096348178429, + "loss": 3.13283109664917, + "step": 3804, + "token_acc": 0.28393237962869416 + }, + { + "epoch": 2.2304309586631486, + "grad_norm": 0.32799916943871077, + "learning_rate": 0.0002969066982749398, + "loss": 3.1414284706115723, + "step": 3805, + "token_acc": 0.2848699324198741 + }, + { + "epoch": 2.2310172969803577, + "grad_norm": 0.3497467205532327, + "learning_rate": 0.00029690376035204624, + "loss": 3.1115567684173584, + "step": 3806, + "token_acc": 0.28824159682482564 + }, + { + "epoch": 2.231603635297567, + "grad_norm": 0.41641875697546643, + "learning_rate": 0.00029690082104918985, + "loss": 3.1981654167175293, + "step": 3807, + "token_acc": 0.27552068616758607 + }, + { + "epoch": 2.232189973614776, + "grad_norm": 0.38435080507944286, + "learning_rate": 0.0002968978803663981, + "loss": 3.1274185180664062, + "step": 3808, + "token_acc": 0.28408991839290293 + }, + { + "epoch": 2.2327763119319846, + "grad_norm": 0.39330360153440047, + "learning_rate": 0.0002968949383036988, + "loss": 3.135956287384033, + "step": 3809, + "token_acc": 0.2835522588224286 + }, + { + "epoch": 2.2333626502491937, + "grad_norm": 0.3521579260985205, + "learning_rate": 0.0002968919948611195, + "loss": 3.1432793140411377, + "step": 3810, + "token_acc": 0.28275794996346354 + }, + { + "epoch": 2.233948988566403, + "grad_norm": 0.36667814471807175, + "learning_rate": 0.0002968890500386878, + "loss": 3.1187589168548584, + "step": 3811, + "token_acc": 0.28658694876204693 + }, + { + "epoch": 2.234535326883612, + "grad_norm": 0.3767321266451095, + "learning_rate": 0.0002968861038364315, + "loss": 3.109773635864258, + "step": 3812, + "token_acc": 0.28770082354529497 + }, + { + "epoch": 2.235121665200821, + "grad_norm": 0.3500032225913167, + "learning_rate": 0.0002968831562543781, + "loss": 3.176600217819214, + "step": 3813, + "token_acc": 0.2793028692661023 + }, + { + "epoch": 2.23570800351803, + "grad_norm": 0.3795521237061469, + "learning_rate": 0.00029688020729255537, + "loss": 3.1243607997894287, + "step": 3814, + "token_acc": 0.2853834461207068 + }, + { + "epoch": 2.236294341835239, + "grad_norm": 0.3417693445955888, + "learning_rate": 0.00029687725695099105, + "loss": 3.1730802059173584, + "step": 3815, + "token_acc": 0.2785408819281203 + }, + { + "epoch": 2.236880680152448, + "grad_norm": 0.34060567431239436, + "learning_rate": 0.0002968743052297128, + "loss": 3.100273609161377, + "step": 3816, + "token_acc": 0.28810480064097005 + }, + { + "epoch": 2.237467018469657, + "grad_norm": 0.33395715237760143, + "learning_rate": 0.00029687135212874834, + "loss": 3.0997109413146973, + "step": 3817, + "token_acc": 0.29117475916467317 + }, + { + "epoch": 2.238053356786866, + "grad_norm": 0.29554968053902053, + "learning_rate": 0.00029686839764812546, + "loss": 3.093292713165283, + "step": 3818, + "token_acc": 0.2920530448833404 + }, + { + "epoch": 2.2386396951040752, + "grad_norm": 0.29924613908688724, + "learning_rate": 0.00029686544178787196, + "loss": 3.110499382019043, + "step": 3819, + "token_acc": 0.28761358577526386 + }, + { + "epoch": 2.239226033421284, + "grad_norm": 0.33545279414875173, + "learning_rate": 0.00029686248454801543, + "loss": 3.107767105102539, + "step": 3820, + "token_acc": 0.28697280986337453 + }, + { + "epoch": 2.239812371738493, + "grad_norm": 0.3642427925226817, + "learning_rate": 0.00029685952592858384, + "loss": 3.127642869949341, + "step": 3821, + "token_acc": 0.2865531023003976 + }, + { + "epoch": 2.240398710055702, + "grad_norm": 0.3660368473803843, + "learning_rate": 0.00029685656592960485, + "loss": 3.144003391265869, + "step": 3822, + "token_acc": 0.2835657027963703 + }, + { + "epoch": 2.2409850483729112, + "grad_norm": 0.32814374538190083, + "learning_rate": 0.00029685360455110636, + "loss": 3.147284507751465, + "step": 3823, + "token_acc": 0.28307908411079147 + }, + { + "epoch": 2.2415713866901203, + "grad_norm": 0.30730484711641337, + "learning_rate": 0.0002968506417931161, + "loss": 3.1099624633789062, + "step": 3824, + "token_acc": 0.2894372652276281 + }, + { + "epoch": 2.2421577250073295, + "grad_norm": 0.3298981889905991, + "learning_rate": 0.00029684767765566193, + "loss": 3.1177191734313965, + "step": 3825, + "token_acc": 0.2851696311366097 + }, + { + "epoch": 2.242744063324538, + "grad_norm": 0.33551707266600006, + "learning_rate": 0.00029684471213877177, + "loss": 3.159877300262451, + "step": 3826, + "token_acc": 0.28150116723345475 + }, + { + "epoch": 2.2433304016417472, + "grad_norm": 0.37770551659273416, + "learning_rate": 0.0002968417452424734, + "loss": 3.1461005210876465, + "step": 3827, + "token_acc": 0.28053293856402667 + }, + { + "epoch": 2.2439167399589564, + "grad_norm": 0.40656106231741346, + "learning_rate": 0.0002968387769667947, + "loss": 3.159878730773926, + "step": 3828, + "token_acc": 0.2809157554844164 + }, + { + "epoch": 2.2445030782761655, + "grad_norm": 0.37258371286941255, + "learning_rate": 0.0002968358073117635, + "loss": 3.086667537689209, + "step": 3829, + "token_acc": 0.28974647311388263 + }, + { + "epoch": 2.2450894165933746, + "grad_norm": 0.3765656311318651, + "learning_rate": 0.0002968328362774078, + "loss": 3.1338484287261963, + "step": 3830, + "token_acc": 0.2845573751445667 + }, + { + "epoch": 2.2456757549105832, + "grad_norm": 0.36054739497645144, + "learning_rate": 0.0002968298638637555, + "loss": 3.1424970626831055, + "step": 3831, + "token_acc": 0.2831309891076901 + }, + { + "epoch": 2.2462620932277924, + "grad_norm": 0.3563008399280607, + "learning_rate": 0.00029682689007083445, + "loss": 3.1569464206695557, + "step": 3832, + "token_acc": 0.2807823799295909 + }, + { + "epoch": 2.2468484315450015, + "grad_norm": 0.34154786239696044, + "learning_rate": 0.0002968239148986726, + "loss": 3.126563549041748, + "step": 3833, + "token_acc": 0.28375958948828855 + }, + { + "epoch": 2.2474347698622106, + "grad_norm": 0.32308367313291103, + "learning_rate": 0.0002968209383472979, + "loss": 3.1239233016967773, + "step": 3834, + "token_acc": 0.284844520350946 + }, + { + "epoch": 2.2480211081794197, + "grad_norm": 0.3043028392150398, + "learning_rate": 0.00029681796041673844, + "loss": 3.1476945877075195, + "step": 3835, + "token_acc": 0.28281338113268345 + }, + { + "epoch": 2.2486074464966284, + "grad_norm": 0.3037170146744987, + "learning_rate": 0.00029681498110702197, + "loss": 3.10396146774292, + "step": 3836, + "token_acc": 0.28775192098316743 + }, + { + "epoch": 2.2491937848138375, + "grad_norm": 0.3233377813541696, + "learning_rate": 0.00029681200041817665, + "loss": 3.1522209644317627, + "step": 3837, + "token_acc": 0.2827375109779408 + }, + { + "epoch": 2.2497801231310466, + "grad_norm": 0.3346904642707688, + "learning_rate": 0.0002968090183502304, + "loss": 3.1349873542785645, + "step": 3838, + "token_acc": 0.28562865238413376 + }, + { + "epoch": 2.2503664614482557, + "grad_norm": 0.3604269112568013, + "learning_rate": 0.0002968060349032112, + "loss": 3.1142184734344482, + "step": 3839, + "token_acc": 0.2879528040387433 + }, + { + "epoch": 2.250952799765465, + "grad_norm": 0.41697501414188737, + "learning_rate": 0.0002968030500771472, + "loss": 3.1308512687683105, + "step": 3840, + "token_acc": 0.28416912242767856 + }, + { + "epoch": 2.2515391380826735, + "grad_norm": 0.3775215996045852, + "learning_rate": 0.0002968000638720663, + "loss": 3.114043712615967, + "step": 3841, + "token_acc": 0.28654369301498 + }, + { + "epoch": 2.2521254763998826, + "grad_norm": 0.3001607838943336, + "learning_rate": 0.0002967970762879966, + "loss": 3.1494038105010986, + "step": 3842, + "token_acc": 0.28356555719559096 + }, + { + "epoch": 2.2527118147170917, + "grad_norm": 0.37390369884088986, + "learning_rate": 0.0002967940873249663, + "loss": 3.146125316619873, + "step": 3843, + "token_acc": 0.2838406153253715 + }, + { + "epoch": 2.253298153034301, + "grad_norm": 0.3945815798769358, + "learning_rate": 0.00029679109698300325, + "loss": 3.1140263080596924, + "step": 3844, + "token_acc": 0.28696604034677364 + }, + { + "epoch": 2.25388449135151, + "grad_norm": 0.3398301393583601, + "learning_rate": 0.0002967881052621357, + "loss": 3.1278798580169678, + "step": 3845, + "token_acc": 0.2845089357549584 + }, + { + "epoch": 2.254470829668719, + "grad_norm": 0.33682761351978074, + "learning_rate": 0.00029678511216239166, + "loss": 3.144913673400879, + "step": 3846, + "token_acc": 0.2820162649954136 + }, + { + "epoch": 2.2550571679859277, + "grad_norm": 0.361349656022621, + "learning_rate": 0.00029678211768379933, + "loss": 3.1437714099884033, + "step": 3847, + "token_acc": 0.284574283188829 + }, + { + "epoch": 2.255643506303137, + "grad_norm": 0.32778601007311964, + "learning_rate": 0.00029677912182638676, + "loss": 3.1215953826904297, + "step": 3848, + "token_acc": 0.28636693897577103 + }, + { + "epoch": 2.256229844620346, + "grad_norm": 0.3364444704217451, + "learning_rate": 0.00029677612459018214, + "loss": 3.0988543033599854, + "step": 3849, + "token_acc": 0.2882610877776897 + }, + { + "epoch": 2.256816182937555, + "grad_norm": 0.3923369037646225, + "learning_rate": 0.0002967731259752136, + "loss": 3.161343574523926, + "step": 3850, + "token_acc": 0.2792298287241869 + }, + { + "epoch": 2.257402521254764, + "grad_norm": 0.3555615523116818, + "learning_rate": 0.00029677012598150937, + "loss": 3.1440279483795166, + "step": 3851, + "token_acc": 0.28371905103937806 + }, + { + "epoch": 2.257988859571973, + "grad_norm": 0.37126208302062824, + "learning_rate": 0.00029676712460909754, + "loss": 3.102146625518799, + "step": 3852, + "token_acc": 0.2896049057884909 + }, + { + "epoch": 2.258575197889182, + "grad_norm": 0.3266113859872727, + "learning_rate": 0.00029676412185800636, + "loss": 3.148850440979004, + "step": 3853, + "token_acc": 0.282865212255426 + }, + { + "epoch": 2.259161536206391, + "grad_norm": 0.3781781504771243, + "learning_rate": 0.00029676111772826403, + "loss": 3.1555047035217285, + "step": 3854, + "token_acc": 0.2828065203477729 + }, + { + "epoch": 2.2597478745236, + "grad_norm": 0.39110195545602594, + "learning_rate": 0.00029675811221989873, + "loss": 3.1286869049072266, + "step": 3855, + "token_acc": 0.28481809380685785 + }, + { + "epoch": 2.2603342128408093, + "grad_norm": 0.34498430255440204, + "learning_rate": 0.0002967551053329387, + "loss": 3.108625888824463, + "step": 3856, + "token_acc": 0.287273098651358 + }, + { + "epoch": 2.2609205511580184, + "grad_norm": 0.3265490740514863, + "learning_rate": 0.0002967520970674123, + "loss": 3.145751953125, + "step": 3857, + "token_acc": 0.28336042827262 + }, + { + "epoch": 2.261506889475227, + "grad_norm": 0.37687619857249566, + "learning_rate": 0.0002967490874233476, + "loss": 3.153994083404541, + "step": 3858, + "token_acc": 0.2805961651345176 + }, + { + "epoch": 2.262093227792436, + "grad_norm": 0.3659101633880088, + "learning_rate": 0.00029674607640077305, + "loss": 3.1383116245269775, + "step": 3859, + "token_acc": 0.2840793496291625 + }, + { + "epoch": 2.2626795661096453, + "grad_norm": 0.3649728108488499, + "learning_rate": 0.0002967430639997168, + "loss": 3.1229546070098877, + "step": 3860, + "token_acc": 0.2862786525052479 + }, + { + "epoch": 2.2632659044268544, + "grad_norm": 0.40516352434086467, + "learning_rate": 0.00029674005022020726, + "loss": 3.1451821327209473, + "step": 3861, + "token_acc": 0.2831056383668179 + }, + { + "epoch": 2.2638522427440635, + "grad_norm": 0.340402796275945, + "learning_rate": 0.0002967370350622727, + "loss": 3.109259843826294, + "step": 3862, + "token_acc": 0.2878759043211888 + }, + { + "epoch": 2.264438581061272, + "grad_norm": 0.33687622846561444, + "learning_rate": 0.00029673401852594136, + "loss": 3.0722784996032715, + "step": 3863, + "token_acc": 0.2931314709797911 + }, + { + "epoch": 2.2650249193784813, + "grad_norm": 0.3838536611274872, + "learning_rate": 0.00029673100061124164, + "loss": 3.1475141048431396, + "step": 3864, + "token_acc": 0.28426435692413715 + }, + { + "epoch": 2.2656112576956904, + "grad_norm": 0.3338870541478003, + "learning_rate": 0.0002967279813182019, + "loss": 3.1759462356567383, + "step": 3865, + "token_acc": 0.27812563496561826 + }, + { + "epoch": 2.2661975960128995, + "grad_norm": 0.35767772301268946, + "learning_rate": 0.0002967249606468505, + "loss": 3.1165928840637207, + "step": 3866, + "token_acc": 0.2856306857477257 + }, + { + "epoch": 2.2667839343301086, + "grad_norm": 0.3335793755320412, + "learning_rate": 0.0002967219385972158, + "loss": 3.124889850616455, + "step": 3867, + "token_acc": 0.2839560058161089 + }, + { + "epoch": 2.2673702726473177, + "grad_norm": 0.3425308685522111, + "learning_rate": 0.00029671891516932624, + "loss": 3.0921688079833984, + "step": 3868, + "token_acc": 0.2904580832364372 + }, + { + "epoch": 2.2679566109645264, + "grad_norm": 0.3734309048171176, + "learning_rate": 0.00029671589036321016, + "loss": 3.1452436447143555, + "step": 3869, + "token_acc": 0.2828461341672888 + }, + { + "epoch": 2.2685429492817355, + "grad_norm": 0.30028012289456135, + "learning_rate": 0.00029671286417889595, + "loss": 3.118638277053833, + "step": 3870, + "token_acc": 0.2879698124290348 + }, + { + "epoch": 2.2691292875989446, + "grad_norm": 0.34802881387688106, + "learning_rate": 0.00029670983661641214, + "loss": 3.1682145595550537, + "step": 3871, + "token_acc": 0.28056332871302664 + }, + { + "epoch": 2.2697156259161537, + "grad_norm": 0.3757465966308485, + "learning_rate": 0.0002967068076757871, + "loss": 3.088167428970337, + "step": 3872, + "token_acc": 0.2922698606802053 + }, + { + "epoch": 2.270301964233363, + "grad_norm": 0.33390297229917815, + "learning_rate": 0.0002967037773570492, + "loss": 3.1354007720947266, + "step": 3873, + "token_acc": 0.2854307371267025 + }, + { + "epoch": 2.2708883025505715, + "grad_norm": 0.31112408057579755, + "learning_rate": 0.0002967007456602271, + "loss": 3.1719579696655273, + "step": 3874, + "token_acc": 0.2808750491119551 + }, + { + "epoch": 2.2714746408677806, + "grad_norm": 0.31376901647324984, + "learning_rate": 0.0002966977125853492, + "loss": 3.1327662467956543, + "step": 3875, + "token_acc": 0.2848221258580295 + }, + { + "epoch": 2.2720609791849897, + "grad_norm": 0.36455449106456456, + "learning_rate": 0.0002966946781324439, + "loss": 3.098599910736084, + "step": 3876, + "token_acc": 0.2903051358958734 + }, + { + "epoch": 2.272647317502199, + "grad_norm": 0.3309040090128516, + "learning_rate": 0.0002966916423015398, + "loss": 3.148508071899414, + "step": 3877, + "token_acc": 0.2819160960906339 + }, + { + "epoch": 2.273233655819408, + "grad_norm": 0.3214519503023754, + "learning_rate": 0.00029668860509266535, + "loss": 3.167297840118408, + "step": 3878, + "token_acc": 0.2805810693347158 + }, + { + "epoch": 2.273819994136617, + "grad_norm": 0.3503842681760275, + "learning_rate": 0.00029668556650584916, + "loss": 3.1011996269226074, + "step": 3879, + "token_acc": 0.2896082449085622 + }, + { + "epoch": 2.2744063324538257, + "grad_norm": 0.3299744501590456, + "learning_rate": 0.00029668252654111967, + "loss": 3.1620965003967285, + "step": 3880, + "token_acc": 0.2817415831916971 + }, + { + "epoch": 2.274992670771035, + "grad_norm": 0.3519721356743776, + "learning_rate": 0.00029667948519850556, + "loss": 3.1272995471954346, + "step": 3881, + "token_acc": 0.28598799043305684 + }, + { + "epoch": 2.275579009088244, + "grad_norm": 0.3348353766366132, + "learning_rate": 0.00029667644247803534, + "loss": 3.106405258178711, + "step": 3882, + "token_acc": 0.2869021038900759 + }, + { + "epoch": 2.276165347405453, + "grad_norm": 0.3156452423075233, + "learning_rate": 0.0002966733983797376, + "loss": 3.13942813873291, + "step": 3883, + "token_acc": 0.2834587058513514 + }, + { + "epoch": 2.2767516857226617, + "grad_norm": 0.3529102264331451, + "learning_rate": 0.0002966703529036409, + "loss": 3.1225242614746094, + "step": 3884, + "token_acc": 0.2842691930363516 + }, + { + "epoch": 2.277338024039871, + "grad_norm": 0.33722665754907466, + "learning_rate": 0.0002966673060497739, + "loss": 3.1357922554016113, + "step": 3885, + "token_acc": 0.28500778216317574 + }, + { + "epoch": 2.27792436235708, + "grad_norm": 0.3475788528720892, + "learning_rate": 0.0002966642578181652, + "loss": 3.1662609577178955, + "step": 3886, + "token_acc": 0.2800709905566666 + }, + { + "epoch": 2.278510700674289, + "grad_norm": 0.35692902207972327, + "learning_rate": 0.0002966612082088434, + "loss": 3.1336910724639893, + "step": 3887, + "token_acc": 0.2837569968674575 + }, + { + "epoch": 2.279097038991498, + "grad_norm": 0.3301430970741716, + "learning_rate": 0.00029665815722183716, + "loss": 3.1702640056610107, + "step": 3888, + "token_acc": 0.2815069844366915 + }, + { + "epoch": 2.2796833773087073, + "grad_norm": 0.3613877821275972, + "learning_rate": 0.0002966551048571752, + "loss": 3.167794704437256, + "step": 3889, + "token_acc": 0.2811859742113042 + }, + { + "epoch": 2.280269715625916, + "grad_norm": 0.3299456115221439, + "learning_rate": 0.00029665205111488615, + "loss": 3.1297707557678223, + "step": 3890, + "token_acc": 0.2853902608939845 + }, + { + "epoch": 2.280856053943125, + "grad_norm": 0.3398061648929076, + "learning_rate": 0.00029664899599499866, + "loss": 3.0753061771392822, + "step": 3891, + "token_acc": 0.29305059538864664 + }, + { + "epoch": 2.281442392260334, + "grad_norm": 0.3752650198139334, + "learning_rate": 0.00029664593949754145, + "loss": 3.1909584999084473, + "step": 3892, + "token_acc": 0.2783515901910154 + }, + { + "epoch": 2.2820287305775433, + "grad_norm": 0.35936101703074697, + "learning_rate": 0.0002966428816225433, + "loss": 3.155496597290039, + "step": 3893, + "token_acc": 0.28021690718803666 + }, + { + "epoch": 2.2826150688947524, + "grad_norm": 0.3655586993002499, + "learning_rate": 0.0002966398223700329, + "loss": 3.1330766677856445, + "step": 3894, + "token_acc": 0.28580541343672644 + }, + { + "epoch": 2.283201407211961, + "grad_norm": 0.34027944033069607, + "learning_rate": 0.0002966367617400389, + "loss": 3.124612331390381, + "step": 3895, + "token_acc": 0.2859967946105995 + }, + { + "epoch": 2.28378774552917, + "grad_norm": 0.3706794538789568, + "learning_rate": 0.00029663369973259015, + "loss": 3.130789279937744, + "step": 3896, + "token_acc": 0.2844121418104852 + }, + { + "epoch": 2.2843740838463793, + "grad_norm": 0.34332320720651394, + "learning_rate": 0.0002966306363477154, + "loss": 3.1528263092041016, + "step": 3897, + "token_acc": 0.2824466217550635 + }, + { + "epoch": 2.2849604221635884, + "grad_norm": 0.302780726186809, + "learning_rate": 0.0002966275715854434, + "loss": 3.1284918785095215, + "step": 3898, + "token_acc": 0.28730334141664055 + }, + { + "epoch": 2.2855467604807975, + "grad_norm": 0.31777351851650415, + "learning_rate": 0.00029662450544580294, + "loss": 3.163057804107666, + "step": 3899, + "token_acc": 0.2820830229565924 + }, + { + "epoch": 2.2861330987980066, + "grad_norm": 0.33607005571603743, + "learning_rate": 0.0002966214379288228, + "loss": 3.143604278564453, + "step": 3900, + "token_acc": 0.28307982586514036 + }, + { + "epoch": 2.2867194371152153, + "grad_norm": 0.4175559245841742, + "learning_rate": 0.00029661836903453184, + "loss": 3.1550049781799316, + "step": 3901, + "token_acc": 0.281235959863931 + }, + { + "epoch": 2.2873057754324244, + "grad_norm": 0.3748280162975092, + "learning_rate": 0.00029661529876295894, + "loss": 3.141681671142578, + "step": 3902, + "token_acc": 0.2826647964802507 + }, + { + "epoch": 2.2878921137496335, + "grad_norm": 0.36143263743728443, + "learning_rate": 0.0002966122271141328, + "loss": 3.121706247329712, + "step": 3903, + "token_acc": 0.2855800214822771 + }, + { + "epoch": 2.2884784520668426, + "grad_norm": 0.38173752096554653, + "learning_rate": 0.0002966091540880824, + "loss": 3.09523344039917, + "step": 3904, + "token_acc": 0.29140673797881544 + }, + { + "epoch": 2.2890647903840518, + "grad_norm": 0.3375537807760514, + "learning_rate": 0.0002966060796848365, + "loss": 3.1792566776275635, + "step": 3905, + "token_acc": 0.2796097274694641 + }, + { + "epoch": 2.2896511287012604, + "grad_norm": 0.32067076985580467, + "learning_rate": 0.0002966030039044241, + "loss": 3.0969648361206055, + "step": 3906, + "token_acc": 0.2886835199081478 + }, + { + "epoch": 2.2902374670184695, + "grad_norm": 0.3334424061310324, + "learning_rate": 0.000296599926746874, + "loss": 3.123234272003174, + "step": 3907, + "token_acc": 0.284843666512652 + }, + { + "epoch": 2.2908238053356786, + "grad_norm": 0.3099521682328931, + "learning_rate": 0.00029659684821221514, + "loss": 3.116097927093506, + "step": 3908, + "token_acc": 0.28575720388824105 + }, + { + "epoch": 2.2914101436528878, + "grad_norm": 0.32783597070791204, + "learning_rate": 0.0002965937683004764, + "loss": 3.091261148452759, + "step": 3909, + "token_acc": 0.290263353115727 + }, + { + "epoch": 2.291996481970097, + "grad_norm": 0.32131269174333393, + "learning_rate": 0.00029659068701168675, + "loss": 3.1498451232910156, + "step": 3910, + "token_acc": 0.281163006495515 + }, + { + "epoch": 2.292582820287306, + "grad_norm": 0.3422824619005444, + "learning_rate": 0.00029658760434587517, + "loss": 3.135819911956787, + "step": 3911, + "token_acc": 0.2830876855031873 + }, + { + "epoch": 2.2931691586045146, + "grad_norm": 0.3769530906535803, + "learning_rate": 0.00029658452030307056, + "loss": 3.1156129837036133, + "step": 3912, + "token_acc": 0.2854136102441734 + }, + { + "epoch": 2.2937554969217238, + "grad_norm": 0.3468570513356614, + "learning_rate": 0.00029658143488330187, + "loss": 3.123091220855713, + "step": 3913, + "token_acc": 0.285465907032207 + }, + { + "epoch": 2.294341835238933, + "grad_norm": 0.3758397522396083, + "learning_rate": 0.00029657834808659815, + "loss": 3.1301941871643066, + "step": 3914, + "token_acc": 0.28541657475196003 + }, + { + "epoch": 2.294928173556142, + "grad_norm": 0.3955409525476425, + "learning_rate": 0.00029657525991298836, + "loss": 3.133077621459961, + "step": 3915, + "token_acc": 0.2851247563097777 + }, + { + "epoch": 2.295514511873351, + "grad_norm": 0.3792246447401132, + "learning_rate": 0.00029657217036250155, + "loss": 3.1052136421203613, + "step": 3916, + "token_acc": 0.287138235993905 + }, + { + "epoch": 2.2961008501905598, + "grad_norm": 0.3535137053883964, + "learning_rate": 0.00029656907943516667, + "loss": 3.1539230346679688, + "step": 3917, + "token_acc": 0.28208302471478414 + }, + { + "epoch": 2.296687188507769, + "grad_norm": 0.40949508696268716, + "learning_rate": 0.00029656598713101277, + "loss": 3.167966365814209, + "step": 3918, + "token_acc": 0.27944552695624636 + }, + { + "epoch": 2.297273526824978, + "grad_norm": 0.3689495820905759, + "learning_rate": 0.00029656289345006897, + "loss": 3.149592399597168, + "step": 3919, + "token_acc": 0.2803497498706227 + }, + { + "epoch": 2.297859865142187, + "grad_norm": 0.35365823333052937, + "learning_rate": 0.0002965597983923642, + "loss": 3.1518445014953613, + "step": 3920, + "token_acc": 0.2823548342604938 + }, + { + "epoch": 2.298446203459396, + "grad_norm": 0.37423928834599296, + "learning_rate": 0.0002965567019579277, + "loss": 3.152134895324707, + "step": 3921, + "token_acc": 0.28398140054134324 + }, + { + "epoch": 2.2990325417766053, + "grad_norm": 0.3618158156999347, + "learning_rate": 0.0002965536041467885, + "loss": 3.1430611610412598, + "step": 3922, + "token_acc": 0.2829186828244765 + }, + { + "epoch": 2.299618880093814, + "grad_norm": 0.2717137584957784, + "learning_rate": 0.00029655050495897565, + "loss": 3.1336140632629395, + "step": 3923, + "token_acc": 0.2853663197319643 + }, + { + "epoch": 2.300205218411023, + "grad_norm": 0.326539145877011, + "learning_rate": 0.00029654740439451823, + "loss": 3.1463160514831543, + "step": 3924, + "token_acc": 0.28246615247454954 + }, + { + "epoch": 2.300791556728232, + "grad_norm": 0.32903399491159996, + "learning_rate": 0.0002965443024534454, + "loss": 3.0967490673065186, + "step": 3925, + "token_acc": 0.2888586107588872 + }, + { + "epoch": 2.3013778950454413, + "grad_norm": 0.3218240281904549, + "learning_rate": 0.0002965411991357864, + "loss": 3.1402153968811035, + "step": 3926, + "token_acc": 0.28366721931129274 + }, + { + "epoch": 2.3019642333626504, + "grad_norm": 0.3337488523605423, + "learning_rate": 0.0002965380944415703, + "loss": 3.1310198307037354, + "step": 3927, + "token_acc": 0.28258745979430566 + }, + { + "epoch": 2.302550571679859, + "grad_norm": 0.3739003140582341, + "learning_rate": 0.00029653498837082625, + "loss": 3.0922772884368896, + "step": 3928, + "token_acc": 0.2896576442633179 + }, + { + "epoch": 2.303136909997068, + "grad_norm": 0.349797427692297, + "learning_rate": 0.0002965318809235834, + "loss": 3.1183156967163086, + "step": 3929, + "token_acc": 0.28653450891262366 + }, + { + "epoch": 2.3037232483142773, + "grad_norm": 0.3330890973584048, + "learning_rate": 0.000296528772099871, + "loss": 3.1265792846679688, + "step": 3930, + "token_acc": 0.28602233743454825 + }, + { + "epoch": 2.3043095866314864, + "grad_norm": 0.31747836969058285, + "learning_rate": 0.00029652566189971826, + "loss": 3.0859501361846924, + "step": 3931, + "token_acc": 0.29039291460535616 + }, + { + "epoch": 2.3048959249486956, + "grad_norm": 0.31927717232309866, + "learning_rate": 0.00029652255032315436, + "loss": 3.08859920501709, + "step": 3932, + "token_acc": 0.2898383822103449 + }, + { + "epoch": 2.3054822632659047, + "grad_norm": 0.30303922238932196, + "learning_rate": 0.00029651943737020854, + "loss": 3.1357712745666504, + "step": 3933, + "token_acc": 0.2841259175023261 + }, + { + "epoch": 2.3060686015831133, + "grad_norm": 0.31209786074099005, + "learning_rate": 0.00029651632304091, + "loss": 3.1018872261047363, + "step": 3934, + "token_acc": 0.28658283527051354 + }, + { + "epoch": 2.3066549399003224, + "grad_norm": 0.3140674826141942, + "learning_rate": 0.00029651320733528814, + "loss": 3.146984577178955, + "step": 3935, + "token_acc": 0.2828596480305483 + }, + { + "epoch": 2.3072412782175316, + "grad_norm": 0.31580586498134955, + "learning_rate": 0.00029651009025337204, + "loss": 3.102692127227783, + "step": 3936, + "token_acc": 0.2889240727956699 + }, + { + "epoch": 2.3078276165347407, + "grad_norm": 0.3893506483722437, + "learning_rate": 0.0002965069717951911, + "loss": 3.1709413528442383, + "step": 3937, + "token_acc": 0.279024983008259 + }, + { + "epoch": 2.3084139548519493, + "grad_norm": 0.4131410864702742, + "learning_rate": 0.0002965038519607746, + "loss": 3.1448557376861572, + "step": 3938, + "token_acc": 0.2823305163459752 + }, + { + "epoch": 2.3090002931691584, + "grad_norm": 0.4184412134362271, + "learning_rate": 0.0002965007307501518, + "loss": 3.135000467300415, + "step": 3939, + "token_acc": 0.2847566284131381 + }, + { + "epoch": 2.3095866314863676, + "grad_norm": 0.3527129246227712, + "learning_rate": 0.00029649760816335204, + "loss": 3.1537468433380127, + "step": 3940, + "token_acc": 0.28108774848684775 + }, + { + "epoch": 2.3101729698035767, + "grad_norm": 0.4130054447942725, + "learning_rate": 0.0002964944842004047, + "loss": 3.111358642578125, + "step": 3941, + "token_acc": 0.28666590099919603 + }, + { + "epoch": 2.310759308120786, + "grad_norm": 0.3351690768305104, + "learning_rate": 0.000296491358861339, + "loss": 3.093397855758667, + "step": 3942, + "token_acc": 0.2904525542885551 + }, + { + "epoch": 2.311345646437995, + "grad_norm": 0.3224020722044585, + "learning_rate": 0.0002964882321461845, + "loss": 3.134650230407715, + "step": 3943, + "token_acc": 0.2862757495465615 + }, + { + "epoch": 2.3119319847552036, + "grad_norm": 0.387140069543194, + "learning_rate": 0.00029648510405497035, + "loss": 3.0998148918151855, + "step": 3944, + "token_acc": 0.2879225416351742 + }, + { + "epoch": 2.3125183230724127, + "grad_norm": 0.40692566500967514, + "learning_rate": 0.0002964819745877261, + "loss": 3.1301662921905518, + "step": 3945, + "token_acc": 0.2839390885015218 + }, + { + "epoch": 2.313104661389622, + "grad_norm": 0.32819625967990196, + "learning_rate": 0.00029647884374448105, + "loss": 3.1174418926239014, + "step": 3946, + "token_acc": 0.28632289907780234 + }, + { + "epoch": 2.313690999706831, + "grad_norm": 0.3294581142727935, + "learning_rate": 0.0002964757115252647, + "loss": 3.187544822692871, + "step": 3947, + "token_acc": 0.2777391725089307 + }, + { + "epoch": 2.31427733802404, + "grad_norm": 0.36576806378099, + "learning_rate": 0.0002964725779301063, + "loss": 3.1446797847747803, + "step": 3948, + "token_acc": 0.28411444948055387 + }, + { + "epoch": 2.3148636763412487, + "grad_norm": 0.34682858093929086, + "learning_rate": 0.00029646944295903555, + "loss": 3.1067044734954834, + "step": 3949, + "token_acc": 0.2877818253421286 + }, + { + "epoch": 2.315450014658458, + "grad_norm": 0.30360641586581255, + "learning_rate": 0.00029646630661208165, + "loss": 3.171372413635254, + "step": 3950, + "token_acc": 0.27954881921252583 + }, + { + "epoch": 2.316036352975667, + "grad_norm": 0.32071206464679897, + "learning_rate": 0.0002964631688892742, + "loss": 3.1548526287078857, + "step": 3951, + "token_acc": 0.2818333748177648 + }, + { + "epoch": 2.316622691292876, + "grad_norm": 0.3011349284656512, + "learning_rate": 0.0002964600297906427, + "loss": 3.130326509475708, + "step": 3952, + "token_acc": 0.28512613647587215 + }, + { + "epoch": 2.317209029610085, + "grad_norm": 0.2842852131705738, + "learning_rate": 0.0002964568893162165, + "loss": 3.1081178188323975, + "step": 3953, + "token_acc": 0.28764650701968886 + }, + { + "epoch": 2.3177953679272942, + "grad_norm": 0.30492201892864884, + "learning_rate": 0.0002964537474660252, + "loss": 3.1088638305664062, + "step": 3954, + "token_acc": 0.28838705711182605 + }, + { + "epoch": 2.318381706244503, + "grad_norm": 0.29042600108079875, + "learning_rate": 0.0002964506042400983, + "loss": 3.1179306507110596, + "step": 3955, + "token_acc": 0.28552800451768573 + }, + { + "epoch": 2.318968044561712, + "grad_norm": 0.3452267393152285, + "learning_rate": 0.0002964474596384653, + "loss": 3.157042980194092, + "step": 3956, + "token_acc": 0.2826617974592864 + }, + { + "epoch": 2.319554382878921, + "grad_norm": 0.3092341425990623, + "learning_rate": 0.0002964443136611558, + "loss": 3.095261573791504, + "step": 3957, + "token_acc": 0.2905771909065206 + }, + { + "epoch": 2.3201407211961302, + "grad_norm": 0.33663624105604206, + "learning_rate": 0.00029644116630819924, + "loss": 3.171421527862549, + "step": 3958, + "token_acc": 0.27816660048108904 + }, + { + "epoch": 2.3207270595133394, + "grad_norm": 0.32444857746916417, + "learning_rate": 0.0002964380175796253, + "loss": 3.1454882621765137, + "step": 3959, + "token_acc": 0.28199545904469187 + }, + { + "epoch": 2.321313397830548, + "grad_norm": 0.3093550942552928, + "learning_rate": 0.00029643486747546347, + "loss": 3.143613815307617, + "step": 3960, + "token_acc": 0.2845564775072277 + }, + { + "epoch": 2.321899736147757, + "grad_norm": 0.3382588347576792, + "learning_rate": 0.00029643171599574343, + "loss": 3.1464314460754395, + "step": 3961, + "token_acc": 0.28252593708165996 + }, + { + "epoch": 2.3224860744649662, + "grad_norm": 0.35538502107573894, + "learning_rate": 0.00029642856314049474, + "loss": 3.132188320159912, + "step": 3962, + "token_acc": 0.2850337793085177 + }, + { + "epoch": 2.3230724127821754, + "grad_norm": 0.33200635455332145, + "learning_rate": 0.000296425408909747, + "loss": 3.0920443534851074, + "step": 3963, + "token_acc": 0.29052592266549293 + }, + { + "epoch": 2.3236587510993845, + "grad_norm": 0.3253505439891719, + "learning_rate": 0.00029642225330352986, + "loss": 3.151855707168579, + "step": 3964, + "token_acc": 0.282366732896616 + }, + { + "epoch": 2.3242450894165936, + "grad_norm": 0.409766579251743, + "learning_rate": 0.0002964190963218729, + "loss": 3.09792160987854, + "step": 3965, + "token_acc": 0.28958612887099744 + }, + { + "epoch": 2.3248314277338022, + "grad_norm": 0.33186332916078143, + "learning_rate": 0.0002964159379648059, + "loss": 3.1322860717773438, + "step": 3966, + "token_acc": 0.28488048326447485 + }, + { + "epoch": 2.3254177660510114, + "grad_norm": 0.3272413661718686, + "learning_rate": 0.00029641277823235846, + "loss": 3.157687187194824, + "step": 3967, + "token_acc": 0.2818728465997084 + }, + { + "epoch": 2.3260041043682205, + "grad_norm": 0.37484030417323033, + "learning_rate": 0.0002964096171245602, + "loss": 3.139496088027954, + "step": 3968, + "token_acc": 0.28392500074008886 + }, + { + "epoch": 2.3265904426854296, + "grad_norm": 0.34778044600332114, + "learning_rate": 0.00029640645464144096, + "loss": 3.156140089035034, + "step": 3969, + "token_acc": 0.2819651225165228 + }, + { + "epoch": 2.3271767810026387, + "grad_norm": 0.3027776066419856, + "learning_rate": 0.0002964032907830303, + "loss": 3.1464316844940186, + "step": 3970, + "token_acc": 0.28328766981792264 + }, + { + "epoch": 2.3277631193198474, + "grad_norm": 0.36096171851981074, + "learning_rate": 0.00029640012554935807, + "loss": 3.1473302841186523, + "step": 3971, + "token_acc": 0.28267223603936825 + }, + { + "epoch": 2.3283494576370565, + "grad_norm": 0.3258349418188849, + "learning_rate": 0.00029639695894045383, + "loss": 3.155453681945801, + "step": 3972, + "token_acc": 0.2803161645610625 + }, + { + "epoch": 2.3289357959542656, + "grad_norm": 0.34544446514383104, + "learning_rate": 0.00029639379095634756, + "loss": 3.1648499965667725, + "step": 3973, + "token_acc": 0.2802476246078269 + }, + { + "epoch": 2.3295221342714747, + "grad_norm": 0.3139305613152924, + "learning_rate": 0.0002963906215970688, + "loss": 3.1688733100891113, + "step": 3974, + "token_acc": 0.2774523323207084 + }, + { + "epoch": 2.330108472588684, + "grad_norm": 0.31801358213108577, + "learning_rate": 0.00029638745086264746, + "loss": 3.1264429092407227, + "step": 3975, + "token_acc": 0.2853276618370331 + }, + { + "epoch": 2.330694810905893, + "grad_norm": 0.32132890089752575, + "learning_rate": 0.00029638427875311327, + "loss": 3.1065244674682617, + "step": 3976, + "token_acc": 0.28859072182785156 + }, + { + "epoch": 2.3312811492231016, + "grad_norm": 0.31120303707540703, + "learning_rate": 0.00029638110526849604, + "loss": 3.0976758003234863, + "step": 3977, + "token_acc": 0.2894734770871102 + }, + { + "epoch": 2.3318674875403107, + "grad_norm": 0.3194433083350436, + "learning_rate": 0.0002963779304088255, + "loss": 3.1128406524658203, + "step": 3978, + "token_acc": 0.28683485321122365 + }, + { + "epoch": 2.33245382585752, + "grad_norm": 0.341211404222118, + "learning_rate": 0.0002963747541741317, + "loss": 3.114959955215454, + "step": 3979, + "token_acc": 0.28751626924351725 + }, + { + "epoch": 2.333040164174729, + "grad_norm": 0.3403736337168058, + "learning_rate": 0.0002963715765644442, + "loss": 3.086817741394043, + "step": 3980, + "token_acc": 0.2920673460911792 + }, + { + "epoch": 2.333626502491938, + "grad_norm": 0.3514878459147988, + "learning_rate": 0.000296368397579793, + "loss": 3.168426990509033, + "step": 3981, + "token_acc": 0.27941350019537753 + }, + { + "epoch": 2.3342128408091467, + "grad_norm": 0.36394326447013103, + "learning_rate": 0.00029636521722020795, + "loss": 3.1469502449035645, + "step": 3982, + "token_acc": 0.2828156877653171 + }, + { + "epoch": 2.334799179126356, + "grad_norm": 0.3236336101844483, + "learning_rate": 0.00029636203548571896, + "loss": 3.0912115573883057, + "step": 3983, + "token_acc": 0.28926650802469783 + }, + { + "epoch": 2.335385517443565, + "grad_norm": 0.33984706048480506, + "learning_rate": 0.00029635885237635584, + "loss": 3.1286025047302246, + "step": 3984, + "token_acc": 0.2839609014971456 + }, + { + "epoch": 2.335971855760774, + "grad_norm": 0.31787548191424597, + "learning_rate": 0.0002963556678921485, + "loss": 3.1059465408325195, + "step": 3985, + "token_acc": 0.29002777646509037 + }, + { + "epoch": 2.336558194077983, + "grad_norm": 0.32525891787625666, + "learning_rate": 0.0002963524820331269, + "loss": 3.122732639312744, + "step": 3986, + "token_acc": 0.28664019727554213 + }, + { + "epoch": 2.3371445323951923, + "grad_norm": 0.3939331440926721, + "learning_rate": 0.0002963492947993209, + "loss": 3.0824568271636963, + "step": 3987, + "token_acc": 0.2915181379744181 + }, + { + "epoch": 2.337730870712401, + "grad_norm": 0.3937511848206936, + "learning_rate": 0.0002963461061907605, + "loss": 3.140044927597046, + "step": 3988, + "token_acc": 0.2827458548304517 + }, + { + "epoch": 2.33831720902961, + "grad_norm": 0.3525153941912084, + "learning_rate": 0.0002963429162074757, + "loss": 3.12719464302063, + "step": 3989, + "token_acc": 0.2856854653828519 + }, + { + "epoch": 2.338903547346819, + "grad_norm": 0.34240452425242723, + "learning_rate": 0.0002963397248494964, + "loss": 3.1316399574279785, + "step": 3990, + "token_acc": 0.28416875166218686 + }, + { + "epoch": 2.3394898856640283, + "grad_norm": 0.3549401904942206, + "learning_rate": 0.00029633653211685255, + "loss": 3.1107828617095947, + "step": 3991, + "token_acc": 0.2874998366418798 + }, + { + "epoch": 2.340076223981237, + "grad_norm": 0.29849073705542006, + "learning_rate": 0.00029633333800957413, + "loss": 3.132326602935791, + "step": 3992, + "token_acc": 0.2827539591568191 + }, + { + "epoch": 2.340662562298446, + "grad_norm": 0.3459356721374136, + "learning_rate": 0.00029633014252769123, + "loss": 3.1048688888549805, + "step": 3993, + "token_acc": 0.2880155930572814 + }, + { + "epoch": 2.341248900615655, + "grad_norm": 0.3267966345195763, + "learning_rate": 0.0002963269456712338, + "loss": 3.143235445022583, + "step": 3994, + "token_acc": 0.2826626598754357 + }, + { + "epoch": 2.3418352389328643, + "grad_norm": 0.30339352674640824, + "learning_rate": 0.0002963237474402319, + "loss": 3.1140542030334473, + "step": 3995, + "token_acc": 0.28642132337667997 + }, + { + "epoch": 2.3424215772500734, + "grad_norm": 0.3102619538977227, + "learning_rate": 0.00029632054783471556, + "loss": 3.140777111053467, + "step": 3996, + "token_acc": 0.28516348998135943 + }, + { + "epoch": 2.3430079155672825, + "grad_norm": 0.32332447712786505, + "learning_rate": 0.00029631734685471486, + "loss": 3.112484931945801, + "step": 3997, + "token_acc": 0.2883135321930377 + }, + { + "epoch": 2.343594253884491, + "grad_norm": 0.3087385496165147, + "learning_rate": 0.00029631414450025984, + "loss": 3.1353821754455566, + "step": 3998, + "token_acc": 0.2840378474518065 + }, + { + "epoch": 2.3441805922017003, + "grad_norm": 0.3174757693173645, + "learning_rate": 0.0002963109407713806, + "loss": 3.106165885925293, + "step": 3999, + "token_acc": 0.2885664414864559 + }, + { + "epoch": 2.3447669305189094, + "grad_norm": 0.3502495059107925, + "learning_rate": 0.0002963077356681072, + "loss": 3.0772173404693604, + "step": 4000, + "token_acc": 0.29168265804046273 + }, + { + "epoch": 2.3453532688361185, + "grad_norm": 0.36539377137315054, + "learning_rate": 0.0002963045291904699, + "loss": 3.103294610977173, + "step": 4001, + "token_acc": 0.2899008055032453 + }, + { + "epoch": 2.3459396071533276, + "grad_norm": 0.4235753577062346, + "learning_rate": 0.0002963013213384985, + "loss": 3.167001485824585, + "step": 4002, + "token_acc": 0.27866895138329445 + }, + { + "epoch": 2.3465259454705363, + "grad_norm": 0.41216292895103995, + "learning_rate": 0.0002962981121122235, + "loss": 3.1391708850860596, + "step": 4003, + "token_acc": 0.2836572378921391 + }, + { + "epoch": 2.3471122837877454, + "grad_norm": 0.36287085215595655, + "learning_rate": 0.0002962949015116748, + "loss": 3.1735856533050537, + "step": 4004, + "token_acc": 0.2780420702699605 + }, + { + "epoch": 2.3476986221049545, + "grad_norm": 0.2866053570603616, + "learning_rate": 0.0002962916895368826, + "loss": 3.1510367393493652, + "step": 4005, + "token_acc": 0.28242361402970123 + }, + { + "epoch": 2.3482849604221636, + "grad_norm": 0.39624344703642184, + "learning_rate": 0.0002962884761878772, + "loss": 3.1754367351531982, + "step": 4006, + "token_acc": 0.2784526719487525 + }, + { + "epoch": 2.3488712987393727, + "grad_norm": 0.37577173799126873, + "learning_rate": 0.0002962852614646886, + "loss": 3.110475540161133, + "step": 4007, + "token_acc": 0.2874274416431515 + }, + { + "epoch": 2.349457637056582, + "grad_norm": 0.3120587502323707, + "learning_rate": 0.0002962820453673471, + "loss": 3.1042656898498535, + "step": 4008, + "token_acc": 0.2890577445408928 + }, + { + "epoch": 2.3500439753737905, + "grad_norm": 0.3339394466202423, + "learning_rate": 0.00029627882789588295, + "loss": 3.1650099754333496, + "step": 4009, + "token_acc": 0.28150531182530714 + }, + { + "epoch": 2.3506303136909996, + "grad_norm": 0.35143519282375235, + "learning_rate": 0.00029627560905032626, + "loss": 3.1488072872161865, + "step": 4010, + "token_acc": 0.28168383947939263 + }, + { + "epoch": 2.3512166520082087, + "grad_norm": 0.3462029532525808, + "learning_rate": 0.0002962723888307074, + "loss": 3.1110737323760986, + "step": 4011, + "token_acc": 0.28760575328339977 + }, + { + "epoch": 2.351802990325418, + "grad_norm": 0.33700075200261886, + "learning_rate": 0.0002962691672370565, + "loss": 3.1305384635925293, + "step": 4012, + "token_acc": 0.2852899208717084 + }, + { + "epoch": 2.352389328642627, + "grad_norm": 0.35343213532433165, + "learning_rate": 0.00029626594426940397, + "loss": 3.1542434692382812, + "step": 4013, + "token_acc": 0.27996121113138456 + }, + { + "epoch": 2.3529756669598356, + "grad_norm": 0.3290123110196839, + "learning_rate": 0.00029626271992777987, + "loss": 3.135974884033203, + "step": 4014, + "token_acc": 0.28392988577815337 + }, + { + "epoch": 2.3535620052770447, + "grad_norm": 0.3232403138813767, + "learning_rate": 0.00029625949421221466, + "loss": 3.127687931060791, + "step": 4015, + "token_acc": 0.2842832585679109 + }, + { + "epoch": 2.354148343594254, + "grad_norm": 0.37049770078288097, + "learning_rate": 0.00029625626712273865, + "loss": 3.1001381874084473, + "step": 4016, + "token_acc": 0.2899405262606256 + }, + { + "epoch": 2.354734681911463, + "grad_norm": 0.36671027763347663, + "learning_rate": 0.00029625303865938197, + "loss": 3.1361236572265625, + "step": 4017, + "token_acc": 0.2822531737943356 + }, + { + "epoch": 2.355321020228672, + "grad_norm": 0.33918883653612, + "learning_rate": 0.00029624980882217515, + "loss": 3.1558055877685547, + "step": 4018, + "token_acc": 0.28311769232612694 + }, + { + "epoch": 2.355907358545881, + "grad_norm": 0.3381591991155212, + "learning_rate": 0.00029624657761114843, + "loss": 3.099740505218506, + "step": 4019, + "token_acc": 0.28886623079025375 + }, + { + "epoch": 2.35649369686309, + "grad_norm": 0.34766829766852736, + "learning_rate": 0.0002962433450263322, + "loss": 3.131413459777832, + "step": 4020, + "token_acc": 0.285502096079316 + }, + { + "epoch": 2.357080035180299, + "grad_norm": 0.35838235456841555, + "learning_rate": 0.0002962401110677568, + "loss": 3.158010959625244, + "step": 4021, + "token_acc": 0.2793105837608395 + }, + { + "epoch": 2.357666373497508, + "grad_norm": 0.3443667883443588, + "learning_rate": 0.0002962368757354526, + "loss": 3.1425743103027344, + "step": 4022, + "token_acc": 0.28125927584608734 + }, + { + "epoch": 2.358252711814717, + "grad_norm": 0.3301598643512975, + "learning_rate": 0.00029623363902945004, + "loss": 3.1080713272094727, + "step": 4023, + "token_acc": 0.2889172395241 + }, + { + "epoch": 2.3588390501319263, + "grad_norm": 0.3576615253979242, + "learning_rate": 0.00029623040094977943, + "loss": 3.117068290710449, + "step": 4024, + "token_acc": 0.28600126876718124 + }, + { + "epoch": 2.359425388449135, + "grad_norm": 0.3345417991709525, + "learning_rate": 0.0002962271614964713, + "loss": 3.1294901371002197, + "step": 4025, + "token_acc": 0.287467301843857 + }, + { + "epoch": 2.360011726766344, + "grad_norm": 0.35664834864674017, + "learning_rate": 0.00029622392066955603, + "loss": 3.1373677253723145, + "step": 4026, + "token_acc": 0.28445016084020364 + }, + { + "epoch": 2.360598065083553, + "grad_norm": 0.32651405600435834, + "learning_rate": 0.00029622067846906406, + "loss": 3.122591018676758, + "step": 4027, + "token_acc": 0.28593844422139747 + }, + { + "epoch": 2.3611844034007623, + "grad_norm": 0.36632852545161243, + "learning_rate": 0.00029621743489502585, + "loss": 3.1384105682373047, + "step": 4028, + "token_acc": 0.2837051757777122 + }, + { + "epoch": 2.3617707417179714, + "grad_norm": 0.3410284343938234, + "learning_rate": 0.00029621418994747186, + "loss": 3.1472935676574707, + "step": 4029, + "token_acc": 0.2808508269398166 + }, + { + "epoch": 2.3623570800351805, + "grad_norm": 0.3183450443639991, + "learning_rate": 0.00029621094362643257, + "loss": 3.1052231788635254, + "step": 4030, + "token_acc": 0.28649753238741515 + }, + { + "epoch": 2.362943418352389, + "grad_norm": 0.302784952813276, + "learning_rate": 0.00029620769593193853, + "loss": 3.1508584022521973, + "step": 4031, + "token_acc": 0.2835671979124212 + }, + { + "epoch": 2.3635297566695983, + "grad_norm": 0.29871037637940984, + "learning_rate": 0.00029620444686402023, + "loss": 3.1142523288726807, + "step": 4032, + "token_acc": 0.28713735505209964 + }, + { + "epoch": 2.3641160949868074, + "grad_norm": 0.32005576723436063, + "learning_rate": 0.0002962011964227081, + "loss": 3.0794737339019775, + "step": 4033, + "token_acc": 0.29188106824373095 + }, + { + "epoch": 2.3647024333040165, + "grad_norm": 0.3287711271571033, + "learning_rate": 0.00029619794460803274, + "loss": 3.1443397998809814, + "step": 4034, + "token_acc": 0.2817958693438471 + }, + { + "epoch": 2.3652887716212256, + "grad_norm": 0.34891523167838623, + "learning_rate": 0.0002961946914200247, + "loss": 3.0749239921569824, + "step": 4035, + "token_acc": 0.29375930475403883 + }, + { + "epoch": 2.3658751099384343, + "grad_norm": 0.3575819862446973, + "learning_rate": 0.0002961914368587145, + "loss": 3.1467981338500977, + "step": 4036, + "token_acc": 0.28168531956424603 + }, + { + "epoch": 2.3664614482556434, + "grad_norm": 0.3060707502384176, + "learning_rate": 0.00029618818092413284, + "loss": 3.1116278171539307, + "step": 4037, + "token_acc": 0.2875863916421109 + }, + { + "epoch": 2.3670477865728525, + "grad_norm": 0.38860454815255263, + "learning_rate": 0.0002961849236163102, + "loss": 3.1438052654266357, + "step": 4038, + "token_acc": 0.2838916218914811 + }, + { + "epoch": 2.3676341248900616, + "grad_norm": 0.38122393721910874, + "learning_rate": 0.0002961816649352771, + "loss": 3.0928635597229004, + "step": 4039, + "token_acc": 0.2898877001794139 + }, + { + "epoch": 2.3682204632072708, + "grad_norm": 0.30310739867053454, + "learning_rate": 0.0002961784048810643, + "loss": 3.111734628677368, + "step": 4040, + "token_acc": 0.28747518983775755 + }, + { + "epoch": 2.36880680152448, + "grad_norm": 0.3610304676336057, + "learning_rate": 0.00029617514345370234, + "loss": 3.1203346252441406, + "step": 4041, + "token_acc": 0.2861496067205308 + }, + { + "epoch": 2.3693931398416885, + "grad_norm": 0.42574548618412045, + "learning_rate": 0.0002961718806532219, + "loss": 3.1274099349975586, + "step": 4042, + "token_acc": 0.28313789443711007 + }, + { + "epoch": 2.3699794781588976, + "grad_norm": 0.38230190329200986, + "learning_rate": 0.00029616861647965365, + "loss": 3.072632312774658, + "step": 4043, + "token_acc": 0.29222820373295577 + }, + { + "epoch": 2.3705658164761068, + "grad_norm": 0.3433491510755498, + "learning_rate": 0.0002961653509330281, + "loss": 3.060450315475464, + "step": 4044, + "token_acc": 0.2946065875768338 + }, + { + "epoch": 2.371152154793316, + "grad_norm": 0.3506460771442109, + "learning_rate": 0.00029616208401337616, + "loss": 3.1136536598205566, + "step": 4045, + "token_acc": 0.28747545219638243 + }, + { + "epoch": 2.3717384931105245, + "grad_norm": 0.3835965271805788, + "learning_rate": 0.00029615881572072836, + "loss": 3.1222023963928223, + "step": 4046, + "token_acc": 0.2857692116558079 + }, + { + "epoch": 2.3723248314277336, + "grad_norm": 0.39173783632215414, + "learning_rate": 0.00029615554605511544, + "loss": 3.1482996940612793, + "step": 4047, + "token_acc": 0.28218468852951895 + }, + { + "epoch": 2.3729111697449428, + "grad_norm": 0.38283616174466534, + "learning_rate": 0.00029615227501656805, + "loss": 3.105515956878662, + "step": 4048, + "token_acc": 0.289061929928803 + }, + { + "epoch": 2.373497508062152, + "grad_norm": 0.34234163076003277, + "learning_rate": 0.000296149002605117, + "loss": 3.1336746215820312, + "step": 4049, + "token_acc": 0.28492549673132017 + }, + { + "epoch": 2.374083846379361, + "grad_norm": 0.3406497260932234, + "learning_rate": 0.00029614572882079304, + "loss": 3.1649649143218994, + "step": 4050, + "token_acc": 0.28033965110596437 + }, + { + "epoch": 2.37467018469657, + "grad_norm": 0.37071580450177355, + "learning_rate": 0.0002961424536636269, + "loss": 3.1256635189056396, + "step": 4051, + "token_acc": 0.2870016406559391 + }, + { + "epoch": 2.3752565230137788, + "grad_norm": 0.3261009887266913, + "learning_rate": 0.00029613917713364933, + "loss": 3.112837791442871, + "step": 4052, + "token_acc": 0.2882105952612339 + }, + { + "epoch": 2.375842861330988, + "grad_norm": 0.32844758197651663, + "learning_rate": 0.00029613589923089113, + "loss": 3.1194145679473877, + "step": 4053, + "token_acc": 0.2866617577309187 + }, + { + "epoch": 2.376429199648197, + "grad_norm": 0.3076168975013088, + "learning_rate": 0.000296132619955383, + "loss": 3.144341230392456, + "step": 4054, + "token_acc": 0.28342546087151715 + }, + { + "epoch": 2.377015537965406, + "grad_norm": 0.3353014927479724, + "learning_rate": 0.0002961293393071559, + "loss": 3.1068387031555176, + "step": 4055, + "token_acc": 0.28893783816779617 + }, + { + "epoch": 2.377601876282615, + "grad_norm": 0.3400359099807351, + "learning_rate": 0.00029612605728624055, + "loss": 3.122575521469116, + "step": 4056, + "token_acc": 0.28587565253651304 + }, + { + "epoch": 2.378188214599824, + "grad_norm": 0.31526799643771264, + "learning_rate": 0.0002961227738926678, + "loss": 3.0796141624450684, + "step": 4057, + "token_acc": 0.2906491321567029 + }, + { + "epoch": 2.378774552917033, + "grad_norm": 0.3449717272074962, + "learning_rate": 0.00029611948912646846, + "loss": 3.1379218101501465, + "step": 4058, + "token_acc": 0.2851804302197978 + }, + { + "epoch": 2.379360891234242, + "grad_norm": 0.3144034080239278, + "learning_rate": 0.00029611620298767346, + "loss": 3.1387691497802734, + "step": 4059, + "token_acc": 0.28454155122159536 + }, + { + "epoch": 2.379947229551451, + "grad_norm": 0.3419398553620478, + "learning_rate": 0.0002961129154763136, + "loss": 3.1118006706237793, + "step": 4060, + "token_acc": 0.2872363250077186 + }, + { + "epoch": 2.3805335678686603, + "grad_norm": 0.3028329584812501, + "learning_rate": 0.0002961096265924198, + "loss": 3.1462883949279785, + "step": 4061, + "token_acc": 0.28254646154485613 + }, + { + "epoch": 2.3811199061858694, + "grad_norm": 0.30734348794617594, + "learning_rate": 0.0002961063363360229, + "loss": 3.1184375286102295, + "step": 4062, + "token_acc": 0.2858412337989612 + }, + { + "epoch": 2.381706244503078, + "grad_norm": 0.3089975517855078, + "learning_rate": 0.0002961030447071539, + "loss": 3.0917253494262695, + "step": 4063, + "token_acc": 0.2902494149542237 + }, + { + "epoch": 2.382292582820287, + "grad_norm": 0.3515939521765386, + "learning_rate": 0.0002960997517058437, + "loss": 3.074690818786621, + "step": 4064, + "token_acc": 0.2917661433589762 + }, + { + "epoch": 2.3828789211374963, + "grad_norm": 0.29647511080238154, + "learning_rate": 0.00029609645733212316, + "loss": 3.1222891807556152, + "step": 4065, + "token_acc": 0.28517263448304 + }, + { + "epoch": 2.3834652594547054, + "grad_norm": 0.34521614119582134, + "learning_rate": 0.0002960931615860233, + "loss": 3.1230716705322266, + "step": 4066, + "token_acc": 0.2854011099899092 + }, + { + "epoch": 2.3840515977719146, + "grad_norm": 0.3041334236621187, + "learning_rate": 0.00029608986446757503, + "loss": 3.16392183303833, + "step": 4067, + "token_acc": 0.2816469333644976 + }, + { + "epoch": 2.3846379360891232, + "grad_norm": 0.30059224044736854, + "learning_rate": 0.00029608656597680935, + "loss": 3.114427089691162, + "step": 4068, + "token_acc": 0.2863731377948837 + }, + { + "epoch": 2.3852242744063323, + "grad_norm": 0.3225804534347919, + "learning_rate": 0.0002960832661137572, + "loss": 3.1056575775146484, + "step": 4069, + "token_acc": 0.289475895609792 + }, + { + "epoch": 2.3858106127235414, + "grad_norm": 0.2777038315065759, + "learning_rate": 0.0002960799648784497, + "loss": 3.087029457092285, + "step": 4070, + "token_acc": 0.29098091897933664 + }, + { + "epoch": 2.3863969510407506, + "grad_norm": 0.3437772758278913, + "learning_rate": 0.0002960766622709177, + "loss": 3.125253677368164, + "step": 4071, + "token_acc": 0.2846819151620822 + }, + { + "epoch": 2.3869832893579597, + "grad_norm": 0.4025348878327826, + "learning_rate": 0.0002960733582911923, + "loss": 3.1176581382751465, + "step": 4072, + "token_acc": 0.284976229995287 + }, + { + "epoch": 2.387569627675169, + "grad_norm": 0.3975775316064328, + "learning_rate": 0.00029607005293930453, + "loss": 3.086373805999756, + "step": 4073, + "token_acc": 0.28947672819529446 + }, + { + "epoch": 2.3881559659923774, + "grad_norm": 0.336936223997053, + "learning_rate": 0.00029606674621528547, + "loss": 3.134627342224121, + "step": 4074, + "token_acc": 0.2858716467431601 + }, + { + "epoch": 2.3887423043095866, + "grad_norm": 0.3508290550956907, + "learning_rate": 0.00029606343811916616, + "loss": 3.1428351402282715, + "step": 4075, + "token_acc": 0.2839732000425396 + }, + { + "epoch": 2.3893286426267957, + "grad_norm": 0.38666413131651467, + "learning_rate": 0.0002960601286509777, + "loss": 3.1315760612487793, + "step": 4076, + "token_acc": 0.2842102577188058 + }, + { + "epoch": 2.389914980944005, + "grad_norm": 0.39542023347359023, + "learning_rate": 0.0002960568178107511, + "loss": 3.1438581943511963, + "step": 4077, + "token_acc": 0.28342988247738565 + }, + { + "epoch": 2.390501319261214, + "grad_norm": 0.3718882888342459, + "learning_rate": 0.0002960535055985175, + "loss": 3.1236462593078613, + "step": 4078, + "token_acc": 0.28655080606961436 + }, + { + "epoch": 2.3910876575784226, + "grad_norm": 0.33925481837890226, + "learning_rate": 0.00029605019201430806, + "loss": 3.1145498752593994, + "step": 4079, + "token_acc": 0.28739140468551094 + }, + { + "epoch": 2.3916739958956317, + "grad_norm": 0.33448650446974554, + "learning_rate": 0.00029604687705815386, + "loss": 3.0845329761505127, + "step": 4080, + "token_acc": 0.28863444367908714 + }, + { + "epoch": 2.392260334212841, + "grad_norm": 0.37587453914622954, + "learning_rate": 0.00029604356073008607, + "loss": 3.1235480308532715, + "step": 4081, + "token_acc": 0.2857210148246039 + }, + { + "epoch": 2.39284667253005, + "grad_norm": 0.36603099398672506, + "learning_rate": 0.00029604024303013575, + "loss": 3.1164331436157227, + "step": 4082, + "token_acc": 0.2866449167206013 + }, + { + "epoch": 2.393433010847259, + "grad_norm": 0.3221779260161908, + "learning_rate": 0.0002960369239583342, + "loss": 3.12549090385437, + "step": 4083, + "token_acc": 0.2851708070510063 + }, + { + "epoch": 2.394019349164468, + "grad_norm": 0.3880953580475413, + "learning_rate": 0.00029603360351471256, + "loss": 3.117244243621826, + "step": 4084, + "token_acc": 0.2858404635602847 + }, + { + "epoch": 2.394605687481677, + "grad_norm": 0.3736562404370773, + "learning_rate": 0.000296030281699302, + "loss": 3.1411666870117188, + "step": 4085, + "token_acc": 0.2826039682560476 + }, + { + "epoch": 2.395192025798886, + "grad_norm": 0.29477651727289295, + "learning_rate": 0.00029602695851213367, + "loss": 3.1056132316589355, + "step": 4086, + "token_acc": 0.28745504023964474 + }, + { + "epoch": 2.395778364116095, + "grad_norm": 0.3461508578510441, + "learning_rate": 0.00029602363395323883, + "loss": 3.16405987739563, + "step": 4087, + "token_acc": 0.27963235680849946 + }, + { + "epoch": 2.396364702433304, + "grad_norm": 0.287510992254556, + "learning_rate": 0.00029602030802264876, + "loss": 3.0820648670196533, + "step": 4088, + "token_acc": 0.29155795726978767 + }, + { + "epoch": 2.3969510407505132, + "grad_norm": 0.3568126469643849, + "learning_rate": 0.00029601698072039467, + "loss": 3.1050209999084473, + "step": 4089, + "token_acc": 0.288143938782429 + }, + { + "epoch": 2.397537379067722, + "grad_norm": 0.31551141678308997, + "learning_rate": 0.00029601365204650774, + "loss": 3.149054765701294, + "step": 4090, + "token_acc": 0.28167853753662936 + }, + { + "epoch": 2.398123717384931, + "grad_norm": 0.29643560180878287, + "learning_rate": 0.00029601032200101936, + "loss": 3.160217761993408, + "step": 4091, + "token_acc": 0.2808048465922728 + }, + { + "epoch": 2.39871005570214, + "grad_norm": 0.34984919943191994, + "learning_rate": 0.00029600699058396075, + "loss": 3.129704713821411, + "step": 4092, + "token_acc": 0.2846545691097856 + }, + { + "epoch": 2.3992963940193492, + "grad_norm": 0.322181122170411, + "learning_rate": 0.0002960036577953632, + "loss": 3.063467025756836, + "step": 4093, + "token_acc": 0.2933167576508652 + }, + { + "epoch": 2.3998827323365584, + "grad_norm": 0.3207150039030181, + "learning_rate": 0.00029600032363525806, + "loss": 3.1432952880859375, + "step": 4094, + "token_acc": 0.2822501724608153 + }, + { + "epoch": 2.4004690706537675, + "grad_norm": 0.299801657605114, + "learning_rate": 0.00029599698810367655, + "loss": 3.138869285583496, + "step": 4095, + "token_acc": 0.28362501828109005 + }, + { + "epoch": 2.401055408970976, + "grad_norm": 0.3576568958487254, + "learning_rate": 0.0002959936512006501, + "loss": 3.1874008178710938, + "step": 4096, + "token_acc": 0.2764936752774024 + }, + { + "epoch": 2.4016417472881852, + "grad_norm": 0.3757895284805148, + "learning_rate": 0.00029599031292621005, + "loss": 3.1302578449249268, + "step": 4097, + "token_acc": 0.2847600886200164 + }, + { + "epoch": 2.4022280856053944, + "grad_norm": 0.4290185213415451, + "learning_rate": 0.00029598697328038774, + "loss": 3.1395249366760254, + "step": 4098, + "token_acc": 0.28380919067815136 + }, + { + "epoch": 2.4028144239226035, + "grad_norm": 0.32681236661979357, + "learning_rate": 0.0002959836322632145, + "loss": 3.146493911743164, + "step": 4099, + "token_acc": 0.28295107013974446 + }, + { + "epoch": 2.403400762239812, + "grad_norm": 0.35674182153429684, + "learning_rate": 0.00029598028987472177, + "loss": 3.1419923305511475, + "step": 4100, + "token_acc": 0.28420438104424195 + }, + { + "epoch": 2.4039871005570213, + "grad_norm": 0.33575957290197384, + "learning_rate": 0.00029597694611494094, + "loss": 3.1567134857177734, + "step": 4101, + "token_acc": 0.28247825707255175 + }, + { + "epoch": 2.4045734388742304, + "grad_norm": 0.31078131968223355, + "learning_rate": 0.00029597360098390337, + "loss": 3.104005813598633, + "step": 4102, + "token_acc": 0.2887838858796563 + }, + { + "epoch": 2.4051597771914395, + "grad_norm": 0.43178801483634843, + "learning_rate": 0.00029597025448164057, + "loss": 3.125809669494629, + "step": 4103, + "token_acc": 0.2842391360299795 + }, + { + "epoch": 2.4057461155086486, + "grad_norm": 0.3594254704889613, + "learning_rate": 0.00029596690660818386, + "loss": 3.1811869144439697, + "step": 4104, + "token_acc": 0.27947864742940565 + }, + { + "epoch": 2.4063324538258577, + "grad_norm": 0.39445662325112135, + "learning_rate": 0.0002959635573635648, + "loss": 3.1588358879089355, + "step": 4105, + "token_acc": 0.2812822816369751 + }, + { + "epoch": 2.4069187921430664, + "grad_norm": 0.3231290976746693, + "learning_rate": 0.00029596020674781484, + "loss": 3.1087775230407715, + "step": 4106, + "token_acc": 0.28906112825143604 + }, + { + "epoch": 2.4075051304602755, + "grad_norm": 0.3867762431882229, + "learning_rate": 0.00029595685476096535, + "loss": 3.101858139038086, + "step": 4107, + "token_acc": 0.2887637087163549 + }, + { + "epoch": 2.4080914687774846, + "grad_norm": 0.34439915688176387, + "learning_rate": 0.00029595350140304794, + "loss": 3.091704845428467, + "step": 4108, + "token_acc": 0.2893276706056091 + }, + { + "epoch": 2.4086778070946937, + "grad_norm": 0.32005193007743693, + "learning_rate": 0.00029595014667409405, + "loss": 3.1325976848602295, + "step": 4109, + "token_acc": 0.28550594876399144 + }, + { + "epoch": 2.409264145411903, + "grad_norm": 0.29270829279910404, + "learning_rate": 0.00029594679057413513, + "loss": 3.1128897666931152, + "step": 4110, + "token_acc": 0.28656458429255266 + }, + { + "epoch": 2.4098504837291115, + "grad_norm": 0.3075684205270804, + "learning_rate": 0.0002959434331032029, + "loss": 3.10868501663208, + "step": 4111, + "token_acc": 0.28729127516778524 + }, + { + "epoch": 2.4104368220463206, + "grad_norm": 0.3056223033297288, + "learning_rate": 0.0002959400742613287, + "loss": 3.127923011779785, + "step": 4112, + "token_acc": 0.2841130188752374 + }, + { + "epoch": 2.4110231603635297, + "grad_norm": 0.3362550594282149, + "learning_rate": 0.0002959367140485442, + "loss": 3.0786402225494385, + "step": 4113, + "token_acc": 0.29261380200213444 + }, + { + "epoch": 2.411609498680739, + "grad_norm": 0.3472997632138236, + "learning_rate": 0.00029593335246488086, + "loss": 3.125199317932129, + "step": 4114, + "token_acc": 0.2868676800398704 + }, + { + "epoch": 2.412195836997948, + "grad_norm": 0.27041974597209306, + "learning_rate": 0.0002959299895103704, + "loss": 3.1392602920532227, + "step": 4115, + "token_acc": 0.2850552054607795 + }, + { + "epoch": 2.412782175315157, + "grad_norm": 0.3239632986517963, + "learning_rate": 0.00029592662518504426, + "loss": 3.155673027038574, + "step": 4116, + "token_acc": 0.28069993760727796 + }, + { + "epoch": 2.4133685136323657, + "grad_norm": 0.33626305069370355, + "learning_rate": 0.00029592325948893416, + "loss": 3.1176717281341553, + "step": 4117, + "token_acc": 0.28786474361351083 + }, + { + "epoch": 2.413954851949575, + "grad_norm": 0.31075485068079367, + "learning_rate": 0.00029591989242207164, + "loss": 3.1246867179870605, + "step": 4118, + "token_acc": 0.2846070105547303 + }, + { + "epoch": 2.414541190266784, + "grad_norm": 0.3088628753353623, + "learning_rate": 0.0002959165239844884, + "loss": 3.1521925926208496, + "step": 4119, + "token_acc": 0.2830860155279586 + }, + { + "epoch": 2.415127528583993, + "grad_norm": 0.4136812844884011, + "learning_rate": 0.00029591315417621604, + "loss": 3.1566102504730225, + "step": 4120, + "token_acc": 0.27947669498006505 + }, + { + "epoch": 2.415713866901202, + "grad_norm": 0.40448314318857753, + "learning_rate": 0.0002959097829972862, + "loss": 3.1274571418762207, + "step": 4121, + "token_acc": 0.2851610273134937 + }, + { + "epoch": 2.416300205218411, + "grad_norm": 0.41654616543111284, + "learning_rate": 0.0002959064104477305, + "loss": 3.1164355278015137, + "step": 4122, + "token_acc": 0.28624112579068317 + }, + { + "epoch": 2.41688654353562, + "grad_norm": 0.3475172403706704, + "learning_rate": 0.0002959030365275808, + "loss": 3.093113422393799, + "step": 4123, + "token_acc": 0.2882727811314108 + }, + { + "epoch": 2.417472881852829, + "grad_norm": 0.35205889002219554, + "learning_rate": 0.0002958996612368686, + "loss": 3.1584134101867676, + "step": 4124, + "token_acc": 0.27981491298406197 + }, + { + "epoch": 2.418059220170038, + "grad_norm": 0.3546336626181856, + "learning_rate": 0.00029589628457562573, + "loss": 3.1534931659698486, + "step": 4125, + "token_acc": 0.2797648784003778 + }, + { + "epoch": 2.4186455584872473, + "grad_norm": 0.3755933268778347, + "learning_rate": 0.00029589290654388387, + "loss": 3.133286476135254, + "step": 4126, + "token_acc": 0.28505585293609786 + }, + { + "epoch": 2.4192318968044564, + "grad_norm": 0.3835454247674965, + "learning_rate": 0.00029588952714167474, + "loss": 3.1292192935943604, + "step": 4127, + "token_acc": 0.28310922821882484 + }, + { + "epoch": 2.419818235121665, + "grad_norm": 0.31516398047155963, + "learning_rate": 0.00029588614636903005, + "loss": 3.1010119915008545, + "step": 4128, + "token_acc": 0.2893087448070603 + }, + { + "epoch": 2.420404573438874, + "grad_norm": 0.32727412237861264, + "learning_rate": 0.0002958827642259816, + "loss": 3.1238107681274414, + "step": 4129, + "token_acc": 0.285604711395504 + }, + { + "epoch": 2.4209909117560833, + "grad_norm": 0.3358974419579676, + "learning_rate": 0.00029587938071256117, + "loss": 3.11397123336792, + "step": 4130, + "token_acc": 0.28599735305333523 + }, + { + "epoch": 2.4215772500732924, + "grad_norm": 0.3249341829871764, + "learning_rate": 0.0002958759958288006, + "loss": 3.1062324047088623, + "step": 4131, + "token_acc": 0.28723074201232285 + }, + { + "epoch": 2.4221635883905015, + "grad_norm": 0.3176484182382069, + "learning_rate": 0.00029587260957473154, + "loss": 3.098982334136963, + "step": 4132, + "token_acc": 0.28952504018238967 + }, + { + "epoch": 2.42274992670771, + "grad_norm": 0.3103169039732458, + "learning_rate": 0.0002958692219503859, + "loss": 3.1370491981506348, + "step": 4133, + "token_acc": 0.2850324688259432 + }, + { + "epoch": 2.4233362650249193, + "grad_norm": 0.30764324934479886, + "learning_rate": 0.0002958658329557955, + "loss": 3.0795207023620605, + "step": 4134, + "token_acc": 0.2911532715789686 + }, + { + "epoch": 2.4239226033421284, + "grad_norm": 0.3174245833609015, + "learning_rate": 0.00029586244259099216, + "loss": 3.112072467803955, + "step": 4135, + "token_acc": 0.28790613019401307 + }, + { + "epoch": 2.4245089416593375, + "grad_norm": 0.3591540576913985, + "learning_rate": 0.0002958590508560077, + "loss": 3.0682766437530518, + "step": 4136, + "token_acc": 0.29346222959518836 + }, + { + "epoch": 2.4250952799765466, + "grad_norm": 0.33661657399529404, + "learning_rate": 0.000295855657750874, + "loss": 3.168714761734009, + "step": 4137, + "token_acc": 0.2796815787343068 + }, + { + "epoch": 2.4256816182937557, + "grad_norm": 0.29427638079772583, + "learning_rate": 0.00029585226327562297, + "loss": 3.1352345943450928, + "step": 4138, + "token_acc": 0.28466872525391634 + }, + { + "epoch": 2.4262679566109644, + "grad_norm": 0.3470082969552848, + "learning_rate": 0.00029584886743028643, + "loss": 3.1168084144592285, + "step": 4139, + "token_acc": 0.28691008728684264 + }, + { + "epoch": 2.4268542949281735, + "grad_norm": 0.3416335169278841, + "learning_rate": 0.0002958454702148963, + "loss": 3.1049888134002686, + "step": 4140, + "token_acc": 0.2884488655976402 + }, + { + "epoch": 2.4274406332453826, + "grad_norm": 0.27829244556515836, + "learning_rate": 0.00029584207162948456, + "loss": 3.1409943103790283, + "step": 4141, + "token_acc": 0.282409726096716 + }, + { + "epoch": 2.4280269715625917, + "grad_norm": 0.3242347918917184, + "learning_rate": 0.00029583867167408303, + "loss": 3.104142665863037, + "step": 4142, + "token_acc": 0.28715826500788866 + }, + { + "epoch": 2.4286133098798004, + "grad_norm": 0.30335562857582377, + "learning_rate": 0.00029583527034872376, + "loss": 3.1046061515808105, + "step": 4143, + "token_acc": 0.2888985780164989 + }, + { + "epoch": 2.4291996481970095, + "grad_norm": 0.29787614919578354, + "learning_rate": 0.0002958318676534386, + "loss": 3.1188926696777344, + "step": 4144, + "token_acc": 0.28509726368729016 + }, + { + "epoch": 2.4297859865142186, + "grad_norm": 0.3163667983951163, + "learning_rate": 0.0002958284635882595, + "loss": 3.1389687061309814, + "step": 4145, + "token_acc": 0.28454566048500546 + }, + { + "epoch": 2.4303723248314277, + "grad_norm": 0.3853796205601756, + "learning_rate": 0.00029582505815321856, + "loss": 3.1683313846588135, + "step": 4146, + "token_acc": 0.2784516181484662 + }, + { + "epoch": 2.430958663148637, + "grad_norm": 0.3592960418412085, + "learning_rate": 0.00029582165134834775, + "loss": 3.1298131942749023, + "step": 4147, + "token_acc": 0.28345887700125116 + }, + { + "epoch": 2.431545001465846, + "grad_norm": 0.30809064151861587, + "learning_rate": 0.00029581824317367894, + "loss": 3.1052701473236084, + "step": 4148, + "token_acc": 0.28943949794290025 + }, + { + "epoch": 2.432131339783055, + "grad_norm": 0.35784324141670515, + "learning_rate": 0.0002958148336292443, + "loss": 3.1838088035583496, + "step": 4149, + "token_acc": 0.2778973586298084 + }, + { + "epoch": 2.4327176781002637, + "grad_norm": 0.3307604483002467, + "learning_rate": 0.0002958114227150757, + "loss": 3.136889934539795, + "step": 4150, + "token_acc": 0.2853696690702047 + }, + { + "epoch": 2.433304016417473, + "grad_norm": 0.3252739757065157, + "learning_rate": 0.0002958080104312053, + "loss": 3.165478229522705, + "step": 4151, + "token_acc": 0.2797660812679283 + }, + { + "epoch": 2.433890354734682, + "grad_norm": 0.32482038551705, + "learning_rate": 0.00029580459677766514, + "loss": 3.123816967010498, + "step": 4152, + "token_acc": 0.2862042351651999 + }, + { + "epoch": 2.434476693051891, + "grad_norm": 0.3435108623904546, + "learning_rate": 0.0002958011817544872, + "loss": 3.073190689086914, + "step": 4153, + "token_acc": 0.2925355156599344 + }, + { + "epoch": 2.4350630313690997, + "grad_norm": 0.32125023117817675, + "learning_rate": 0.00029579776536170374, + "loss": 3.0986857414245605, + "step": 4154, + "token_acc": 0.2880927477855091 + }, + { + "epoch": 2.435649369686309, + "grad_norm": 0.3120668006854101, + "learning_rate": 0.00029579434759934665, + "loss": 3.1393611431121826, + "step": 4155, + "token_acc": 0.2843403086630424 + }, + { + "epoch": 2.436235708003518, + "grad_norm": 0.28873752260934354, + "learning_rate": 0.00029579092846744815, + "loss": 3.107593297958374, + "step": 4156, + "token_acc": 0.28766502000984384 + }, + { + "epoch": 2.436822046320727, + "grad_norm": 0.34062789305957064, + "learning_rate": 0.00029578750796604037, + "loss": 3.1653482913970947, + "step": 4157, + "token_acc": 0.2784347401344037 + }, + { + "epoch": 2.437408384637936, + "grad_norm": 0.34622514319925485, + "learning_rate": 0.00029578408609515536, + "loss": 3.1186602115631104, + "step": 4158, + "token_acc": 0.2868462547886638 + }, + { + "epoch": 2.4379947229551453, + "grad_norm": 0.28138463398848174, + "learning_rate": 0.0002957806628548253, + "loss": 3.163783550262451, + "step": 4159, + "token_acc": 0.28030947653714033 + }, + { + "epoch": 2.438581061272354, + "grad_norm": 0.3067602183936032, + "learning_rate": 0.0002957772382450824, + "loss": 3.124952793121338, + "step": 4160, + "token_acc": 0.2844958357523336 + }, + { + "epoch": 2.439167399589563, + "grad_norm": 0.3243698449715789, + "learning_rate": 0.00029577381226595875, + "loss": 3.110630750656128, + "step": 4161, + "token_acc": 0.28887556847056983 + }, + { + "epoch": 2.439753737906772, + "grad_norm": 0.29078139061430774, + "learning_rate": 0.0002957703849174866, + "loss": 3.127685785293579, + "step": 4162, + "token_acc": 0.2845871197576688 + }, + { + "epoch": 2.4403400762239813, + "grad_norm": 0.3152966473782495, + "learning_rate": 0.000295766956199698, + "loss": 3.128417730331421, + "step": 4163, + "token_acc": 0.28486416347270455 + }, + { + "epoch": 2.4409264145411904, + "grad_norm": 0.29299622687503285, + "learning_rate": 0.0002957635261126254, + "loss": 3.1237001419067383, + "step": 4164, + "token_acc": 0.28450736414423566 + }, + { + "epoch": 2.441512752858399, + "grad_norm": 0.27848354177585716, + "learning_rate": 0.00029576009465630086, + "loss": 3.1376285552978516, + "step": 4165, + "token_acc": 0.28463043618879236 + }, + { + "epoch": 2.442099091175608, + "grad_norm": 0.3083234842261065, + "learning_rate": 0.00029575666183075664, + "loss": 3.114762306213379, + "step": 4166, + "token_acc": 0.2863711539741627 + }, + { + "epoch": 2.4426854294928173, + "grad_norm": 0.29221217452966763, + "learning_rate": 0.00029575322763602494, + "loss": 3.1787891387939453, + "step": 4167, + "token_acc": 0.2788001186707784 + }, + { + "epoch": 2.4432717678100264, + "grad_norm": 0.28847867120031484, + "learning_rate": 0.0002957497920721382, + "loss": 3.122265338897705, + "step": 4168, + "token_acc": 0.2864476797818153 + }, + { + "epoch": 2.4438581061272355, + "grad_norm": 0.2843944230967726, + "learning_rate": 0.00029574635513912844, + "loss": 3.1669845581054688, + "step": 4169, + "token_acc": 0.279684713733333 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.3236609233308892, + "learning_rate": 0.0002957429168370281, + "loss": 3.1467251777648926, + "step": 4170, + "token_acc": 0.28437059180004787 + }, + { + "epoch": 2.4450307827616533, + "grad_norm": 0.3280662803360395, + "learning_rate": 0.0002957394771658695, + "loss": 3.132016897201538, + "step": 4171, + "token_acc": 0.2850108107548907 + }, + { + "epoch": 2.4456171210788624, + "grad_norm": 0.3181695316937913, + "learning_rate": 0.0002957360361256848, + "loss": 3.144162178039551, + "step": 4172, + "token_acc": 0.28185316826731494 + }, + { + "epoch": 2.4462034593960715, + "grad_norm": 0.3250763790010029, + "learning_rate": 0.0002957325937165065, + "loss": 3.1430256366729736, + "step": 4173, + "token_acc": 0.2826527076075656 + }, + { + "epoch": 2.4467897977132806, + "grad_norm": 0.31958522365760733, + "learning_rate": 0.00029572914993836684, + "loss": 3.1445913314819336, + "step": 4174, + "token_acc": 0.2827902411174186 + }, + { + "epoch": 2.4473761360304898, + "grad_norm": 0.3243449422808361, + "learning_rate": 0.0002957257047912981, + "loss": 3.0831727981567383, + "step": 4175, + "token_acc": 0.2929087118226796 + }, + { + "epoch": 2.4479624743476984, + "grad_norm": 0.34987616331853155, + "learning_rate": 0.00029572225827533287, + "loss": 3.166348934173584, + "step": 4176, + "token_acc": 0.27772515383181057 + }, + { + "epoch": 2.4485488126649075, + "grad_norm": 0.3374678325520746, + "learning_rate": 0.00029571881039050334, + "loss": 3.1206789016723633, + "step": 4177, + "token_acc": 0.2860181635561445 + }, + { + "epoch": 2.4491351509821166, + "grad_norm": 0.29682492344740724, + "learning_rate": 0.0002957153611368419, + "loss": 3.128352403640747, + "step": 4178, + "token_acc": 0.28411640934848004 + }, + { + "epoch": 2.4497214892993258, + "grad_norm": 0.34839576578136816, + "learning_rate": 0.00029571191051438106, + "loss": 3.1155009269714355, + "step": 4179, + "token_acc": 0.2866963650262277 + }, + { + "epoch": 2.450307827616535, + "grad_norm": 0.3560622198599429, + "learning_rate": 0.00029570845852315314, + "loss": 3.1029927730560303, + "step": 4180, + "token_acc": 0.2887865535764464 + }, + { + "epoch": 2.450894165933744, + "grad_norm": 0.32997705390420234, + "learning_rate": 0.00029570500516319057, + "loss": 3.121243953704834, + "step": 4181, + "token_acc": 0.2856549212593468 + }, + { + "epoch": 2.4514805042509527, + "grad_norm": 0.360099242951122, + "learning_rate": 0.00029570155043452586, + "loss": 3.1484968662261963, + "step": 4182, + "token_acc": 0.2834438609469389 + }, + { + "epoch": 2.4520668425681618, + "grad_norm": 0.356534687717105, + "learning_rate": 0.00029569809433719143, + "loss": 3.163255214691162, + "step": 4183, + "token_acc": 0.27985202242116175 + }, + { + "epoch": 2.452653180885371, + "grad_norm": 0.3086882308229312, + "learning_rate": 0.0002956946368712197, + "loss": 3.1430625915527344, + "step": 4184, + "token_acc": 0.2842363300621709 + }, + { + "epoch": 2.45323951920258, + "grad_norm": 0.3763624685664077, + "learning_rate": 0.0002956911780366432, + "loss": 3.107767105102539, + "step": 4185, + "token_acc": 0.285120833716597 + }, + { + "epoch": 2.453825857519789, + "grad_norm": 0.3669182500537939, + "learning_rate": 0.00029568771783349435, + "loss": 3.0617265701293945, + "step": 4186, + "token_acc": 0.2956620297705423 + }, + { + "epoch": 2.4544121958369978, + "grad_norm": 0.34354353619456457, + "learning_rate": 0.00029568425626180577, + "loss": 3.1100916862487793, + "step": 4187, + "token_acc": 0.28686218640670647 + }, + { + "epoch": 2.454998534154207, + "grad_norm": 0.40416474070091496, + "learning_rate": 0.00029568079332160994, + "loss": 3.177902936935425, + "step": 4188, + "token_acc": 0.2778932235509377 + }, + { + "epoch": 2.455584872471416, + "grad_norm": 0.33395624337927426, + "learning_rate": 0.0002956773290129393, + "loss": 3.084484100341797, + "step": 4189, + "token_acc": 0.2905265292671179 + }, + { + "epoch": 2.456171210788625, + "grad_norm": 0.3242802555969176, + "learning_rate": 0.0002956738633358265, + "loss": 3.149378776550293, + "step": 4190, + "token_acc": 0.28139552263807177 + }, + { + "epoch": 2.456757549105834, + "grad_norm": 0.3539448464617524, + "learning_rate": 0.000295670396290304, + "loss": 3.0530712604522705, + "step": 4191, + "token_acc": 0.2935181586561299 + }, + { + "epoch": 2.4573438874230433, + "grad_norm": 0.3411228719522272, + "learning_rate": 0.0002956669278764045, + "loss": 3.079084634780884, + "step": 4192, + "token_acc": 0.29214418521592067 + }, + { + "epoch": 2.457930225740252, + "grad_norm": 0.3277844080736803, + "learning_rate": 0.0002956634580941604, + "loss": 3.1283328533172607, + "step": 4193, + "token_acc": 0.28482219722229984 + }, + { + "epoch": 2.458516564057461, + "grad_norm": 0.33957628128592593, + "learning_rate": 0.00029565998694360446, + "loss": 3.1166884899139404, + "step": 4194, + "token_acc": 0.2857177948268662 + }, + { + "epoch": 2.45910290237467, + "grad_norm": 0.3484167500081368, + "learning_rate": 0.0002956565144247692, + "loss": 3.1315340995788574, + "step": 4195, + "token_acc": 0.287011826544021 + }, + { + "epoch": 2.4596892406918793, + "grad_norm": 0.34330509317864644, + "learning_rate": 0.0002956530405376873, + "loss": 3.150961399078369, + "step": 4196, + "token_acc": 0.2808462914892008 + }, + { + "epoch": 2.460275579009088, + "grad_norm": 0.41202732951419013, + "learning_rate": 0.00029564956528239125, + "loss": 3.1293752193450928, + "step": 4197, + "token_acc": 0.28497534499558436 + }, + { + "epoch": 2.460861917326297, + "grad_norm": 0.43297067789715016, + "learning_rate": 0.0002956460886589139, + "loss": 3.110107421875, + "step": 4198, + "token_acc": 0.28839363410099 + }, + { + "epoch": 2.4614482556435062, + "grad_norm": 0.36172252475118377, + "learning_rate": 0.00029564261066728783, + "loss": 3.1329855918884277, + "step": 4199, + "token_acc": 0.283375803307192 + }, + { + "epoch": 2.4620345939607153, + "grad_norm": 0.36295689629908084, + "learning_rate": 0.0002956391313075456, + "loss": 3.080261707305908, + "step": 4200, + "token_acc": 0.29044826615523445 + }, + { + "epoch": 2.4626209322779244, + "grad_norm": 0.36560847232008425, + "learning_rate": 0.00029563565057972, + "loss": 3.1188130378723145, + "step": 4201, + "token_acc": 0.2876926082848662 + }, + { + "epoch": 2.4632072705951336, + "grad_norm": 0.36342945049757625, + "learning_rate": 0.00029563216848384373, + "loss": 3.1749165058135986, + "step": 4202, + "token_acc": 0.27960442153960224 + }, + { + "epoch": 2.4637936089123427, + "grad_norm": 0.34904288908233316, + "learning_rate": 0.0002956286850199495, + "loss": 3.188642978668213, + "step": 4203, + "token_acc": 0.2780057345605467 + }, + { + "epoch": 2.4643799472295513, + "grad_norm": 0.3487022381206377, + "learning_rate": 0.00029562520018807, + "loss": 3.114593505859375, + "step": 4204, + "token_acc": 0.2870092885848881 + }, + { + "epoch": 2.4649662855467604, + "grad_norm": 0.32716583505502966, + "learning_rate": 0.000295621713988238, + "loss": 3.138054132461548, + "step": 4205, + "token_acc": 0.28321283118422536 + }, + { + "epoch": 2.4655526238639696, + "grad_norm": 0.35143450296374307, + "learning_rate": 0.00029561822642048613, + "loss": 3.1015212535858154, + "step": 4206, + "token_acc": 0.28827525409666044 + }, + { + "epoch": 2.4661389621811787, + "grad_norm": 0.33099857728937004, + "learning_rate": 0.00029561473748484737, + "loss": 3.117356777191162, + "step": 4207, + "token_acc": 0.2853810124069735 + }, + { + "epoch": 2.4667253004983873, + "grad_norm": 0.3614354394189754, + "learning_rate": 0.0002956112471813543, + "loss": 3.1570494174957275, + "step": 4208, + "token_acc": 0.28168092876219547 + }, + { + "epoch": 2.4673116388155965, + "grad_norm": 0.28021653797624285, + "learning_rate": 0.00029560775551003976, + "loss": 3.094374418258667, + "step": 4209, + "token_acc": 0.28876766764975187 + }, + { + "epoch": 2.4678979771328056, + "grad_norm": 0.3315945644434655, + "learning_rate": 0.0002956042624709366, + "loss": 3.121507167816162, + "step": 4210, + "token_acc": 0.2856035916366763 + }, + { + "epoch": 2.4684843154500147, + "grad_norm": 0.2872596819165396, + "learning_rate": 0.0002956007680640776, + "loss": 3.13769793510437, + "step": 4211, + "token_acc": 0.2836105192037836 + }, + { + "epoch": 2.469070653767224, + "grad_norm": 0.32233988174839634, + "learning_rate": 0.00029559727228949557, + "loss": 3.1804490089416504, + "step": 4212, + "token_acc": 0.2776913749815006 + }, + { + "epoch": 2.469656992084433, + "grad_norm": 0.29495945382982525, + "learning_rate": 0.00029559377514722335, + "loss": 3.1457433700561523, + "step": 4213, + "token_acc": 0.2807951485922437 + }, + { + "epoch": 2.4702433304016416, + "grad_norm": 0.32294445777366326, + "learning_rate": 0.0002955902766372938, + "loss": 3.1555449962615967, + "step": 4214, + "token_acc": 0.28004664058522327 + }, + { + "epoch": 2.4708296687188507, + "grad_norm": 0.3333422555111827, + "learning_rate": 0.00029558677675973983, + "loss": 3.1423935890197754, + "step": 4215, + "token_acc": 0.2835082075886914 + }, + { + "epoch": 2.47141600703606, + "grad_norm": 0.31116082835263803, + "learning_rate": 0.00029558327551459424, + "loss": 3.0912275314331055, + "step": 4216, + "token_acc": 0.28784021930683373 + }, + { + "epoch": 2.472002345353269, + "grad_norm": 0.32525185116017097, + "learning_rate": 0.0002955797729018899, + "loss": 3.1119723320007324, + "step": 4217, + "token_acc": 0.2880186869450072 + }, + { + "epoch": 2.472588683670478, + "grad_norm": 0.3061107417915048, + "learning_rate": 0.0002955762689216599, + "loss": 3.0874617099761963, + "step": 4218, + "token_acc": 0.29053891318193736 + }, + { + "epoch": 2.4731750219876867, + "grad_norm": 0.3524930550252353, + "learning_rate": 0.0002955727635739369, + "loss": 3.1435797214508057, + "step": 4219, + "token_acc": 0.28136873087755826 + }, + { + "epoch": 2.473761360304896, + "grad_norm": 0.3253360138174695, + "learning_rate": 0.00029556925685875397, + "loss": 3.1105964183807373, + "step": 4220, + "token_acc": 0.2891587646342109 + }, + { + "epoch": 2.474347698622105, + "grad_norm": 0.3519338953765347, + "learning_rate": 0.00029556574877614414, + "loss": 3.1222329139709473, + "step": 4221, + "token_acc": 0.2862146354733406 + }, + { + "epoch": 2.474934036939314, + "grad_norm": 0.31087372963778626, + "learning_rate": 0.0002955622393261401, + "loss": 3.107698917388916, + "step": 4222, + "token_acc": 0.2850278306957142 + }, + { + "epoch": 2.475520375256523, + "grad_norm": 0.32991025649227784, + "learning_rate": 0.0002955587285087751, + "loss": 3.135448455810547, + "step": 4223, + "token_acc": 0.28400683129854276 + }, + { + "epoch": 2.4761067135737322, + "grad_norm": 0.36558085572983107, + "learning_rate": 0.0002955552163240819, + "loss": 3.1406960487365723, + "step": 4224, + "token_acc": 0.28434973724837065 + }, + { + "epoch": 2.476693051890941, + "grad_norm": 0.3510819427215274, + "learning_rate": 0.0002955517027720936, + "loss": 3.135343074798584, + "step": 4225, + "token_acc": 0.2850491544200243 + }, + { + "epoch": 2.47727939020815, + "grad_norm": 0.30928077986996405, + "learning_rate": 0.0002955481878528431, + "loss": 3.0975117683410645, + "step": 4226, + "token_acc": 0.2896353814107506 + }, + { + "epoch": 2.477865728525359, + "grad_norm": 0.34303876384753146, + "learning_rate": 0.0002955446715663636, + "loss": 3.1281652450561523, + "step": 4227, + "token_acc": 0.2843875799000313 + }, + { + "epoch": 2.4784520668425682, + "grad_norm": 0.288192394799522, + "learning_rate": 0.00029554115391268806, + "loss": 3.1197304725646973, + "step": 4228, + "token_acc": 0.2858469970737145 + }, + { + "epoch": 2.4790384051597774, + "grad_norm": 0.3202206229172926, + "learning_rate": 0.00029553763489184945, + "loss": 3.145132303237915, + "step": 4229, + "token_acc": 0.2832567360074038 + }, + { + "epoch": 2.479624743476986, + "grad_norm": 0.34367550376943795, + "learning_rate": 0.00029553411450388095, + "loss": 3.036426305770874, + "step": 4230, + "token_acc": 0.29639427888143616 + }, + { + "epoch": 2.480211081794195, + "grad_norm": 0.33272174101248536, + "learning_rate": 0.0002955305927488155, + "loss": 3.1014623641967773, + "step": 4231, + "token_acc": 0.287030085893128 + }, + { + "epoch": 2.4807974201114043, + "grad_norm": 0.2935401257400599, + "learning_rate": 0.0002955270696266862, + "loss": 3.1379213333129883, + "step": 4232, + "token_acc": 0.284338440517109 + }, + { + "epoch": 2.4813837584286134, + "grad_norm": 0.3113287323525847, + "learning_rate": 0.0002955235451375262, + "loss": 3.109300136566162, + "step": 4233, + "token_acc": 0.2875254626534641 + }, + { + "epoch": 2.4819700967458225, + "grad_norm": 0.355477084486217, + "learning_rate": 0.00029552001928136856, + "loss": 3.1507697105407715, + "step": 4234, + "token_acc": 0.28038723835859497 + }, + { + "epoch": 2.4825564350630316, + "grad_norm": 0.3335133025940925, + "learning_rate": 0.0002955164920582465, + "loss": 3.1125264167785645, + "step": 4235, + "token_acc": 0.28860916893668365 + }, + { + "epoch": 2.4831427733802403, + "grad_norm": 0.29447035353746276, + "learning_rate": 0.00029551296346819303, + "loss": 3.146637439727783, + "step": 4236, + "token_acc": 0.28278621073820404 + }, + { + "epoch": 2.4837291116974494, + "grad_norm": 0.3200699913914711, + "learning_rate": 0.00029550943351124134, + "loss": 3.1082613468170166, + "step": 4237, + "token_acc": 0.28849520710417476 + }, + { + "epoch": 2.4843154500146585, + "grad_norm": 0.3356610793835404, + "learning_rate": 0.00029550590218742465, + "loss": 3.1045634746551514, + "step": 4238, + "token_acc": 0.28982759931778435 + }, + { + "epoch": 2.4849017883318676, + "grad_norm": 0.29371548077260956, + "learning_rate": 0.0002955023694967761, + "loss": 3.129265785217285, + "step": 4239, + "token_acc": 0.2829359402722526 + }, + { + "epoch": 2.4854881266490767, + "grad_norm": 0.3162517737406707, + "learning_rate": 0.0002954988354393288, + "loss": 3.174086332321167, + "step": 4240, + "token_acc": 0.2786354225924685 + }, + { + "epoch": 2.4860744649662854, + "grad_norm": 0.3508128545201341, + "learning_rate": 0.000295495300015116, + "loss": 3.128365993499756, + "step": 4241, + "token_acc": 0.283498046133179 + }, + { + "epoch": 2.4866608032834945, + "grad_norm": 0.2696902572173051, + "learning_rate": 0.0002954917632241709, + "loss": 3.097126007080078, + "step": 4242, + "token_acc": 0.2902416885065304 + }, + { + "epoch": 2.4872471416007036, + "grad_norm": 0.315232384988462, + "learning_rate": 0.00029548822506652683, + "loss": 3.114626884460449, + "step": 4243, + "token_acc": 0.2863126863525797 + }, + { + "epoch": 2.4878334799179127, + "grad_norm": 0.3145377426585128, + "learning_rate": 0.00029548468554221684, + "loss": 3.122138500213623, + "step": 4244, + "token_acc": 0.2859493749043632 + }, + { + "epoch": 2.488419818235122, + "grad_norm": 0.34108521453424073, + "learning_rate": 0.0002954811446512743, + "loss": 3.1309239864349365, + "step": 4245, + "token_acc": 0.2838607985159848 + }, + { + "epoch": 2.489006156552331, + "grad_norm": 0.30384387372361843, + "learning_rate": 0.0002954776023937325, + "loss": 3.104358434677124, + "step": 4246, + "token_acc": 0.28820151101750774 + }, + { + "epoch": 2.4895924948695396, + "grad_norm": 0.31865600447866366, + "learning_rate": 0.0002954740587696246, + "loss": 3.1377036571502686, + "step": 4247, + "token_acc": 0.2829648969062782 + }, + { + "epoch": 2.4901788331867487, + "grad_norm": 0.32152378678503973, + "learning_rate": 0.0002954705137789839, + "loss": 3.120666027069092, + "step": 4248, + "token_acc": 0.2863717732936955 + }, + { + "epoch": 2.490765171503958, + "grad_norm": 0.31906300216078254, + "learning_rate": 0.00029546696742184384, + "loss": 3.1133618354797363, + "step": 4249, + "token_acc": 0.28531728022638086 + }, + { + "epoch": 2.491351509821167, + "grad_norm": 0.31299503493825803, + "learning_rate": 0.0002954634196982376, + "loss": 3.124864101409912, + "step": 4250, + "token_acc": 0.28509427853409597 + }, + { + "epoch": 2.4919378481383756, + "grad_norm": 0.2897941975931226, + "learning_rate": 0.0002954598706081985, + "loss": 3.1135191917419434, + "step": 4251, + "token_acc": 0.28712027120576616 + }, + { + "epoch": 2.4925241864555847, + "grad_norm": 0.3128995613373998, + "learning_rate": 0.00029545632015176, + "loss": 3.1187100410461426, + "step": 4252, + "token_acc": 0.2847487420397716 + }, + { + "epoch": 2.493110524772794, + "grad_norm": 0.31561856367490715, + "learning_rate": 0.00029545276832895534, + "loss": 3.1421236991882324, + "step": 4253, + "token_acc": 0.28182060835839107 + }, + { + "epoch": 2.493696863090003, + "grad_norm": 0.27803548320666305, + "learning_rate": 0.00029544921513981794, + "loss": 3.1107163429260254, + "step": 4254, + "token_acc": 0.2889379748124876 + }, + { + "epoch": 2.494283201407212, + "grad_norm": 0.314744966893636, + "learning_rate": 0.00029544566058438117, + "loss": 3.1327579021453857, + "step": 4255, + "token_acc": 0.28558351043289704 + }, + { + "epoch": 2.494869539724421, + "grad_norm": 0.33870765981748074, + "learning_rate": 0.0002954421046626784, + "loss": 3.112725019454956, + "step": 4256, + "token_acc": 0.28596951160306827 + }, + { + "epoch": 2.49545587804163, + "grad_norm": 0.3553777146326446, + "learning_rate": 0.00029543854737474305, + "loss": 3.1192259788513184, + "step": 4257, + "token_acc": 0.2855761277597506 + }, + { + "epoch": 2.496042216358839, + "grad_norm": 0.30900343562019056, + "learning_rate": 0.0002954349887206085, + "loss": 3.1153345108032227, + "step": 4258, + "token_acc": 0.28593679053019505 + }, + { + "epoch": 2.496628554676048, + "grad_norm": 0.32860506662179256, + "learning_rate": 0.0002954314287003082, + "loss": 3.067028522491455, + "step": 4259, + "token_acc": 0.29180275610508105 + }, + { + "epoch": 2.497214892993257, + "grad_norm": 0.31094513519671807, + "learning_rate": 0.00029542786731387563, + "loss": 3.0821661949157715, + "step": 4260, + "token_acc": 0.2925090483676379 + }, + { + "epoch": 2.4978012313104663, + "grad_norm": 0.3201587033810314, + "learning_rate": 0.00029542430456134416, + "loss": 3.166276454925537, + "step": 4261, + "token_acc": 0.280776188723648 + }, + { + "epoch": 2.498387569627675, + "grad_norm": 0.3270154881297289, + "learning_rate": 0.0002954207404427474, + "loss": 3.091919422149658, + "step": 4262, + "token_acc": 0.289708317076 + }, + { + "epoch": 2.498973907944884, + "grad_norm": 0.33923472568021196, + "learning_rate": 0.00029541717495811865, + "loss": 3.122898578643799, + "step": 4263, + "token_acc": 0.2856605681503482 + }, + { + "epoch": 2.499560246262093, + "grad_norm": 0.308202116241944, + "learning_rate": 0.00029541360810749157, + "loss": 3.1659488677978516, + "step": 4264, + "token_acc": 0.28156263896871236 + }, + { + "epoch": 2.5001465845793023, + "grad_norm": 0.3614750298077898, + "learning_rate": 0.0002954100398908995, + "loss": 3.114520788192749, + "step": 4265, + "token_acc": 0.28730132965772687 + }, + { + "epoch": 2.5007329228965114, + "grad_norm": 0.316542868780964, + "learning_rate": 0.00029540647030837613, + "loss": 3.1104397773742676, + "step": 4266, + "token_acc": 0.2872000652990724 + }, + { + "epoch": 2.5013192612137205, + "grad_norm": 0.32958127722448066, + "learning_rate": 0.0002954028993599549, + "loss": 3.1017701625823975, + "step": 4267, + "token_acc": 0.28864066111072206 + }, + { + "epoch": 2.5019055995309296, + "grad_norm": 0.2699079753546184, + "learning_rate": 0.00029539932704566936, + "loss": 3.1212515830993652, + "step": 4268, + "token_acc": 0.2850652823319054 + }, + { + "epoch": 2.5024919378481383, + "grad_norm": 0.32981919795996034, + "learning_rate": 0.00029539575336555305, + "loss": 3.1233763694763184, + "step": 4269, + "token_acc": 0.28456277541132735 + }, + { + "epoch": 2.5030782761653474, + "grad_norm": 0.28232892417559213, + "learning_rate": 0.00029539217831963955, + "loss": 3.158236503601074, + "step": 4270, + "token_acc": 0.28058296519549153 + }, + { + "epoch": 2.5036646144825565, + "grad_norm": 0.3192177218110458, + "learning_rate": 0.0002953886019079625, + "loss": 3.143063545227051, + "step": 4271, + "token_acc": 0.2840551930575431 + }, + { + "epoch": 2.5042509527997656, + "grad_norm": 0.28279937988709414, + "learning_rate": 0.00029538502413055536, + "loss": 3.093001127243042, + "step": 4272, + "token_acc": 0.2891821231001556 + }, + { + "epoch": 2.5048372911169743, + "grad_norm": 0.31487389756325257, + "learning_rate": 0.0002953814449874519, + "loss": 3.128673791885376, + "step": 4273, + "token_acc": 0.2821220661037056 + }, + { + "epoch": 2.5054236294341834, + "grad_norm": 0.31742831419528206, + "learning_rate": 0.0002953778644786856, + "loss": 3.1556921005249023, + "step": 4274, + "token_acc": 0.28313281848353333 + }, + { + "epoch": 2.5060099677513925, + "grad_norm": 0.31815936401486744, + "learning_rate": 0.0002953742826042903, + "loss": 3.1049160957336426, + "step": 4275, + "token_acc": 0.28871999872758025 + }, + { + "epoch": 2.5065963060686016, + "grad_norm": 0.3503645021974508, + "learning_rate": 0.00029537069936429937, + "loss": 3.157719135284424, + "step": 4276, + "token_acc": 0.28207544399627343 + }, + { + "epoch": 2.5071826443858107, + "grad_norm": 0.3594402892783687, + "learning_rate": 0.00029536711475874666, + "loss": 3.1241660118103027, + "step": 4277, + "token_acc": 0.2852489383913784 + }, + { + "epoch": 2.50776898270302, + "grad_norm": 0.34849343226314317, + "learning_rate": 0.0002953635287876658, + "loss": 3.1454551219940186, + "step": 4278, + "token_acc": 0.28227923769095253 + }, + { + "epoch": 2.5083553210202285, + "grad_norm": 0.3585555438063583, + "learning_rate": 0.00029535994145109046, + "loss": 3.1272215843200684, + "step": 4279, + "token_acc": 0.28581487898855157 + }, + { + "epoch": 2.5089416593374376, + "grad_norm": 0.3351317717773386, + "learning_rate": 0.00029535635274905434, + "loss": 3.1277172565460205, + "step": 4280, + "token_acc": 0.2848978675410932 + }, + { + "epoch": 2.5095279976546467, + "grad_norm": 0.30607508534974665, + "learning_rate": 0.0002953527626815912, + "loss": 3.1315340995788574, + "step": 4281, + "token_acc": 0.28478914624250173 + }, + { + "epoch": 2.510114335971856, + "grad_norm": 0.2918488155099912, + "learning_rate": 0.0002953491712487347, + "loss": 3.1241111755371094, + "step": 4282, + "token_acc": 0.2853506817766011 + }, + { + "epoch": 2.5107006742890645, + "grad_norm": 0.2994312593529727, + "learning_rate": 0.0002953455784505185, + "loss": 3.1180672645568848, + "step": 4283, + "token_acc": 0.285654876787409 + }, + { + "epoch": 2.5112870126062736, + "grad_norm": 0.30416809637775405, + "learning_rate": 0.00029534198428697654, + "loss": 3.1356091499328613, + "step": 4284, + "token_acc": 0.28221616205439687 + }, + { + "epoch": 2.5118733509234827, + "grad_norm": 0.32496246652529037, + "learning_rate": 0.0002953383887581425, + "loss": 3.149949312210083, + "step": 4285, + "token_acc": 0.2832623024591683 + }, + { + "epoch": 2.512459689240692, + "grad_norm": 0.2956962743833167, + "learning_rate": 0.0002953347918640501, + "loss": 3.131580352783203, + "step": 4286, + "token_acc": 0.2859376464943388 + }, + { + "epoch": 2.513046027557901, + "grad_norm": 0.2784091394795092, + "learning_rate": 0.00029533119360473323, + "loss": 3.1316494941711426, + "step": 4287, + "token_acc": 0.2856677482653807 + }, + { + "epoch": 2.51363236587511, + "grad_norm": 0.34004882817463417, + "learning_rate": 0.00029532759398022553, + "loss": 3.1386194229125977, + "step": 4288, + "token_acc": 0.2831550623167646 + }, + { + "epoch": 2.514218704192319, + "grad_norm": 0.3702305124962639, + "learning_rate": 0.000295323992990561, + "loss": 3.1292080879211426, + "step": 4289, + "token_acc": 0.2844236817228404 + }, + { + "epoch": 2.514805042509528, + "grad_norm": 0.33489668508845444, + "learning_rate": 0.00029532039063577336, + "loss": 3.0976943969726562, + "step": 4290, + "token_acc": 0.28858561562522583 + }, + { + "epoch": 2.515391380826737, + "grad_norm": 0.300478838592618, + "learning_rate": 0.00029531678691589643, + "loss": 3.109555244445801, + "step": 4291, + "token_acc": 0.2864804367558189 + }, + { + "epoch": 2.515977719143946, + "grad_norm": 0.31555278485035515, + "learning_rate": 0.0002953131818309642, + "loss": 3.131467819213867, + "step": 4292, + "token_acc": 0.2852478495595795 + }, + { + "epoch": 2.516564057461155, + "grad_norm": 0.32770524376508636, + "learning_rate": 0.00029530957538101036, + "loss": 3.1313812732696533, + "step": 4293, + "token_acc": 0.28433728077215836 + }, + { + "epoch": 2.517150395778364, + "grad_norm": 0.3269669908293961, + "learning_rate": 0.00029530596756606887, + "loss": 3.1337761878967285, + "step": 4294, + "token_acc": 0.28435513657739203 + }, + { + "epoch": 2.517736734095573, + "grad_norm": 0.34938551979373583, + "learning_rate": 0.00029530235838617366, + "loss": 3.1000711917877197, + "step": 4295, + "token_acc": 0.28843630325915215 + }, + { + "epoch": 2.518323072412782, + "grad_norm": 0.2869628877369993, + "learning_rate": 0.00029529874784135855, + "loss": 3.114349365234375, + "step": 4296, + "token_acc": 0.28658154597579105 + }, + { + "epoch": 2.518909410729991, + "grad_norm": 0.3050272365898088, + "learning_rate": 0.00029529513593165753, + "loss": 3.1176042556762695, + "step": 4297, + "token_acc": 0.2866637739605401 + }, + { + "epoch": 2.5194957490472003, + "grad_norm": 0.30225819434537354, + "learning_rate": 0.0002952915226571045, + "loss": 3.159348964691162, + "step": 4298, + "token_acc": 0.2810411175675745 + }, + { + "epoch": 2.5200820873644094, + "grad_norm": 0.2856017547883984, + "learning_rate": 0.00029528790801773336, + "loss": 3.13706636428833, + "step": 4299, + "token_acc": 0.28298534217747157 + }, + { + "epoch": 2.5206684256816185, + "grad_norm": 0.31889563939603155, + "learning_rate": 0.00029528429201357813, + "loss": 3.126056671142578, + "step": 4300, + "token_acc": 0.28436832246339155 + }, + { + "epoch": 2.521254763998827, + "grad_norm": 0.29718437847148615, + "learning_rate": 0.0002952806746446727, + "loss": 3.1044211387634277, + "step": 4301, + "token_acc": 0.287364331491237 + }, + { + "epoch": 2.5218411023160363, + "grad_norm": 0.2812343284948562, + "learning_rate": 0.0002952770559110511, + "loss": 3.110229015350342, + "step": 4302, + "token_acc": 0.2855062331047399 + }, + { + "epoch": 2.5224274406332454, + "grad_norm": 0.29906768468566935, + "learning_rate": 0.0002952734358127474, + "loss": 3.117032527923584, + "step": 4303, + "token_acc": 0.28555354610657846 + }, + { + "epoch": 2.5230137789504545, + "grad_norm": 0.3017265818305121, + "learning_rate": 0.00029526981434979546, + "loss": 3.098659038543701, + "step": 4304, + "token_acc": 0.28831436756292356 + }, + { + "epoch": 2.523600117267663, + "grad_norm": 0.3150195391158688, + "learning_rate": 0.0002952661915222294, + "loss": 3.1114182472229004, + "step": 4305, + "token_acc": 0.28765731802027455 + }, + { + "epoch": 2.5241864555848723, + "grad_norm": 0.3178407889061973, + "learning_rate": 0.0002952625673300832, + "loss": 3.1247172355651855, + "step": 4306, + "token_acc": 0.28514862513791056 + }, + { + "epoch": 2.5247727939020814, + "grad_norm": 0.295426154146563, + "learning_rate": 0.00029525894177339095, + "loss": 3.1134142875671387, + "step": 4307, + "token_acc": 0.285136732603042 + }, + { + "epoch": 2.5253591322192905, + "grad_norm": 0.27540679341412566, + "learning_rate": 0.0002952553148521867, + "loss": 3.139166831970215, + "step": 4308, + "token_acc": 0.28229051997874627 + }, + { + "epoch": 2.5259454705364996, + "grad_norm": 0.3074991356813386, + "learning_rate": 0.0002952516865665045, + "loss": 3.1365439891815186, + "step": 4309, + "token_acc": 0.2835927002281179 + }, + { + "epoch": 2.5265318088537088, + "grad_norm": 0.3355890767173872, + "learning_rate": 0.00029524805691637837, + "loss": 3.131884813308716, + "step": 4310, + "token_acc": 0.2829848938746893 + }, + { + "epoch": 2.527118147170918, + "grad_norm": 0.32943775594173325, + "learning_rate": 0.0002952444259018425, + "loss": 3.0950441360473633, + "step": 4311, + "token_acc": 0.2897810374262723 + }, + { + "epoch": 2.5277044854881265, + "grad_norm": 0.29195384204391867, + "learning_rate": 0.000295240793522931, + "loss": 3.135244607925415, + "step": 4312, + "token_acc": 0.2836666639925338 + }, + { + "epoch": 2.5282908238053357, + "grad_norm": 0.32707942250435523, + "learning_rate": 0.0002952371597796779, + "loss": 3.110598564147949, + "step": 4313, + "token_acc": 0.2868764627569894 + }, + { + "epoch": 2.5288771621225448, + "grad_norm": 0.28930101968404526, + "learning_rate": 0.00029523352467211744, + "loss": 3.1911144256591797, + "step": 4314, + "token_acc": 0.27720302026228594 + }, + { + "epoch": 2.529463500439754, + "grad_norm": 0.29720059197993975, + "learning_rate": 0.00029522988820028366, + "loss": 3.142454147338867, + "step": 4315, + "token_acc": 0.28396613648024627 + }, + { + "epoch": 2.5300498387569625, + "grad_norm": 0.3176909678364232, + "learning_rate": 0.00029522625036421086, + "loss": 3.125840663909912, + "step": 4316, + "token_acc": 0.2853479948927743 + }, + { + "epoch": 2.5306361770741717, + "grad_norm": 0.323385488028823, + "learning_rate": 0.0002952226111639331, + "loss": 3.1766505241394043, + "step": 4317, + "token_acc": 0.28011322871417577 + }, + { + "epoch": 2.5312225153913808, + "grad_norm": 0.5711545762521465, + "learning_rate": 0.0002952189705994846, + "loss": 3.1467244625091553, + "step": 4318, + "token_acc": 0.2823093290865852 + }, + { + "epoch": 2.53180885370859, + "grad_norm": 0.42517476164799983, + "learning_rate": 0.00029521532867089956, + "loss": 3.148420810699463, + "step": 4319, + "token_acc": 0.2810075343025155 + }, + { + "epoch": 2.532395192025799, + "grad_norm": 0.4531341666926811, + "learning_rate": 0.00029521168537821217, + "loss": 3.0976853370666504, + "step": 4320, + "token_acc": 0.2895007854124222 + }, + { + "epoch": 2.532981530343008, + "grad_norm": 0.3601458314074524, + "learning_rate": 0.0002952080407214567, + "loss": 3.1023125648498535, + "step": 4321, + "token_acc": 0.2887278768129832 + }, + { + "epoch": 2.533567868660217, + "grad_norm": 0.3055958516578653, + "learning_rate": 0.00029520439470066736, + "loss": 3.102931499481201, + "step": 4322, + "token_acc": 0.2879292025876562 + }, + { + "epoch": 2.534154206977426, + "grad_norm": 0.34252233101167223, + "learning_rate": 0.00029520074731587836, + "loss": 3.08762788772583, + "step": 4323, + "token_acc": 0.29119557842140703 + }, + { + "epoch": 2.534740545294635, + "grad_norm": 0.31070446588425027, + "learning_rate": 0.000295197098567124, + "loss": 3.13425350189209, + "step": 4324, + "token_acc": 0.28301842390139054 + }, + { + "epoch": 2.535326883611844, + "grad_norm": 0.33073183951612883, + "learning_rate": 0.0002951934484544386, + "loss": 3.094320297241211, + "step": 4325, + "token_acc": 0.2906551310783343 + }, + { + "epoch": 2.535913221929053, + "grad_norm": 0.2871215998342306, + "learning_rate": 0.00029518979697785633, + "loss": 3.0834999084472656, + "step": 4326, + "token_acc": 0.2900523161316538 + }, + { + "epoch": 2.536499560246262, + "grad_norm": 0.32334953741881967, + "learning_rate": 0.0002951861441374116, + "loss": 3.1239235401153564, + "step": 4327, + "token_acc": 0.28674789274014995 + }, + { + "epoch": 2.537085898563471, + "grad_norm": 0.2962183376528778, + "learning_rate": 0.0002951824899331387, + "loss": 3.1008574962615967, + "step": 4328, + "token_acc": 0.2891705696900502 + }, + { + "epoch": 2.53767223688068, + "grad_norm": 0.28892001601227174, + "learning_rate": 0.00029517883436507193, + "loss": 3.1234235763549805, + "step": 4329, + "token_acc": 0.28476881600551357 + }, + { + "epoch": 2.538258575197889, + "grad_norm": 0.322772654724156, + "learning_rate": 0.0002951751774332456, + "loss": 3.1584339141845703, + "step": 4330, + "token_acc": 0.2800853961649642 + }, + { + "epoch": 2.5388449135150983, + "grad_norm": 0.26843708580399667, + "learning_rate": 0.0002951715191376942, + "loss": 3.148487091064453, + "step": 4331, + "token_acc": 0.2811636856424633 + }, + { + "epoch": 2.5394312518323074, + "grad_norm": 0.3221691230651787, + "learning_rate": 0.00029516785947845194, + "loss": 3.1144909858703613, + "step": 4332, + "token_acc": 0.2861130662275401 + }, + { + "epoch": 2.540017590149516, + "grad_norm": 0.32212569801984964, + "learning_rate": 0.00029516419845555326, + "loss": 3.081233024597168, + "step": 4333, + "token_acc": 0.29166847744115837 + }, + { + "epoch": 2.5406039284667252, + "grad_norm": 0.34875309863963444, + "learning_rate": 0.0002951605360690326, + "loss": 3.1339614391326904, + "step": 4334, + "token_acc": 0.2838891260692883 + }, + { + "epoch": 2.5411902667839343, + "grad_norm": 0.3005455802712372, + "learning_rate": 0.00029515687231892427, + "loss": 3.112187385559082, + "step": 4335, + "token_acc": 0.28716115101761586 + }, + { + "epoch": 2.5417766051011434, + "grad_norm": 0.2948602043787114, + "learning_rate": 0.0002951532072052627, + "loss": 3.1442360877990723, + "step": 4336, + "token_acc": 0.28142212673434025 + }, + { + "epoch": 2.542362943418352, + "grad_norm": 0.30669703887629524, + "learning_rate": 0.00029514954072808235, + "loss": 3.0923445224761963, + "step": 4337, + "token_acc": 0.2883934763983518 + }, + { + "epoch": 2.5429492817355612, + "grad_norm": 0.314359106506302, + "learning_rate": 0.0002951458728874177, + "loss": 3.1754751205444336, + "step": 4338, + "token_acc": 0.27958526614560164 + }, + { + "epoch": 2.5435356200527703, + "grad_norm": 0.3019288974426342, + "learning_rate": 0.00029514220368330305, + "loss": 3.116985559463501, + "step": 4339, + "token_acc": 0.2853228603019849 + }, + { + "epoch": 2.5441219583699795, + "grad_norm": 0.30992999749565403, + "learning_rate": 0.0002951385331157731, + "loss": 3.1041176319122314, + "step": 4340, + "token_acc": 0.28871285875515523 + }, + { + "epoch": 2.5447082966871886, + "grad_norm": 0.2951211858558363, + "learning_rate": 0.00029513486118486215, + "loss": 3.1062018871307373, + "step": 4341, + "token_acc": 0.28826703160146605 + }, + { + "epoch": 2.5452946350043977, + "grad_norm": 0.3043654689407853, + "learning_rate": 0.0002951311878906048, + "loss": 3.128478527069092, + "step": 4342, + "token_acc": 0.2852092350241292 + }, + { + "epoch": 2.545880973321607, + "grad_norm": 0.2949057042583338, + "learning_rate": 0.00029512751323303545, + "loss": 3.1382389068603516, + "step": 4343, + "token_acc": 0.28298161174873504 + }, + { + "epoch": 2.5464673116388155, + "grad_norm": 0.31525590252254637, + "learning_rate": 0.0002951238372121887, + "loss": 3.114595890045166, + "step": 4344, + "token_acc": 0.2859545366785558 + }, + { + "epoch": 2.5470536499560246, + "grad_norm": 0.26685994626090065, + "learning_rate": 0.00029512015982809906, + "loss": 3.1080551147460938, + "step": 4345, + "token_acc": 0.286880634280277 + }, + { + "epoch": 2.5476399882732337, + "grad_norm": 0.31901019579041623, + "learning_rate": 0.00029511648108080106, + "loss": 3.1524410247802734, + "step": 4346, + "token_acc": 0.28102772270503384 + }, + { + "epoch": 2.548226326590443, + "grad_norm": 0.32382884197008066, + "learning_rate": 0.0002951128009703293, + "loss": 3.185955286026001, + "step": 4347, + "token_acc": 0.2775910971879729 + }, + { + "epoch": 2.5488126649076515, + "grad_norm": 0.3107217353447862, + "learning_rate": 0.00029510911949671824, + "loss": 3.1457128524780273, + "step": 4348, + "token_acc": 0.28192732778592894 + }, + { + "epoch": 2.5493990032248606, + "grad_norm": 0.28768705683471113, + "learning_rate": 0.00029510543666000263, + "loss": 3.098903179168701, + "step": 4349, + "token_acc": 0.2870544628757129 + }, + { + "epoch": 2.5499853415420697, + "grad_norm": 0.346000343460998, + "learning_rate": 0.00029510175246021694, + "loss": 3.140838146209717, + "step": 4350, + "token_acc": 0.28419521924130386 + }, + { + "epoch": 2.550571679859279, + "grad_norm": 0.30412145309418237, + "learning_rate": 0.0002950980668973958, + "loss": 3.1386942863464355, + "step": 4351, + "token_acc": 0.2822545925172673 + }, + { + "epoch": 2.551158018176488, + "grad_norm": 0.2917771024965471, + "learning_rate": 0.0002950943799715738, + "loss": 3.127626419067383, + "step": 4352, + "token_acc": 0.28530725946619473 + }, + { + "epoch": 2.551744356493697, + "grad_norm": 0.3184346330605679, + "learning_rate": 0.0002950906916827857, + "loss": 3.1056160926818848, + "step": 4353, + "token_acc": 0.28769003902564966 + }, + { + "epoch": 2.552330694810906, + "grad_norm": 0.3366031887115573, + "learning_rate": 0.000295087002031066, + "loss": 3.10898494720459, + "step": 4354, + "token_acc": 0.287010877908339 + }, + { + "epoch": 2.552917033128115, + "grad_norm": 0.3900294131392648, + "learning_rate": 0.0002950833110164495, + "loss": 3.1653246879577637, + "step": 4355, + "token_acc": 0.2783011304417542 + }, + { + "epoch": 2.553503371445324, + "grad_norm": 0.39600210284819887, + "learning_rate": 0.00029507961863897074, + "loss": 3.1169683933258057, + "step": 4356, + "token_acc": 0.28607577422590713 + }, + { + "epoch": 2.554089709762533, + "grad_norm": 0.354412183900009, + "learning_rate": 0.0002950759248986645, + "loss": 3.1187214851379395, + "step": 4357, + "token_acc": 0.2864059706782161 + }, + { + "epoch": 2.554676048079742, + "grad_norm": 0.33466426404644734, + "learning_rate": 0.0002950722297955654, + "loss": 3.0871427059173584, + "step": 4358, + "token_acc": 0.28971643320806284 + }, + { + "epoch": 2.555262386396951, + "grad_norm": 0.35524089748517096, + "learning_rate": 0.00029506853332970814, + "loss": 3.147824287414551, + "step": 4359, + "token_acc": 0.28206004240680804 + }, + { + "epoch": 2.55584872471416, + "grad_norm": 0.3279110375968813, + "learning_rate": 0.0002950648355011276, + "loss": 3.1067605018615723, + "step": 4360, + "token_acc": 0.28561821373443513 + }, + { + "epoch": 2.556435063031369, + "grad_norm": 0.3370991569712934, + "learning_rate": 0.0002950611363098583, + "loss": 3.0919687747955322, + "step": 4361, + "token_acc": 0.2899748481999949 + }, + { + "epoch": 2.557021401348578, + "grad_norm": 0.3023634549399652, + "learning_rate": 0.0002950574357559352, + "loss": 3.170109272003174, + "step": 4362, + "token_acc": 0.27823921136804575 + }, + { + "epoch": 2.5576077396657872, + "grad_norm": 0.31447414054696804, + "learning_rate": 0.0002950537338393929, + "loss": 3.1639931201934814, + "step": 4363, + "token_acc": 0.2782095334120488 + }, + { + "epoch": 2.5581940779829964, + "grad_norm": 0.2977151656534409, + "learning_rate": 0.0002950500305602662, + "loss": 3.1461853981018066, + "step": 4364, + "token_acc": 0.2817006955111794 + }, + { + "epoch": 2.5587804163002055, + "grad_norm": 0.27639854561917093, + "learning_rate": 0.00029504632591859, + "loss": 3.1195225715637207, + "step": 4365, + "token_acc": 0.2858139642220092 + }, + { + "epoch": 2.559366754617414, + "grad_norm": 0.2717349340870264, + "learning_rate": 0.000295042619914399, + "loss": 3.1506540775299072, + "step": 4366, + "token_acc": 0.2804789012968571 + }, + { + "epoch": 2.5599530929346233, + "grad_norm": 0.28384034115080187, + "learning_rate": 0.000295038912547728, + "loss": 3.1091511249542236, + "step": 4367, + "token_acc": 0.2861622933026879 + }, + { + "epoch": 2.5605394312518324, + "grad_norm": 0.29981542517857024, + "learning_rate": 0.00029503520381861186, + "loss": 3.1431097984313965, + "step": 4368, + "token_acc": 0.2806266597448236 + }, + { + "epoch": 2.5611257695690415, + "grad_norm": 0.3211722355759345, + "learning_rate": 0.00029503149372708543, + "loss": 3.1561903953552246, + "step": 4369, + "token_acc": 0.2825811623246493 + }, + { + "epoch": 2.56171210788625, + "grad_norm": 0.3318296788898076, + "learning_rate": 0.0002950277822731835, + "loss": 3.1563777923583984, + "step": 4370, + "token_acc": 0.28234208552935985 + }, + { + "epoch": 2.5622984462034593, + "grad_norm": 0.28784693971253433, + "learning_rate": 0.0002950240694569411, + "loss": 3.125364303588867, + "step": 4371, + "token_acc": 0.2840758917697542 + }, + { + "epoch": 2.5628847845206684, + "grad_norm": 0.29734017209625707, + "learning_rate": 0.0002950203552783929, + "loss": 3.1248130798339844, + "step": 4372, + "token_acc": 0.2842547828602124 + }, + { + "epoch": 2.5634711228378775, + "grad_norm": 0.2602852303075442, + "learning_rate": 0.0002950166397375739, + "loss": 3.1469712257385254, + "step": 4373, + "token_acc": 0.2832765909301297 + }, + { + "epoch": 2.5640574611550866, + "grad_norm": 0.2692801805455761, + "learning_rate": 0.000295012922834519, + "loss": 3.1314332485198975, + "step": 4374, + "token_acc": 0.28464791848314097 + }, + { + "epoch": 2.5646437994722957, + "grad_norm": 0.3151564119152561, + "learning_rate": 0.00029500920456926305, + "loss": 3.105403184890747, + "step": 4375, + "token_acc": 0.2869601566110531 + }, + { + "epoch": 2.565230137789505, + "grad_norm": 0.33683323513831975, + "learning_rate": 0.0002950054849418411, + "loss": 3.165574550628662, + "step": 4376, + "token_acc": 0.2798517244633663 + }, + { + "epoch": 2.5658164761067135, + "grad_norm": 0.28715701717167186, + "learning_rate": 0.00029500176395228796, + "loss": 3.12180233001709, + "step": 4377, + "token_acc": 0.28493311429241963 + }, + { + "epoch": 2.5664028144239226, + "grad_norm": 0.30395840474823715, + "learning_rate": 0.00029499804160063866, + "loss": 3.117748975753784, + "step": 4378, + "token_acc": 0.28422642507335777 + }, + { + "epoch": 2.5669891527411317, + "grad_norm": 0.29727573089296977, + "learning_rate": 0.0002949943178869281, + "loss": 3.0939571857452393, + "step": 4379, + "token_acc": 0.28889755940047895 + }, + { + "epoch": 2.567575491058341, + "grad_norm": 0.36473946332686935, + "learning_rate": 0.0002949905928111914, + "loss": 3.110833168029785, + "step": 4380, + "token_acc": 0.28693505104984307 + }, + { + "epoch": 2.5681618293755495, + "grad_norm": 0.32293143268439933, + "learning_rate": 0.0002949868663734634, + "loss": 3.1611194610595703, + "step": 4381, + "token_acc": 0.2802599594953906 + }, + { + "epoch": 2.5687481676927586, + "grad_norm": 0.29880298500069874, + "learning_rate": 0.00029498313857377915, + "loss": 3.092435359954834, + "step": 4382, + "token_acc": 0.2878354451244477 + }, + { + "epoch": 2.5693345060099677, + "grad_norm": 0.3148009530606465, + "learning_rate": 0.0002949794094121737, + "loss": 3.1207785606384277, + "step": 4383, + "token_acc": 0.2864595771258519 + }, + { + "epoch": 2.569920844327177, + "grad_norm": 0.35304362688843116, + "learning_rate": 0.0002949756788886821, + "loss": 3.138930320739746, + "step": 4384, + "token_acc": 0.2838769967886166 + }, + { + "epoch": 2.570507182644386, + "grad_norm": 0.3799587311765748, + "learning_rate": 0.0002949719470033393, + "loss": 3.119027614593506, + "step": 4385, + "token_acc": 0.28666809973091195 + }, + { + "epoch": 2.571093520961595, + "grad_norm": 0.3515453375963114, + "learning_rate": 0.0002949682137561804, + "loss": 3.1399941444396973, + "step": 4386, + "token_acc": 0.2856207556131815 + }, + { + "epoch": 2.5716798592788037, + "grad_norm": 0.3072085172667316, + "learning_rate": 0.0002949644791472405, + "loss": 3.1417970657348633, + "step": 4387, + "token_acc": 0.2811423744232894 + }, + { + "epoch": 2.572266197596013, + "grad_norm": 0.3462903309993942, + "learning_rate": 0.0002949607431765547, + "loss": 3.1345481872558594, + "step": 4388, + "token_acc": 0.28480670591964746 + }, + { + "epoch": 2.572852535913222, + "grad_norm": 0.3250746652358085, + "learning_rate": 0.000294957005844158, + "loss": 3.113795042037964, + "step": 4389, + "token_acc": 0.28636478558566 + }, + { + "epoch": 2.573438874230431, + "grad_norm": 0.32276943273505265, + "learning_rate": 0.00029495326715008556, + "loss": 3.1472067832946777, + "step": 4390, + "token_acc": 0.2831517702726464 + }, + { + "epoch": 2.5740252125476397, + "grad_norm": 0.30551538617941953, + "learning_rate": 0.0002949495270943725, + "loss": 3.120004653930664, + "step": 4391, + "token_acc": 0.28606939064943976 + }, + { + "epoch": 2.574611550864849, + "grad_norm": 0.29785636449092207, + "learning_rate": 0.000294945785677054, + "loss": 3.1246328353881836, + "step": 4392, + "token_acc": 0.28412799501084773 + }, + { + "epoch": 2.575197889182058, + "grad_norm": 0.29342010828032744, + "learning_rate": 0.00029494204289816513, + "loss": 3.086207866668701, + "step": 4393, + "token_acc": 0.29039409759050483 + }, + { + "epoch": 2.575784227499267, + "grad_norm": 0.27444581647952726, + "learning_rate": 0.0002949382987577411, + "loss": 3.1350343227386475, + "step": 4394, + "token_acc": 0.28434525901826513 + }, + { + "epoch": 2.576370565816476, + "grad_norm": 0.29473224835262446, + "learning_rate": 0.00029493455325581703, + "loss": 3.120664596557617, + "step": 4395, + "token_acc": 0.2857770704635971 + }, + { + "epoch": 2.5769569041336853, + "grad_norm": 0.29736177203859415, + "learning_rate": 0.00029493080639242814, + "loss": 3.154418468475342, + "step": 4396, + "token_acc": 0.2791622644606058 + }, + { + "epoch": 2.5775432424508944, + "grad_norm": 0.29434797814593494, + "learning_rate": 0.0002949270581676096, + "loss": 3.202594757080078, + "step": 4397, + "token_acc": 0.2738442415260315 + }, + { + "epoch": 2.578129580768103, + "grad_norm": 0.3057331415442335, + "learning_rate": 0.0002949233085813967, + "loss": 3.1284661293029785, + "step": 4398, + "token_acc": 0.2839789315117761 + }, + { + "epoch": 2.578715919085312, + "grad_norm": 0.34712178785429587, + "learning_rate": 0.0002949195576338246, + "loss": 3.1357181072235107, + "step": 4399, + "token_acc": 0.2839457627118644 + }, + { + "epoch": 2.5793022574025213, + "grad_norm": 0.30680029040518214, + "learning_rate": 0.0002949158053249285, + "loss": 3.1408510208129883, + "step": 4400, + "token_acc": 0.28282620331864783 + }, + { + "epoch": 2.5798885957197304, + "grad_norm": 0.2839143343176179, + "learning_rate": 0.00029491205165474367, + "loss": 3.0932655334472656, + "step": 4401, + "token_acc": 0.28915264836470517 + }, + { + "epoch": 2.580474934036939, + "grad_norm": 0.351168746015683, + "learning_rate": 0.00029490829662330543, + "loss": 3.125319004058838, + "step": 4402, + "token_acc": 0.2865990348191583 + }, + { + "epoch": 2.581061272354148, + "grad_norm": 0.32370112940416346, + "learning_rate": 0.000294904540230649, + "loss": 3.1333746910095215, + "step": 4403, + "token_acc": 0.2832029194006807 + }, + { + "epoch": 2.5816476106713573, + "grad_norm": 0.3021806819239285, + "learning_rate": 0.0002949007824768097, + "loss": 3.127964973449707, + "step": 4404, + "token_acc": 0.28453553027597817 + }, + { + "epoch": 2.5822339489885664, + "grad_norm": 0.2892606214211373, + "learning_rate": 0.00029489702336182275, + "loss": 3.065701723098755, + "step": 4405, + "token_acc": 0.29287432489960047 + }, + { + "epoch": 2.5828202873057755, + "grad_norm": 0.3253737997067806, + "learning_rate": 0.00029489326288572356, + "loss": 3.085606098175049, + "step": 4406, + "token_acc": 0.29013771407885547 + }, + { + "epoch": 2.5834066256229846, + "grad_norm": 0.3208388678077559, + "learning_rate": 0.0002948895010485473, + "loss": 3.082479953765869, + "step": 4407, + "token_acc": 0.2925389328567781 + }, + { + "epoch": 2.5839929639401937, + "grad_norm": 0.40104695294976167, + "learning_rate": 0.00029488573785032955, + "loss": 3.125638008117676, + "step": 4408, + "token_acc": 0.2870836203261635 + }, + { + "epoch": 2.5845793022574024, + "grad_norm": 0.336878912008137, + "learning_rate": 0.0002948819732911055, + "loss": 3.0985116958618164, + "step": 4409, + "token_acc": 0.2889409385709314 + }, + { + "epoch": 2.5851656405746115, + "grad_norm": 0.3775450529158059, + "learning_rate": 0.00029487820737091053, + "loss": 3.1670422554016113, + "step": 4410, + "token_acc": 0.2793131373489429 + }, + { + "epoch": 2.5857519788918206, + "grad_norm": 0.363154698614877, + "learning_rate": 0.00029487444008978003, + "loss": 3.065610408782959, + "step": 4411, + "token_acc": 0.29208049587087187 + }, + { + "epoch": 2.5863383172090297, + "grad_norm": 0.3270768057389953, + "learning_rate": 0.0002948706714477494, + "loss": 3.159055471420288, + "step": 4412, + "token_acc": 0.28089671274863 + }, + { + "epoch": 2.5869246555262384, + "grad_norm": 0.3609294287854356, + "learning_rate": 0.000294866901444854, + "loss": 3.093126058578491, + "step": 4413, + "token_acc": 0.2893595756531516 + }, + { + "epoch": 2.5875109938434475, + "grad_norm": 0.3052366351885031, + "learning_rate": 0.00029486313008112927, + "loss": 3.1404781341552734, + "step": 4414, + "token_acc": 0.2833371942423317 + }, + { + "epoch": 2.5880973321606566, + "grad_norm": 0.30651872213848946, + "learning_rate": 0.00029485935735661063, + "loss": 3.103621482849121, + "step": 4415, + "token_acc": 0.2880036348572116 + }, + { + "epoch": 2.5886836704778657, + "grad_norm": 0.32659771474955773, + "learning_rate": 0.0002948555832713336, + "loss": 3.0994062423706055, + "step": 4416, + "token_acc": 0.28908911532117587 + }, + { + "epoch": 2.589270008795075, + "grad_norm": 0.3286291408074442, + "learning_rate": 0.0002948518078253335, + "loss": 3.1457695960998535, + "step": 4417, + "token_acc": 0.28193285221211173 + }, + { + "epoch": 2.589856347112284, + "grad_norm": 0.2847441453989785, + "learning_rate": 0.00029484803101864583, + "loss": 3.173375368118286, + "step": 4418, + "token_acc": 0.2788848465881678 + }, + { + "epoch": 2.590442685429493, + "grad_norm": 0.3270679224976089, + "learning_rate": 0.00029484425285130613, + "loss": 3.0986316204071045, + "step": 4419, + "token_acc": 0.28881507581661103 + }, + { + "epoch": 2.5910290237467017, + "grad_norm": 0.27752761878569976, + "learning_rate": 0.00029484047332334985, + "loss": 3.118795871734619, + "step": 4420, + "token_acc": 0.2868318628986941 + }, + { + "epoch": 2.591615362063911, + "grad_norm": 0.3178909534396568, + "learning_rate": 0.00029483669243481254, + "loss": 3.115995168685913, + "step": 4421, + "token_acc": 0.28604252667653246 + }, + { + "epoch": 2.59220170038112, + "grad_norm": 0.3080345373551721, + "learning_rate": 0.0002948329101857296, + "loss": 3.121906280517578, + "step": 4422, + "token_acc": 0.2859240930678054 + }, + { + "epoch": 2.592788038698329, + "grad_norm": 0.348775077062953, + "learning_rate": 0.0002948291265761367, + "loss": 3.074571132659912, + "step": 4423, + "token_acc": 0.29275791091761905 + }, + { + "epoch": 2.5933743770155377, + "grad_norm": 0.27911861216986267, + "learning_rate": 0.0002948253416060693, + "loss": 3.147961139678955, + "step": 4424, + "token_acc": 0.28228315979965446 + }, + { + "epoch": 2.593960715332747, + "grad_norm": 0.3384160526771754, + "learning_rate": 0.00029482155527556296, + "loss": 3.1292662620544434, + "step": 4425, + "token_acc": 0.2843295357991295 + }, + { + "epoch": 2.594547053649956, + "grad_norm": 0.3408272751847346, + "learning_rate": 0.00029481776758465323, + "loss": 3.1380972862243652, + "step": 4426, + "token_acc": 0.28467994501235155 + }, + { + "epoch": 2.595133391967165, + "grad_norm": 0.3126027704893727, + "learning_rate": 0.00029481397853337575, + "loss": 3.09269380569458, + "step": 4427, + "token_acc": 0.2888217349658504 + }, + { + "epoch": 2.595719730284374, + "grad_norm": 0.2881480346729404, + "learning_rate": 0.00029481018812176605, + "loss": 3.0983381271362305, + "step": 4428, + "token_acc": 0.2885320098234606 + }, + { + "epoch": 2.5963060686015833, + "grad_norm": 0.344508246663763, + "learning_rate": 0.0002948063963498598, + "loss": 3.105874538421631, + "step": 4429, + "token_acc": 0.28835819038261623 + }, + { + "epoch": 2.5968924069187924, + "grad_norm": 0.3583760620853102, + "learning_rate": 0.0002948026032176926, + "loss": 3.129619598388672, + "step": 4430, + "token_acc": 0.28583458011551943 + }, + { + "epoch": 2.597478745236001, + "grad_norm": 0.31829518365844917, + "learning_rate": 0.0002947988087253, + "loss": 3.1357240676879883, + "step": 4431, + "token_acc": 0.28329746321843136 + }, + { + "epoch": 2.59806508355321, + "grad_norm": 0.3003434767787419, + "learning_rate": 0.00029479501287271774, + "loss": 3.1525259017944336, + "step": 4432, + "token_acc": 0.2789810927358752 + }, + { + "epoch": 2.5986514218704193, + "grad_norm": 0.3385922992762416, + "learning_rate": 0.0002947912156599815, + "loss": 3.121633529663086, + "step": 4433, + "token_acc": 0.28573407792650696 + }, + { + "epoch": 2.5992377601876284, + "grad_norm": 0.3716887587928683, + "learning_rate": 0.00029478741708712685, + "loss": 3.115966558456421, + "step": 4434, + "token_acc": 0.2847943947346327 + }, + { + "epoch": 2.599824098504837, + "grad_norm": 0.3004205571408154, + "learning_rate": 0.00029478361715418953, + "loss": 3.1105775833129883, + "step": 4435, + "token_acc": 0.28791223738176946 + }, + { + "epoch": 2.600410436822046, + "grad_norm": 0.31006673035742477, + "learning_rate": 0.0002947798158612052, + "loss": 3.109954833984375, + "step": 4436, + "token_acc": 0.28654567446696766 + }, + { + "epoch": 2.6009967751392553, + "grad_norm": 0.30977658998328045, + "learning_rate": 0.0002947760132082096, + "loss": 3.082505941390991, + "step": 4437, + "token_acc": 0.2898430184810299 + }, + { + "epoch": 2.6015831134564644, + "grad_norm": 0.3059373897227026, + "learning_rate": 0.0002947722091952385, + "loss": 3.123225450515747, + "step": 4438, + "token_acc": 0.28598095803425677 + }, + { + "epoch": 2.6021694517736735, + "grad_norm": 0.30179314557423453, + "learning_rate": 0.0002947684038223275, + "loss": 3.104154348373413, + "step": 4439, + "token_acc": 0.2862873429747704 + }, + { + "epoch": 2.6027557900908826, + "grad_norm": 0.2845127059378352, + "learning_rate": 0.0002947645970895125, + "loss": 3.113457441329956, + "step": 4440, + "token_acc": 0.2861876554083447 + }, + { + "epoch": 2.6033421284080913, + "grad_norm": 0.3001924954573138, + "learning_rate": 0.0002947607889968291, + "loss": 3.130687713623047, + "step": 4441, + "token_acc": 0.28544855596861896 + }, + { + "epoch": 2.6039284667253004, + "grad_norm": 0.3334417464516731, + "learning_rate": 0.0002947569795443132, + "loss": 3.160916805267334, + "step": 4442, + "token_acc": 0.28023702705599535 + }, + { + "epoch": 2.6045148050425095, + "grad_norm": 0.3241429607638957, + "learning_rate": 0.00029475316873200057, + "loss": 3.135272979736328, + "step": 4443, + "token_acc": 0.2831368738257011 + }, + { + "epoch": 2.6051011433597187, + "grad_norm": 0.30111457204944936, + "learning_rate": 0.00029474935655992695, + "loss": 3.103388786315918, + "step": 4444, + "token_acc": 0.28825775451542324 + }, + { + "epoch": 2.6056874816769273, + "grad_norm": 0.3710035222902008, + "learning_rate": 0.00029474554302812817, + "loss": 3.15543270111084, + "step": 4445, + "token_acc": 0.2791620079167837 + }, + { + "epoch": 2.6062738199941364, + "grad_norm": 0.3334417052254881, + "learning_rate": 0.00029474172813664007, + "loss": 3.1264641284942627, + "step": 4446, + "token_acc": 0.28281221666709844 + }, + { + "epoch": 2.6068601583113455, + "grad_norm": 0.3298689803948548, + "learning_rate": 0.0002947379118854984, + "loss": 3.1219534873962402, + "step": 4447, + "token_acc": 0.2847492237910333 + }, + { + "epoch": 2.6074464966285547, + "grad_norm": 0.3077807031188826, + "learning_rate": 0.0002947340942747392, + "loss": 3.1514925956726074, + "step": 4448, + "token_acc": 0.2815697623571877 + }, + { + "epoch": 2.6080328349457638, + "grad_norm": 0.29836743433400387, + "learning_rate": 0.00029473027530439814, + "loss": 3.0916762351989746, + "step": 4449, + "token_acc": 0.29010443749401754 + }, + { + "epoch": 2.608619173262973, + "grad_norm": 0.3293911453983814, + "learning_rate": 0.00029472645497451123, + "loss": 3.0977602005004883, + "step": 4450, + "token_acc": 0.28738383956538127 + }, + { + "epoch": 2.609205511580182, + "grad_norm": 0.31931957019786134, + "learning_rate": 0.00029472263328511426, + "loss": 3.120115280151367, + "step": 4451, + "token_acc": 0.28677817613357254 + }, + { + "epoch": 2.6097918498973907, + "grad_norm": 0.26689231153298926, + "learning_rate": 0.0002947188102362432, + "loss": 3.1187400817871094, + "step": 4452, + "token_acc": 0.28688228711691877 + }, + { + "epoch": 2.6103781882145998, + "grad_norm": 0.3180540684089845, + "learning_rate": 0.0002947149858279339, + "loss": 3.1094088554382324, + "step": 4453, + "token_acc": 0.2863388846933503 + }, + { + "epoch": 2.610964526531809, + "grad_norm": 0.29389254654758384, + "learning_rate": 0.0002947111600602223, + "loss": 3.1249537467956543, + "step": 4454, + "token_acc": 0.28545080604314926 + }, + { + "epoch": 2.611550864849018, + "grad_norm": 0.28507612272920185, + "learning_rate": 0.00029470733293314433, + "loss": 3.097743511199951, + "step": 4455, + "token_acc": 0.2896060779681503 + }, + { + "epoch": 2.6121372031662267, + "grad_norm": 0.30219065686933205, + "learning_rate": 0.000294703504446736, + "loss": 3.0970048904418945, + "step": 4456, + "token_acc": 0.2881431963432649 + }, + { + "epoch": 2.6127235414834358, + "grad_norm": 0.2873429390782826, + "learning_rate": 0.00029469967460103323, + "loss": 3.1402952671051025, + "step": 4457, + "token_acc": 0.28191646528708464 + }, + { + "epoch": 2.613309879800645, + "grad_norm": 0.28657919296288137, + "learning_rate": 0.00029469584339607204, + "loss": 3.1217007637023926, + "step": 4458, + "token_acc": 0.2844849986734172 + }, + { + "epoch": 2.613896218117854, + "grad_norm": 0.2880679747077227, + "learning_rate": 0.0002946920108318884, + "loss": 3.134267807006836, + "step": 4459, + "token_acc": 0.2840379782770984 + }, + { + "epoch": 2.614482556435063, + "grad_norm": 0.3012431186024602, + "learning_rate": 0.0002946881769085182, + "loss": 3.173287868499756, + "step": 4460, + "token_acc": 0.27856562753795217 + }, + { + "epoch": 2.615068894752272, + "grad_norm": 0.27696098180741774, + "learning_rate": 0.0002946843416259976, + "loss": 3.1313071250915527, + "step": 4461, + "token_acc": 0.283089091717668 + }, + { + "epoch": 2.6156552330694813, + "grad_norm": 0.28783535693619433, + "learning_rate": 0.00029468050498436256, + "loss": 3.100653886795044, + "step": 4462, + "token_acc": 0.2871162687039043 + }, + { + "epoch": 2.61624157138669, + "grad_norm": 0.2986841496382395, + "learning_rate": 0.00029467666698364915, + "loss": 3.082475423812866, + "step": 4463, + "token_acc": 0.290224406625505 + }, + { + "epoch": 2.616827909703899, + "grad_norm": 0.3123607023323966, + "learning_rate": 0.0002946728276238934, + "loss": 3.0614943504333496, + "step": 4464, + "token_acc": 0.2940618170465189 + }, + { + "epoch": 2.6174142480211082, + "grad_norm": 0.3483738296595411, + "learning_rate": 0.00029466898690513134, + "loss": 3.1398444175720215, + "step": 4465, + "token_acc": 0.28255523145476197 + }, + { + "epoch": 2.6180005863383173, + "grad_norm": 0.34501401573811336, + "learning_rate": 0.00029466514482739915, + "loss": 3.130096435546875, + "step": 4466, + "token_acc": 0.28659926040596195 + }, + { + "epoch": 2.618586924655526, + "grad_norm": 0.34134563497421655, + "learning_rate": 0.0002946613013907329, + "loss": 3.142551898956299, + "step": 4467, + "token_acc": 0.2841445986364808 + }, + { + "epoch": 2.619173262972735, + "grad_norm": 0.28983829480261003, + "learning_rate": 0.00029465745659516856, + "loss": 3.0939371585845947, + "step": 4468, + "token_acc": 0.28951004604113284 + }, + { + "epoch": 2.6197596012899442, + "grad_norm": 0.26830473908993374, + "learning_rate": 0.0002946536104407424, + "loss": 3.0971415042877197, + "step": 4469, + "token_acc": 0.28951914182479777 + }, + { + "epoch": 2.6203459396071533, + "grad_norm": 0.2982651301678548, + "learning_rate": 0.00029464976292749046, + "loss": 3.1109986305236816, + "step": 4470, + "token_acc": 0.2855461937679911 + }, + { + "epoch": 2.6209322779243625, + "grad_norm": 0.30722198286480995, + "learning_rate": 0.00029464591405544896, + "loss": 3.0927956104278564, + "step": 4471, + "token_acc": 0.2898912792042563 + }, + { + "epoch": 2.6215186162415716, + "grad_norm": 0.30232189063020365, + "learning_rate": 0.00029464206382465397, + "loss": 3.0823330879211426, + "step": 4472, + "token_acc": 0.2922501769275899 + }, + { + "epoch": 2.6221049545587807, + "grad_norm": 0.27353034870617754, + "learning_rate": 0.00029463821223514174, + "loss": 3.1145758628845215, + "step": 4473, + "token_acc": 0.28739488671749736 + }, + { + "epoch": 2.6226912928759893, + "grad_norm": 0.2972198770220653, + "learning_rate": 0.0002946343592869484, + "loss": 3.168844699859619, + "step": 4474, + "token_acc": 0.27997429636293536 + }, + { + "epoch": 2.6232776311931985, + "grad_norm": 0.3669913922981606, + "learning_rate": 0.00029463050498011007, + "loss": 3.1001155376434326, + "step": 4475, + "token_acc": 0.2881258315757477 + }, + { + "epoch": 2.6238639695104076, + "grad_norm": 0.36798295599954267, + "learning_rate": 0.00029462664931466316, + "loss": 3.1225318908691406, + "step": 4476, + "token_acc": 0.2845547470087015 + }, + { + "epoch": 2.6244503078276167, + "grad_norm": 0.3046180335034568, + "learning_rate": 0.00029462279229064365, + "loss": 3.1383934020996094, + "step": 4477, + "token_acc": 0.2831128585744542 + }, + { + "epoch": 2.6250366461448253, + "grad_norm": 0.35624790695411007, + "learning_rate": 0.000294618933908088, + "loss": 3.0853304862976074, + "step": 4478, + "token_acc": 0.2901535511573708 + }, + { + "epoch": 2.6256229844620345, + "grad_norm": 0.3970435949496295, + "learning_rate": 0.0002946150741670323, + "loss": 3.115877151489258, + "step": 4479, + "token_acc": 0.2864246762280501 + }, + { + "epoch": 2.6262093227792436, + "grad_norm": 0.313722321332007, + "learning_rate": 0.0002946112130675128, + "loss": 3.1043930053710938, + "step": 4480, + "token_acc": 0.288219149840869 + }, + { + "epoch": 2.6267956610964527, + "grad_norm": 0.3308439121741794, + "learning_rate": 0.00029460735060956586, + "loss": 3.1465582847595215, + "step": 4481, + "token_acc": 0.2827420566766853 + }, + { + "epoch": 2.627381999413662, + "grad_norm": 0.32563490913710724, + "learning_rate": 0.00029460348679322774, + "loss": 3.106356143951416, + "step": 4482, + "token_acc": 0.28769485101831094 + }, + { + "epoch": 2.627968337730871, + "grad_norm": 0.329831870898015, + "learning_rate": 0.0002945996216185347, + "loss": 3.0992166996002197, + "step": 4483, + "token_acc": 0.28703242758301234 + }, + { + "epoch": 2.62855467604808, + "grad_norm": 0.31865717732894344, + "learning_rate": 0.00029459575508552306, + "loss": 3.1208362579345703, + "step": 4484, + "token_acc": 0.2859584060500291 + }, + { + "epoch": 2.6291410143652887, + "grad_norm": 0.3259990229745666, + "learning_rate": 0.00029459188719422913, + "loss": 3.15392804145813, + "step": 4485, + "token_acc": 0.2807935548016113 + }, + { + "epoch": 2.629727352682498, + "grad_norm": 0.31743075077054056, + "learning_rate": 0.0002945880179446893, + "loss": 3.113034725189209, + "step": 4486, + "token_acc": 0.2870712455896948 + }, + { + "epoch": 2.630313690999707, + "grad_norm": 0.26805614139764844, + "learning_rate": 0.0002945841473369399, + "loss": 3.109351396560669, + "step": 4487, + "token_acc": 0.2884074907008025 + }, + { + "epoch": 2.630900029316916, + "grad_norm": 0.2759313414608668, + "learning_rate": 0.0002945802753710172, + "loss": 3.1152775287628174, + "step": 4488, + "token_acc": 0.2862446420729621 + }, + { + "epoch": 2.6314863676341247, + "grad_norm": 0.28283616148595353, + "learning_rate": 0.0002945764020469576, + "loss": 3.146833658218384, + "step": 4489, + "token_acc": 0.28164509082028916 + }, + { + "epoch": 2.632072705951334, + "grad_norm": 0.2667966763143784, + "learning_rate": 0.0002945725273647976, + "loss": 3.077296495437622, + "step": 4490, + "token_acc": 0.2898549954146121 + }, + { + "epoch": 2.632659044268543, + "grad_norm": 0.29665867311703914, + "learning_rate": 0.0002945686513245735, + "loss": 3.1546359062194824, + "step": 4491, + "token_acc": 0.28165909336795414 + }, + { + "epoch": 2.633245382585752, + "grad_norm": 0.33504356040719213, + "learning_rate": 0.00029456477392632177, + "loss": 3.1974258422851562, + "step": 4492, + "token_acc": 0.27598467416993766 + }, + { + "epoch": 2.633831720902961, + "grad_norm": 0.33291768005297473, + "learning_rate": 0.0002945608951700787, + "loss": 3.099846601486206, + "step": 4493, + "token_acc": 0.2892484418789476 + }, + { + "epoch": 2.6344180592201702, + "grad_norm": 0.29610885311088064, + "learning_rate": 0.0002945570150558809, + "loss": 3.1082544326782227, + "step": 4494, + "token_acc": 0.28674563058727837 + }, + { + "epoch": 2.635004397537379, + "grad_norm": 0.2908536717429911, + "learning_rate": 0.00029455313358376473, + "loss": 3.132927656173706, + "step": 4495, + "token_acc": 0.28427612525683515 + }, + { + "epoch": 2.635590735854588, + "grad_norm": 0.28378918250306284, + "learning_rate": 0.00029454925075376656, + "loss": 3.116621971130371, + "step": 4496, + "token_acc": 0.2869566856308949 + }, + { + "epoch": 2.636177074171797, + "grad_norm": 0.3087509984504465, + "learning_rate": 0.0002945453665659231, + "loss": 3.1072750091552734, + "step": 4497, + "token_acc": 0.28671364544521843 + }, + { + "epoch": 2.6367634124890063, + "grad_norm": 0.3262741522741202, + "learning_rate": 0.0002945414810202706, + "loss": 3.1088879108428955, + "step": 4498, + "token_acc": 0.28843046386910604 + }, + { + "epoch": 2.637349750806215, + "grad_norm": 0.29049905294695677, + "learning_rate": 0.00029453759411684566, + "loss": 3.1208126544952393, + "step": 4499, + "token_acc": 0.2851344120270098 + }, + { + "epoch": 2.637936089123424, + "grad_norm": 0.26623810362851663, + "learning_rate": 0.00029453370585568486, + "loss": 3.104092597961426, + "step": 4500, + "token_acc": 0.2886932090190631 + }, + { + "epoch": 2.638522427440633, + "grad_norm": 0.315278410294714, + "learning_rate": 0.00029452981623682463, + "loss": 3.1607017517089844, + "step": 4501, + "token_acc": 0.28145554448245214 + }, + { + "epoch": 2.6391087657578423, + "grad_norm": 0.2944853279733132, + "learning_rate": 0.0002945259252603015, + "loss": 3.0929694175720215, + "step": 4502, + "token_acc": 0.2894278193899704 + }, + { + "epoch": 2.6396951040750514, + "grad_norm": 0.3269992913761978, + "learning_rate": 0.00029452203292615206, + "loss": 3.123652935028076, + "step": 4503, + "token_acc": 0.28667023347057774 + }, + { + "epoch": 2.6402814423922605, + "grad_norm": 0.32976511746561943, + "learning_rate": 0.0002945181392344129, + "loss": 3.0861825942993164, + "step": 4504, + "token_acc": 0.29020750232942827 + }, + { + "epoch": 2.6408677807094696, + "grad_norm": 0.29676788652328, + "learning_rate": 0.00029451424418512053, + "loss": 3.1230692863464355, + "step": 4505, + "token_acc": 0.28456207902301867 + }, + { + "epoch": 2.6414541190266783, + "grad_norm": 0.35589622171526314, + "learning_rate": 0.00029451034777831157, + "loss": 3.123164415359497, + "step": 4506, + "token_acc": 0.2836371358603467 + }, + { + "epoch": 2.6420404573438874, + "grad_norm": 0.3013563670644299, + "learning_rate": 0.00029450645001402267, + "loss": 3.1289210319519043, + "step": 4507, + "token_acc": 0.28232574437887126 + }, + { + "epoch": 2.6426267956610965, + "grad_norm": 0.29141965059391456, + "learning_rate": 0.00029450255089229037, + "loss": 3.106834888458252, + "step": 4508, + "token_acc": 0.28829949758241064 + }, + { + "epoch": 2.6432131339783056, + "grad_norm": 0.3355742036917519, + "learning_rate": 0.0002944986504131513, + "loss": 3.1411213874816895, + "step": 4509, + "token_acc": 0.28313926346818186 + }, + { + "epoch": 2.6437994722955143, + "grad_norm": 0.31581921994962364, + "learning_rate": 0.00029449474857664215, + "loss": 3.120835781097412, + "step": 4510, + "token_acc": 0.28425769020880604 + }, + { + "epoch": 2.6443858106127234, + "grad_norm": 0.3078142545872994, + "learning_rate": 0.0002944908453827995, + "loss": 3.143155097961426, + "step": 4511, + "token_acc": 0.2822982052251781 + }, + { + "epoch": 2.6449721489299325, + "grad_norm": 0.3020452815068232, + "learning_rate": 0.00029448694083166014, + "loss": 3.137824773788452, + "step": 4512, + "token_acc": 0.28304180368773274 + }, + { + "epoch": 2.6455584872471416, + "grad_norm": 0.31357739076310065, + "learning_rate": 0.00029448303492326063, + "loss": 3.1432530879974365, + "step": 4513, + "token_acc": 0.2817049622750285 + }, + { + "epoch": 2.6461448255643507, + "grad_norm": 0.3057451241488454, + "learning_rate": 0.0002944791276576377, + "loss": 3.1200733184814453, + "step": 4514, + "token_acc": 0.2836271147975634 + }, + { + "epoch": 2.64673116388156, + "grad_norm": 0.3048074265509225, + "learning_rate": 0.000294475219034828, + "loss": 3.127669334411621, + "step": 4515, + "token_acc": 0.2838378237939504 + }, + { + "epoch": 2.647317502198769, + "grad_norm": 0.29286276645956855, + "learning_rate": 0.0002944713090548684, + "loss": 3.141080379486084, + "step": 4516, + "token_acc": 0.2828118099636592 + }, + { + "epoch": 2.6479038405159776, + "grad_norm": 0.32012666472721935, + "learning_rate": 0.00029446739771779546, + "loss": 3.1143648624420166, + "step": 4517, + "token_acc": 0.28570622637606485 + }, + { + "epoch": 2.6484901788331867, + "grad_norm": 0.3098964459752691, + "learning_rate": 0.000294463485023646, + "loss": 3.1328814029693604, + "step": 4518, + "token_acc": 0.28421250759882283 + }, + { + "epoch": 2.649076517150396, + "grad_norm": 0.36122983490793154, + "learning_rate": 0.00029445957097245677, + "loss": 3.133930206298828, + "step": 4519, + "token_acc": 0.2832190942150153 + }, + { + "epoch": 2.649662855467605, + "grad_norm": 0.3248845972291869, + "learning_rate": 0.00029445565556426455, + "loss": 3.110642433166504, + "step": 4520, + "token_acc": 0.28643859417388917 + }, + { + "epoch": 2.6502491937848136, + "grad_norm": 0.3001539251248104, + "learning_rate": 0.00029445173879910614, + "loss": 3.1062188148498535, + "step": 4521, + "token_acc": 0.2881914808060001 + }, + { + "epoch": 2.6508355321020227, + "grad_norm": 0.32272285485525526, + "learning_rate": 0.0002944478206770182, + "loss": 3.125244140625, + "step": 4522, + "token_acc": 0.28406570716263135 + }, + { + "epoch": 2.651421870419232, + "grad_norm": 0.29863892966832806, + "learning_rate": 0.0002944439011980377, + "loss": 3.145256519317627, + "step": 4523, + "token_acc": 0.28213299288013854 + }, + { + "epoch": 2.652008208736441, + "grad_norm": 0.34975891055417735, + "learning_rate": 0.0002944399803622014, + "loss": 3.1016664505004883, + "step": 4524, + "token_acc": 0.28941927004023277 + }, + { + "epoch": 2.65259454705365, + "grad_norm": 0.34158336586814353, + "learning_rate": 0.0002944360581695461, + "loss": 3.12544584274292, + "step": 4525, + "token_acc": 0.283081654905672 + }, + { + "epoch": 2.653180885370859, + "grad_norm": 0.32150714508136635, + "learning_rate": 0.0002944321346201086, + "loss": 3.0926437377929688, + "step": 4526, + "token_acc": 0.2888411742084348 + }, + { + "epoch": 2.6537672236880683, + "grad_norm": 0.3014131286141054, + "learning_rate": 0.00029442820971392587, + "loss": 3.14554500579834, + "step": 4527, + "token_acc": 0.2824575775931772 + }, + { + "epoch": 2.654353562005277, + "grad_norm": 0.34735523697570025, + "learning_rate": 0.0002944242834510347, + "loss": 3.1762454509735107, + "step": 4528, + "token_acc": 0.2777713078783965 + }, + { + "epoch": 2.654939900322486, + "grad_norm": 0.29553390604927027, + "learning_rate": 0.0002944203558314721, + "loss": 3.112521171569824, + "step": 4529, + "token_acc": 0.2859794551637222 + }, + { + "epoch": 2.655526238639695, + "grad_norm": 0.2914088940190923, + "learning_rate": 0.00029441642685527474, + "loss": 3.120229721069336, + "step": 4530, + "token_acc": 0.28803966571102585 + }, + { + "epoch": 2.6561125769569043, + "grad_norm": 0.2824668571007022, + "learning_rate": 0.00029441249652247973, + "loss": 3.10982084274292, + "step": 4531, + "token_acc": 0.2857464670280986 + }, + { + "epoch": 2.656698915274113, + "grad_norm": 0.30158480374992164, + "learning_rate": 0.00029440856483312387, + "loss": 3.167297840118408, + "step": 4532, + "token_acc": 0.28153036634759465 + }, + { + "epoch": 2.657285253591322, + "grad_norm": 0.2824233444090316, + "learning_rate": 0.00029440463178724417, + "loss": 3.132747173309326, + "step": 4533, + "token_acc": 0.28391464191510757 + }, + { + "epoch": 2.657871591908531, + "grad_norm": 0.2933281838432, + "learning_rate": 0.00029440069738487755, + "loss": 3.1250252723693848, + "step": 4534, + "token_acc": 0.28506260467341893 + }, + { + "epoch": 2.6584579302257403, + "grad_norm": 0.3115483896501584, + "learning_rate": 0.00029439676162606093, + "loss": 3.159391164779663, + "step": 4535, + "token_acc": 0.28141971758993034 + }, + { + "epoch": 2.6590442685429494, + "grad_norm": 0.258356620037849, + "learning_rate": 0.00029439282451083134, + "loss": 3.081395149230957, + "step": 4536, + "token_acc": 0.29090324230687087 + }, + { + "epoch": 2.6596306068601585, + "grad_norm": 0.299475317586854, + "learning_rate": 0.0002943888860392257, + "loss": 3.1337337493896484, + "step": 4537, + "token_acc": 0.28276083860396023 + }, + { + "epoch": 2.660216945177367, + "grad_norm": 0.2966300883076466, + "learning_rate": 0.00029438494621128106, + "loss": 3.1177215576171875, + "step": 4538, + "token_acc": 0.2865350099286042 + }, + { + "epoch": 2.6608032834945763, + "grad_norm": 0.28548170475634843, + "learning_rate": 0.00029438100502703437, + "loss": 3.1744470596313477, + "step": 4539, + "token_acc": 0.27856233433884203 + }, + { + "epoch": 2.6613896218117854, + "grad_norm": 0.3181981594401044, + "learning_rate": 0.0002943770624865228, + "loss": 3.1670281887054443, + "step": 4540, + "token_acc": 0.27970499421160433 + }, + { + "epoch": 2.6619759601289945, + "grad_norm": 0.2919753612172915, + "learning_rate": 0.0002943731185897832, + "loss": 3.124089241027832, + "step": 4541, + "token_acc": 0.2854420072268309 + }, + { + "epoch": 2.6625622984462036, + "grad_norm": 0.28565784700829333, + "learning_rate": 0.0002943691733368527, + "loss": 3.1125121116638184, + "step": 4542, + "token_acc": 0.28676200953428677 + }, + { + "epoch": 2.6631486367634123, + "grad_norm": 0.2764752263679167, + "learning_rate": 0.0002943652267277684, + "loss": 3.0768280029296875, + "step": 4543, + "token_acc": 0.2912662667856624 + }, + { + "epoch": 2.6637349750806214, + "grad_norm": 0.2745505550541349, + "learning_rate": 0.00029436127876256727, + "loss": 3.1277854442596436, + "step": 4544, + "token_acc": 0.284402669935976 + }, + { + "epoch": 2.6643213133978305, + "grad_norm": 0.2898922636639875, + "learning_rate": 0.0002943573294412865, + "loss": 3.143904209136963, + "step": 4545, + "token_acc": 0.28221834967341997 + }, + { + "epoch": 2.6649076517150396, + "grad_norm": 0.3042838509865545, + "learning_rate": 0.0002943533787639631, + "loss": 3.1115176677703857, + "step": 4546, + "token_acc": 0.28923420078307366 + }, + { + "epoch": 2.6654939900322487, + "grad_norm": 0.29094634315956897, + "learning_rate": 0.00029434942673063424, + "loss": 3.14797306060791, + "step": 4547, + "token_acc": 0.2822168026944795 + }, + { + "epoch": 2.666080328349458, + "grad_norm": 0.31502645124894507, + "learning_rate": 0.00029434547334133705, + "loss": 3.147404670715332, + "step": 4548, + "token_acc": 0.2816955424377957 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.3160527127138627, + "learning_rate": 0.00029434151859610865, + "loss": 3.081361770629883, + "step": 4549, + "token_acc": 0.2909261903381999 + }, + { + "epoch": 2.6672530049838756, + "grad_norm": 0.2572537006245006, + "learning_rate": 0.0002943375624949862, + "loss": 3.132317066192627, + "step": 4550, + "token_acc": 0.2829082721944024 + }, + { + "epoch": 2.6678393433010847, + "grad_norm": 0.2918468640805991, + "learning_rate": 0.00029433360503800676, + "loss": 3.1263396739959717, + "step": 4551, + "token_acc": 0.2842974030968952 + }, + { + "epoch": 2.668425681618294, + "grad_norm": 0.29744403577202044, + "learning_rate": 0.0002943296462252077, + "loss": 3.1271743774414062, + "step": 4552, + "token_acc": 0.2835396825822292 + }, + { + "epoch": 2.6690120199355025, + "grad_norm": 0.38012510218807855, + "learning_rate": 0.000294325686056626, + "loss": 3.115926742553711, + "step": 4553, + "token_acc": 0.2847607948379371 + }, + { + "epoch": 2.6695983582527116, + "grad_norm": 0.35113155142498303, + "learning_rate": 0.000294321724532299, + "loss": 3.1405162811279297, + "step": 4554, + "token_acc": 0.28350620608063315 + }, + { + "epoch": 2.6701846965699207, + "grad_norm": 0.3201227436916076, + "learning_rate": 0.00029431776165226393, + "loss": 3.1165192127227783, + "step": 4555, + "token_acc": 0.2846724354169979 + }, + { + "epoch": 2.67077103488713, + "grad_norm": 0.31382199303138547, + "learning_rate": 0.0002943137974165579, + "loss": 3.1102471351623535, + "step": 4556, + "token_acc": 0.28637897879321705 + }, + { + "epoch": 2.671357373204339, + "grad_norm": 0.33368374697118924, + "learning_rate": 0.0002943098318252182, + "loss": 3.159630298614502, + "step": 4557, + "token_acc": 0.27996664474059035 + }, + { + "epoch": 2.671943711521548, + "grad_norm": 0.3111470864297513, + "learning_rate": 0.0002943058648782822, + "loss": 3.1030399799346924, + "step": 4558, + "token_acc": 0.2888672534005064 + }, + { + "epoch": 2.672530049838757, + "grad_norm": 0.3348002387553156, + "learning_rate": 0.0002943018965757869, + "loss": 3.1501152515411377, + "step": 4559, + "token_acc": 0.28112098731050206 + }, + { + "epoch": 2.673116388155966, + "grad_norm": 0.3128416889766457, + "learning_rate": 0.0002942979269177698, + "loss": 3.107067584991455, + "step": 4560, + "token_acc": 0.285710372443745 + }, + { + "epoch": 2.673702726473175, + "grad_norm": 0.29321016410978984, + "learning_rate": 0.0002942939559042681, + "loss": 3.141199827194214, + "step": 4561, + "token_acc": 0.2836139344688846 + }, + { + "epoch": 2.674289064790384, + "grad_norm": 0.3416250639555042, + "learning_rate": 0.0002942899835353192, + "loss": 3.159151077270508, + "step": 4562, + "token_acc": 0.28059380355684166 + }, + { + "epoch": 2.674875403107593, + "grad_norm": 0.30018955031664274, + "learning_rate": 0.00029428600981096025, + "loss": 3.1631321907043457, + "step": 4563, + "token_acc": 0.279160543600081 + }, + { + "epoch": 2.675461741424802, + "grad_norm": 0.31514239892808615, + "learning_rate": 0.00029428203473122873, + "loss": 3.1528992652893066, + "step": 4564, + "token_acc": 0.2829176559203813 + }, + { + "epoch": 2.676048079742011, + "grad_norm": 0.33376870192314295, + "learning_rate": 0.0002942780582961619, + "loss": 3.160259485244751, + "step": 4565, + "token_acc": 0.2799497980319985 + }, + { + "epoch": 2.67663441805922, + "grad_norm": 0.3470464281182145, + "learning_rate": 0.00029427408050579704, + "loss": 3.1318957805633545, + "step": 4566, + "token_acc": 0.28352849762122223 + }, + { + "epoch": 2.677220756376429, + "grad_norm": 0.3004837373864079, + "learning_rate": 0.0002942701013601717, + "loss": 3.1211414337158203, + "step": 4567, + "token_acc": 0.2859289719626168 + }, + { + "epoch": 2.6778070946936383, + "grad_norm": 0.3040355758432666, + "learning_rate": 0.00029426612085932315, + "loss": 3.180159568786621, + "step": 4568, + "token_acc": 0.27545773601239054 + }, + { + "epoch": 2.6783934330108474, + "grad_norm": 0.33718999541708533, + "learning_rate": 0.00029426213900328875, + "loss": 3.1247029304504395, + "step": 4569, + "token_acc": 0.286448449922859 + }, + { + "epoch": 2.6789797713280565, + "grad_norm": 0.3200281064176358, + "learning_rate": 0.00029425815579210604, + "loss": 3.114863395690918, + "step": 4570, + "token_acc": 0.2854968983481148 + }, + { + "epoch": 2.679566109645265, + "grad_norm": 0.35095037381926103, + "learning_rate": 0.00029425417122581226, + "loss": 3.1148738861083984, + "step": 4571, + "token_acc": 0.28560933313949866 + }, + { + "epoch": 2.6801524479624743, + "grad_norm": 0.3265773209290008, + "learning_rate": 0.000294250185304445, + "loss": 3.1360034942626953, + "step": 4572, + "token_acc": 0.283788254109655 + }, + { + "epoch": 2.6807387862796834, + "grad_norm": 0.31954779452074156, + "learning_rate": 0.00029424619802804157, + "loss": 3.0899851322174072, + "step": 4573, + "token_acc": 0.29093935893890344 + }, + { + "epoch": 2.6813251245968925, + "grad_norm": 0.2872310802996861, + "learning_rate": 0.00029424220939663947, + "loss": 3.0752291679382324, + "step": 4574, + "token_acc": 0.2909343805814149 + }, + { + "epoch": 2.681911462914101, + "grad_norm": 0.31919044819298287, + "learning_rate": 0.0002942382194102762, + "loss": 3.075417995452881, + "step": 4575, + "token_acc": 0.29182453862364033 + }, + { + "epoch": 2.6824978012313103, + "grad_norm": 0.29941200689216924, + "learning_rate": 0.00029423422806898925, + "loss": 3.1275949478149414, + "step": 4576, + "token_acc": 0.28610502997810794 + }, + { + "epoch": 2.6830841395485194, + "grad_norm": 0.29151606232241184, + "learning_rate": 0.0002942302353728161, + "loss": 3.1258411407470703, + "step": 4577, + "token_acc": 0.28495871803412914 + }, + { + "epoch": 2.6836704778657285, + "grad_norm": 0.24880192570872103, + "learning_rate": 0.0002942262413217942, + "loss": 3.0912928581237793, + "step": 4578, + "token_acc": 0.28886121095741685 + }, + { + "epoch": 2.6842568161829377, + "grad_norm": 0.2718167543627989, + "learning_rate": 0.0002942222459159611, + "loss": 3.135986804962158, + "step": 4579, + "token_acc": 0.28230148575606445 + }, + { + "epoch": 2.6848431545001468, + "grad_norm": 0.28227648718612036, + "learning_rate": 0.0002942182491553544, + "loss": 3.1465559005737305, + "step": 4580, + "token_acc": 0.2813216095192045 + }, + { + "epoch": 2.685429492817356, + "grad_norm": 0.284518310211154, + "learning_rate": 0.00029421425104001153, + "loss": 3.147820472717285, + "step": 4581, + "token_acc": 0.2810195191443211 + }, + { + "epoch": 2.6860158311345645, + "grad_norm": 0.3222536309073751, + "learning_rate": 0.00029421025156997014, + "loss": 3.194807767868042, + "step": 4582, + "token_acc": 0.2752409518921526 + }, + { + "epoch": 2.6866021694517737, + "grad_norm": 0.26185464885492393, + "learning_rate": 0.00029420625074526774, + "loss": 3.122466802597046, + "step": 4583, + "token_acc": 0.28468331592441365 + }, + { + "epoch": 2.6871885077689828, + "grad_norm": 0.33723361443121597, + "learning_rate": 0.00029420224856594194, + "loss": 3.1059765815734863, + "step": 4584, + "token_acc": 0.288374512074808 + }, + { + "epoch": 2.687774846086192, + "grad_norm": 0.3464168293876269, + "learning_rate": 0.00029419824503203033, + "loss": 3.1103920936584473, + "step": 4585, + "token_acc": 0.28679637249810236 + }, + { + "epoch": 2.6883611844034006, + "grad_norm": 0.2958503924369842, + "learning_rate": 0.0002941942401435705, + "loss": 3.151400566101074, + "step": 4586, + "token_acc": 0.28271026791190856 + }, + { + "epoch": 2.6889475227206097, + "grad_norm": 0.2728700541346181, + "learning_rate": 0.0002941902339006001, + "loss": 3.1337239742279053, + "step": 4587, + "token_acc": 0.2827778402182465 + }, + { + "epoch": 2.6895338610378188, + "grad_norm": 0.263208801100596, + "learning_rate": 0.00029418622630315676, + "loss": 3.1237175464630127, + "step": 4588, + "token_acc": 0.28492662321487877 + }, + { + "epoch": 2.690120199355028, + "grad_norm": 0.26093437139826986, + "learning_rate": 0.0002941822173512781, + "loss": 3.086075782775879, + "step": 4589, + "token_acc": 0.2904150103925594 + }, + { + "epoch": 2.690706537672237, + "grad_norm": 0.2898445953710013, + "learning_rate": 0.00029417820704500183, + "loss": 3.1390974521636963, + "step": 4590, + "token_acc": 0.28185563473144354 + }, + { + "epoch": 2.691292875989446, + "grad_norm": 0.2902540866849508, + "learning_rate": 0.00029417419538436555, + "loss": 3.1016061305999756, + "step": 4591, + "token_acc": 0.28946653101482595 + }, + { + "epoch": 2.6918792143066548, + "grad_norm": 0.31348994761380244, + "learning_rate": 0.000294170182369407, + "loss": 3.1089634895324707, + "step": 4592, + "token_acc": 0.2880087621206783 + }, + { + "epoch": 2.692465552623864, + "grad_norm": 0.27741190874366206, + "learning_rate": 0.00029416616800016386, + "loss": 3.13444447517395, + "step": 4593, + "token_acc": 0.2828385013788928 + }, + { + "epoch": 2.693051890941073, + "grad_norm": 0.33208293481334733, + "learning_rate": 0.00029416215227667385, + "loss": 3.058577537536621, + "step": 4594, + "token_acc": 0.2934651338475849 + }, + { + "epoch": 2.693638229258282, + "grad_norm": 0.30616947866275346, + "learning_rate": 0.00029415813519897467, + "loss": 3.0959835052490234, + "step": 4595, + "token_acc": 0.28847534065978814 + }, + { + "epoch": 2.6942245675754912, + "grad_norm": 0.286437077281594, + "learning_rate": 0.00029415411676710405, + "loss": 3.167523145675659, + "step": 4596, + "token_acc": 0.2796301770624153 + }, + { + "epoch": 2.6948109058927, + "grad_norm": 0.29924377581762973, + "learning_rate": 0.00029415009698109977, + "loss": 3.1396360397338867, + "step": 4597, + "token_acc": 0.2816969191802673 + }, + { + "epoch": 2.695397244209909, + "grad_norm": 0.2701862296498904, + "learning_rate": 0.00029414607584099956, + "loss": 3.129110813140869, + "step": 4598, + "token_acc": 0.2849161109916111 + }, + { + "epoch": 2.695983582527118, + "grad_norm": 0.2843923889168176, + "learning_rate": 0.0002941420533468412, + "loss": 3.1341147422790527, + "step": 4599, + "token_acc": 0.2832001532191992 + }, + { + "epoch": 2.6965699208443272, + "grad_norm": 0.30006524930281325, + "learning_rate": 0.0002941380294986625, + "loss": 3.079439401626587, + "step": 4600, + "token_acc": 0.2907850875107639 + }, + { + "epoch": 2.6971562591615363, + "grad_norm": 0.3025171730965367, + "learning_rate": 0.0002941340042965013, + "loss": 3.124406099319458, + "step": 4601, + "token_acc": 0.2837750176003218 + }, + { + "epoch": 2.6977425974787455, + "grad_norm": 0.33872771422576137, + "learning_rate": 0.0002941299777403953, + "loss": 3.1456832885742188, + "step": 4602, + "token_acc": 0.2810418654975834 + }, + { + "epoch": 2.698328935795954, + "grad_norm": 0.3042748401569349, + "learning_rate": 0.0002941259498303823, + "loss": 3.1666271686553955, + "step": 4603, + "token_acc": 0.27840925284422874 + }, + { + "epoch": 2.6989152741131632, + "grad_norm": 0.340930164597336, + "learning_rate": 0.0002941219205665003, + "loss": 3.0995116233825684, + "step": 4604, + "token_acc": 0.2895581002520422 + }, + { + "epoch": 2.6995016124303723, + "grad_norm": 0.34628017761157465, + "learning_rate": 0.000294117889948787, + "loss": 3.054107189178467, + "step": 4605, + "token_acc": 0.2951046765961879 + }, + { + "epoch": 2.7000879507475815, + "grad_norm": 0.2877401208491686, + "learning_rate": 0.0002941138579772804, + "loss": 3.091055393218994, + "step": 4606, + "token_acc": 0.2883067474514638 + }, + { + "epoch": 2.70067428906479, + "grad_norm": 0.3297390062514481, + "learning_rate": 0.0002941098246520183, + "loss": 3.0988073348999023, + "step": 4607, + "token_acc": 0.28947751226543594 + }, + { + "epoch": 2.7012606273819992, + "grad_norm": 0.33553873920979604, + "learning_rate": 0.0002941057899730385, + "loss": 3.1095736026763916, + "step": 4608, + "token_acc": 0.288848005268412 + }, + { + "epoch": 2.7018469656992083, + "grad_norm": 0.32228078392927617, + "learning_rate": 0.00029410175394037905, + "loss": 3.129594564437866, + "step": 4609, + "token_acc": 0.2851215445842745 + }, + { + "epoch": 2.7024333040164175, + "grad_norm": 0.3448477440355411, + "learning_rate": 0.0002940977165540778, + "loss": 3.077892780303955, + "step": 4610, + "token_acc": 0.29062265767212764 + }, + { + "epoch": 2.7030196423336266, + "grad_norm": 0.27954202740077044, + "learning_rate": 0.00029409367781417273, + "loss": 3.1068811416625977, + "step": 4611, + "token_acc": 0.28706848740891816 + }, + { + "epoch": 2.7036059806508357, + "grad_norm": 0.32141087691506653, + "learning_rate": 0.0002940896377207016, + "loss": 3.1368956565856934, + "step": 4612, + "token_acc": 0.2827881879569259 + }, + { + "epoch": 2.704192318968045, + "grad_norm": 0.2911450110690061, + "learning_rate": 0.0002940855962737026, + "loss": 3.0660271644592285, + "step": 4613, + "token_acc": 0.29300428735752376 + }, + { + "epoch": 2.7047786572852535, + "grad_norm": 0.31185629640258006, + "learning_rate": 0.0002940815534732135, + "loss": 3.116438388824463, + "step": 4614, + "token_acc": 0.2852406170849643 + }, + { + "epoch": 2.7053649956024626, + "grad_norm": 0.29122025550652547, + "learning_rate": 0.0002940775093192724, + "loss": 3.12277889251709, + "step": 4615, + "token_acc": 0.28492211576762616 + }, + { + "epoch": 2.7059513339196717, + "grad_norm": 0.2881624350459654, + "learning_rate": 0.00029407346381191726, + "loss": 3.1263279914855957, + "step": 4616, + "token_acc": 0.285623343282336 + }, + { + "epoch": 2.706537672236881, + "grad_norm": 0.3008507715935431, + "learning_rate": 0.000294069416951186, + "loss": 3.167377471923828, + "step": 4617, + "token_acc": 0.2780916980946489 + }, + { + "epoch": 2.7071240105540895, + "grad_norm": 0.32366992038099324, + "learning_rate": 0.0002940653687371168, + "loss": 3.1608505249023438, + "step": 4618, + "token_acc": 0.28029813297103034 + }, + { + "epoch": 2.7077103488712986, + "grad_norm": 0.32944104256304296, + "learning_rate": 0.0002940613191697475, + "loss": 3.153795003890991, + "step": 4619, + "token_acc": 0.2800048865565112 + }, + { + "epoch": 2.7082966871885077, + "grad_norm": 0.3132669147662173, + "learning_rate": 0.00029405726824911635, + "loss": 3.1011006832122803, + "step": 4620, + "token_acc": 0.2858072110839274 + }, + { + "epoch": 2.708883025505717, + "grad_norm": 0.3158063532826846, + "learning_rate": 0.0002940532159752612, + "loss": 3.1045751571655273, + "step": 4621, + "token_acc": 0.28744945595996574 + }, + { + "epoch": 2.709469363822926, + "grad_norm": 0.31561703360670684, + "learning_rate": 0.00029404916234822016, + "loss": 3.124743938446045, + "step": 4622, + "token_acc": 0.2840490877896088 + }, + { + "epoch": 2.710055702140135, + "grad_norm": 0.28716632716623874, + "learning_rate": 0.0002940451073680314, + "loss": 3.1051926612854004, + "step": 4623, + "token_acc": 0.2871723146805589 + }, + { + "epoch": 2.710642040457344, + "grad_norm": 0.27228519216772534, + "learning_rate": 0.00029404105103473296, + "loss": 3.1210622787475586, + "step": 4624, + "token_acc": 0.2863287616775989 + }, + { + "epoch": 2.711228378774553, + "grad_norm": 0.3155644597533601, + "learning_rate": 0.00029403699334836294, + "loss": 3.1130270957946777, + "step": 4625, + "token_acc": 0.28519649488531906 + }, + { + "epoch": 2.711814717091762, + "grad_norm": 0.3575816525196397, + "learning_rate": 0.00029403293430895947, + "loss": 3.1579549312591553, + "step": 4626, + "token_acc": 0.279542061741164 + }, + { + "epoch": 2.712401055408971, + "grad_norm": 0.3347961806023059, + "learning_rate": 0.00029402887391656064, + "loss": 3.160306215286255, + "step": 4627, + "token_acc": 0.279350511260523 + }, + { + "epoch": 2.71298739372618, + "grad_norm": 0.338563928218276, + "learning_rate": 0.0002940248121712047, + "loss": 3.1232380867004395, + "step": 4628, + "token_acc": 0.2859852748136589 + }, + { + "epoch": 2.713573732043389, + "grad_norm": 0.3000185767301935, + "learning_rate": 0.00029402074907292964, + "loss": 3.108776330947876, + "step": 4629, + "token_acc": 0.28754937054211105 + }, + { + "epoch": 2.714160070360598, + "grad_norm": 0.4008103726388896, + "learning_rate": 0.00029401668462177374, + "loss": 3.0986952781677246, + "step": 4630, + "token_acc": 0.28661991599434417 + }, + { + "epoch": 2.714746408677807, + "grad_norm": 0.3666210972814654, + "learning_rate": 0.00029401261881777514, + "loss": 3.1116297245025635, + "step": 4631, + "token_acc": 0.2866642986291199 + }, + { + "epoch": 2.715332746995016, + "grad_norm": 0.3323319142942056, + "learning_rate": 0.00029400855166097207, + "loss": 3.1436498165130615, + "step": 4632, + "token_acc": 0.28320200428334746 + }, + { + "epoch": 2.7159190853122253, + "grad_norm": 0.27891262522231886, + "learning_rate": 0.0002940044831514027, + "loss": 3.13948130607605, + "step": 4633, + "token_acc": 0.2839069748597127 + }, + { + "epoch": 2.7165054236294344, + "grad_norm": 0.3283168390709002, + "learning_rate": 0.00029400041328910524, + "loss": 3.1330699920654297, + "step": 4634, + "token_acc": 0.2830496060139244 + }, + { + "epoch": 2.7170917619466435, + "grad_norm": 0.32418274608104314, + "learning_rate": 0.000293996342074118, + "loss": 3.177962303161621, + "step": 4635, + "token_acc": 0.27783361338379475 + }, + { + "epoch": 2.717678100263852, + "grad_norm": 0.32545263476977715, + "learning_rate": 0.0002939922695064791, + "loss": 3.0984511375427246, + "step": 4636, + "token_acc": 0.2885143395913631 + }, + { + "epoch": 2.7182644385810613, + "grad_norm": 0.35986563853196196, + "learning_rate": 0.00029398819558622687, + "loss": 3.148623466491699, + "step": 4637, + "token_acc": 0.28201401478084526 + }, + { + "epoch": 2.7188507768982704, + "grad_norm": 0.3089624440242346, + "learning_rate": 0.00029398412031339955, + "loss": 3.0953633785247803, + "step": 4638, + "token_acc": 0.28773547000681754 + }, + { + "epoch": 2.7194371152154795, + "grad_norm": 0.32095235996202737, + "learning_rate": 0.0002939800436880355, + "loss": 3.1473608016967773, + "step": 4639, + "token_acc": 0.2838720527036514 + }, + { + "epoch": 2.720023453532688, + "grad_norm": 0.3260751610274036, + "learning_rate": 0.00029397596571017294, + "loss": 3.1489624977111816, + "step": 4640, + "token_acc": 0.27905301691696366 + }, + { + "epoch": 2.7206097918498973, + "grad_norm": 0.30844978599401995, + "learning_rate": 0.0002939718863798502, + "loss": 3.1121456623077393, + "step": 4641, + "token_acc": 0.28609820464225305 + }, + { + "epoch": 2.7211961301671064, + "grad_norm": 0.29474267611248495, + "learning_rate": 0.00029396780569710556, + "loss": 3.1310932636260986, + "step": 4642, + "token_acc": 0.2837264882320861 + }, + { + "epoch": 2.7217824684843155, + "grad_norm": 0.317081254279625, + "learning_rate": 0.0002939637236619774, + "loss": 3.131783962249756, + "step": 4643, + "token_acc": 0.2827566545783293 + }, + { + "epoch": 2.7223688068015246, + "grad_norm": 0.3121630638752819, + "learning_rate": 0.00029395964027450404, + "loss": 3.1202597618103027, + "step": 4644, + "token_acc": 0.28604182787882576 + }, + { + "epoch": 2.7229551451187337, + "grad_norm": 0.31638898176387137, + "learning_rate": 0.00029395555553472384, + "loss": 3.131974220275879, + "step": 4645, + "token_acc": 0.28377970016850795 + }, + { + "epoch": 2.7235414834359424, + "grad_norm": 0.28476223234764236, + "learning_rate": 0.0002939514694426752, + "loss": 3.129000663757324, + "step": 4646, + "token_acc": 0.2834111813269485 + }, + { + "epoch": 2.7241278217531515, + "grad_norm": 0.3022637605139887, + "learning_rate": 0.0002939473819983965, + "loss": 3.1290998458862305, + "step": 4647, + "token_acc": 0.2857560049645908 + }, + { + "epoch": 2.7247141600703606, + "grad_norm": 0.2750131357613476, + "learning_rate": 0.0002939432932019261, + "loss": 3.110006332397461, + "step": 4648, + "token_acc": 0.287471804316785 + }, + { + "epoch": 2.7253004983875697, + "grad_norm": 0.3036775922568293, + "learning_rate": 0.00029393920305330237, + "loss": 3.07769513130188, + "step": 4649, + "token_acc": 0.29264439583181595 + }, + { + "epoch": 2.7258868367047784, + "grad_norm": 0.3329766059425245, + "learning_rate": 0.00029393511155256384, + "loss": 3.0733118057250977, + "step": 4650, + "token_acc": 0.29306160784324015 + }, + { + "epoch": 2.7264731750219875, + "grad_norm": 0.3084486390969691, + "learning_rate": 0.0002939310186997489, + "loss": 3.072504997253418, + "step": 4651, + "token_acc": 0.2913754838984699 + }, + { + "epoch": 2.7270595133391966, + "grad_norm": 0.3091198226888238, + "learning_rate": 0.00029392692449489597, + "loss": 3.111243486404419, + "step": 4652, + "token_acc": 0.2863049000768136 + }, + { + "epoch": 2.7276458516564057, + "grad_norm": 0.3164573224658852, + "learning_rate": 0.00029392282893804354, + "loss": 3.0988736152648926, + "step": 4653, + "token_acc": 0.287824435534923 + }, + { + "epoch": 2.728232189973615, + "grad_norm": 0.26437116060976923, + "learning_rate": 0.00029391873202923004, + "loss": 3.1457619667053223, + "step": 4654, + "token_acc": 0.2821215959637617 + }, + { + "epoch": 2.728818528290824, + "grad_norm": 0.27401766139301315, + "learning_rate": 0.000293914633768494, + "loss": 3.1506428718566895, + "step": 4655, + "token_acc": 0.28271964909066855 + }, + { + "epoch": 2.729404866608033, + "grad_norm": 0.2596916474162231, + "learning_rate": 0.0002939105341558739, + "loss": 3.0618858337402344, + "step": 4656, + "token_acc": 0.2929765367254163 + }, + { + "epoch": 2.7299912049252417, + "grad_norm": 0.29477469416101254, + "learning_rate": 0.0002939064331914083, + "loss": 3.1261134147644043, + "step": 4657, + "token_acc": 0.2840787849626577 + }, + { + "epoch": 2.730577543242451, + "grad_norm": 0.28319840184156814, + "learning_rate": 0.00029390233087513563, + "loss": 3.1006863117218018, + "step": 4658, + "token_acc": 0.288687423398402 + }, + { + "epoch": 2.73116388155966, + "grad_norm": 0.2663723395472693, + "learning_rate": 0.0002938982272070945, + "loss": 3.0868067741394043, + "step": 4659, + "token_acc": 0.290051392512273 + }, + { + "epoch": 2.731750219876869, + "grad_norm": 0.2938101319897106, + "learning_rate": 0.0002938941221873234, + "loss": 3.1115784645080566, + "step": 4660, + "token_acc": 0.2865728321521849 + }, + { + "epoch": 2.7323365581940777, + "grad_norm": 0.3141015403145631, + "learning_rate": 0.00029389001581586093, + "loss": 3.110257625579834, + "step": 4661, + "token_acc": 0.28689001981235485 + }, + { + "epoch": 2.732922896511287, + "grad_norm": 0.2687667440427265, + "learning_rate": 0.00029388590809274566, + "loss": 3.0621681213378906, + "step": 4662, + "token_acc": 0.29392864527827706 + }, + { + "epoch": 2.733509234828496, + "grad_norm": 0.31748704360173763, + "learning_rate": 0.0002938817990180162, + "loss": 3.1268672943115234, + "step": 4663, + "token_acc": 0.28368625936671604 + }, + { + "epoch": 2.734095573145705, + "grad_norm": 0.31611398418617415, + "learning_rate": 0.00029387768859171105, + "loss": 3.0960335731506348, + "step": 4664, + "token_acc": 0.2881754589608089 + }, + { + "epoch": 2.734681911462914, + "grad_norm": 0.29942489696869135, + "learning_rate": 0.000293873576813869, + "loss": 3.1195664405822754, + "step": 4665, + "token_acc": 0.2861620548774987 + }, + { + "epoch": 2.7352682497801233, + "grad_norm": 0.28150922018175023, + "learning_rate": 0.0002938694636845285, + "loss": 3.125598907470703, + "step": 4666, + "token_acc": 0.2856954678736179 + }, + { + "epoch": 2.7358545880973324, + "grad_norm": 0.28428432362677086, + "learning_rate": 0.00029386534920372825, + "loss": 3.1247076988220215, + "step": 4667, + "token_acc": 0.28557349290849854 + }, + { + "epoch": 2.736440926414541, + "grad_norm": 0.2594797636814349, + "learning_rate": 0.00029386123337150693, + "loss": 3.057495594024658, + "step": 4668, + "token_acc": 0.2927059683185887 + }, + { + "epoch": 2.73702726473175, + "grad_norm": 0.3065962824255641, + "learning_rate": 0.00029385711618790317, + "loss": 3.156785011291504, + "step": 4669, + "token_acc": 0.27995013537670266 + }, + { + "epoch": 2.7376136030489593, + "grad_norm": 0.27372840091448825, + "learning_rate": 0.00029385299765295563, + "loss": 3.1196084022521973, + "step": 4670, + "token_acc": 0.28438188177382445 + }, + { + "epoch": 2.7381999413661684, + "grad_norm": 0.3288698636674791, + "learning_rate": 0.00029384887776670305, + "loss": 3.0815818309783936, + "step": 4671, + "token_acc": 0.2903944110318866 + }, + { + "epoch": 2.738786279683377, + "grad_norm": 0.3264973956483734, + "learning_rate": 0.0002938447565291841, + "loss": 3.1288208961486816, + "step": 4672, + "token_acc": 0.28316162101881504 + }, + { + "epoch": 2.739372618000586, + "grad_norm": 0.3503381530088445, + "learning_rate": 0.0002938406339404375, + "loss": 3.142727851867676, + "step": 4673, + "token_acc": 0.2821260637682379 + }, + { + "epoch": 2.7399589563177953, + "grad_norm": 0.34017410493154704, + "learning_rate": 0.0002938365100005019, + "loss": 3.1012144088745117, + "step": 4674, + "token_acc": 0.29087555720454045 + }, + { + "epoch": 2.7405452946350044, + "grad_norm": 0.29149865338024367, + "learning_rate": 0.0002938323847094162, + "loss": 3.14150333404541, + "step": 4675, + "token_acc": 0.28126029664892677 + }, + { + "epoch": 2.7411316329522135, + "grad_norm": 0.35213799633887566, + "learning_rate": 0.000293828258067219, + "loss": 3.1303629875183105, + "step": 4676, + "token_acc": 0.2847607238642509 + }, + { + "epoch": 2.7417179712694226, + "grad_norm": 0.34775927761089853, + "learning_rate": 0.0002938241300739492, + "loss": 3.109248638153076, + "step": 4677, + "token_acc": 0.28661050043390224 + }, + { + "epoch": 2.7423043095866317, + "grad_norm": 0.39523054656265644, + "learning_rate": 0.0002938200007296455, + "loss": 3.1547470092773438, + "step": 4678, + "token_acc": 0.28048252251876127 + }, + { + "epoch": 2.7428906479038404, + "grad_norm": 0.34660825206441775, + "learning_rate": 0.0002938158700343466, + "loss": 3.140289306640625, + "step": 4679, + "token_acc": 0.282660073261933 + }, + { + "epoch": 2.7434769862210495, + "grad_norm": 0.3330637214132319, + "learning_rate": 0.0002938117379880915, + "loss": 3.0921826362609863, + "step": 4680, + "token_acc": 0.2889187917716845 + }, + { + "epoch": 2.7440633245382586, + "grad_norm": 0.2655881374935521, + "learning_rate": 0.0002938076045909188, + "loss": 3.160306930541992, + "step": 4681, + "token_acc": 0.28060509611454043 + }, + { + "epoch": 2.7446496628554677, + "grad_norm": 0.31934861018589944, + "learning_rate": 0.00029380346984286755, + "loss": 3.131010055541992, + "step": 4682, + "token_acc": 0.2826089779719435 + }, + { + "epoch": 2.7452360011726764, + "grad_norm": 0.28366571160299364, + "learning_rate": 0.00029379933374397644, + "loss": 3.071028709411621, + "step": 4683, + "token_acc": 0.2892310308314204 + }, + { + "epoch": 2.7458223394898855, + "grad_norm": 0.3547433045237246, + "learning_rate": 0.00029379519629428434, + "loss": 3.091519355773926, + "step": 4684, + "token_acc": 0.29102558353622027 + }, + { + "epoch": 2.7464086778070946, + "grad_norm": 0.32588485524047744, + "learning_rate": 0.0002937910574938302, + "loss": 3.130218505859375, + "step": 4685, + "token_acc": 0.283876237324726 + }, + { + "epoch": 2.7469950161243037, + "grad_norm": 0.32484636975519876, + "learning_rate": 0.0002937869173426527, + "loss": 3.1131410598754883, + "step": 4686, + "token_acc": 0.2862504735617254 + }, + { + "epoch": 2.747581354441513, + "grad_norm": 0.27828495224118727, + "learning_rate": 0.00029378277584079095, + "loss": 3.0764341354370117, + "step": 4687, + "token_acc": 0.29348980090230203 + }, + { + "epoch": 2.748167692758722, + "grad_norm": 0.3088473060138778, + "learning_rate": 0.00029377863298828377, + "loss": 3.1043457984924316, + "step": 4688, + "token_acc": 0.28819144954188164 + }, + { + "epoch": 2.748754031075931, + "grad_norm": 0.28115386770389267, + "learning_rate": 0.0002937744887851701, + "loss": 3.134852170944214, + "step": 4689, + "token_acc": 0.28398032690201364 + }, + { + "epoch": 2.7493403693931397, + "grad_norm": 0.2819556622422523, + "learning_rate": 0.0002937703432314888, + "loss": 3.1036086082458496, + "step": 4690, + "token_acc": 0.28956922088952414 + }, + { + "epoch": 2.749926707710349, + "grad_norm": 0.27960691946809385, + "learning_rate": 0.0002937661963272789, + "loss": 3.097256660461426, + "step": 4691, + "token_acc": 0.28850732543051866 + }, + { + "epoch": 2.750513046027558, + "grad_norm": 0.26774711744024093, + "learning_rate": 0.0002937620480725793, + "loss": 3.1116981506347656, + "step": 4692, + "token_acc": 0.28684383637153116 + }, + { + "epoch": 2.751099384344767, + "grad_norm": 0.28687798158990896, + "learning_rate": 0.00029375789846742894, + "loss": 3.147639751434326, + "step": 4693, + "token_acc": 0.2801852447931092 + }, + { + "epoch": 2.7516857226619758, + "grad_norm": 0.2702603211912033, + "learning_rate": 0.0002937537475118669, + "loss": 3.162505626678467, + "step": 4694, + "token_acc": 0.27960436032210456 + }, + { + "epoch": 2.752272060979185, + "grad_norm": 0.2838100637330248, + "learning_rate": 0.0002937495952059321, + "loss": 3.112856149673462, + "step": 4695, + "token_acc": 0.28609278079254713 + }, + { + "epoch": 2.752858399296394, + "grad_norm": 0.2812387065732931, + "learning_rate": 0.0002937454415496635, + "loss": 3.0824429988861084, + "step": 4696, + "token_acc": 0.28975167610410435 + }, + { + "epoch": 2.753444737613603, + "grad_norm": 0.3048882371607763, + "learning_rate": 0.00029374128654310026, + "loss": 3.1206917762756348, + "step": 4697, + "token_acc": 0.28408926966229786 + }, + { + "epoch": 2.754031075930812, + "grad_norm": 0.2839292561526516, + "learning_rate": 0.0002937371301862813, + "loss": 3.1614980697631836, + "step": 4698, + "token_acc": 0.27967022483225035 + }, + { + "epoch": 2.7546174142480213, + "grad_norm": 0.3299972666673433, + "learning_rate": 0.0002937329724792457, + "loss": 3.1126513481140137, + "step": 4699, + "token_acc": 0.28618007769221676 + }, + { + "epoch": 2.75520375256523, + "grad_norm": 0.32858639445157345, + "learning_rate": 0.00029372881342203247, + "loss": 3.123180389404297, + "step": 4700, + "token_acc": 0.2840429417972238 + }, + { + "epoch": 2.755790090882439, + "grad_norm": 0.31993113931543704, + "learning_rate": 0.0002937246530146807, + "loss": 3.1160054206848145, + "step": 4701, + "token_acc": 0.286098999743524 + }, + { + "epoch": 2.756376429199648, + "grad_norm": 0.35570265038608545, + "learning_rate": 0.0002937204912572296, + "loss": 3.1265673637390137, + "step": 4702, + "token_acc": 0.28413944391584733 + }, + { + "epoch": 2.7569627675168573, + "grad_norm": 0.3135345615694619, + "learning_rate": 0.00029371632814971803, + "loss": 3.1235218048095703, + "step": 4703, + "token_acc": 0.2851160978987687 + }, + { + "epoch": 2.757549105834066, + "grad_norm": 0.26694159703073106, + "learning_rate": 0.0002937121636921852, + "loss": 3.1407878398895264, + "step": 4704, + "token_acc": 0.28156576127812005 + }, + { + "epoch": 2.758135444151275, + "grad_norm": 0.28208073015796237, + "learning_rate": 0.00029370799788467033, + "loss": 3.1072981357574463, + "step": 4705, + "token_acc": 0.2886720396123051 + }, + { + "epoch": 2.758721782468484, + "grad_norm": 0.2960137923653525, + "learning_rate": 0.0002937038307272124, + "loss": 3.1451096534729004, + "step": 4706, + "token_acc": 0.28318405904215443 + }, + { + "epoch": 2.7593081207856933, + "grad_norm": 0.31617252197368517, + "learning_rate": 0.0002936996622198507, + "loss": 3.101792335510254, + "step": 4707, + "token_acc": 0.28962171868090636 + }, + { + "epoch": 2.7598944591029024, + "grad_norm": 0.31785461636081586, + "learning_rate": 0.0002936954923626243, + "loss": 3.0967421531677246, + "step": 4708, + "token_acc": 0.28813238923905354 + }, + { + "epoch": 2.7604807974201115, + "grad_norm": 0.30820592681765696, + "learning_rate": 0.00029369132115557235, + "loss": 3.11812162399292, + "step": 4709, + "token_acc": 0.2870730017426358 + }, + { + "epoch": 2.7610671357373207, + "grad_norm": 0.3311623308280372, + "learning_rate": 0.00029368714859873406, + "loss": 3.122946262359619, + "step": 4710, + "token_acc": 0.28461078304392684 + }, + { + "epoch": 2.7616534740545293, + "grad_norm": 0.33037428075273734, + "learning_rate": 0.00029368297469214863, + "loss": 3.1733055114746094, + "step": 4711, + "token_acc": 0.2784378586376831 + }, + { + "epoch": 2.7622398123717384, + "grad_norm": 0.319498511795853, + "learning_rate": 0.0002936787994358553, + "loss": 3.1295394897460938, + "step": 4712, + "token_acc": 0.2837164147113336 + }, + { + "epoch": 2.7628261506889475, + "grad_norm": 0.2674550887606505, + "learning_rate": 0.00029367462282989324, + "loss": 3.122802257537842, + "step": 4713, + "token_acc": 0.28452427572220307 + }, + { + "epoch": 2.7634124890061567, + "grad_norm": 0.26906438881942324, + "learning_rate": 0.0002936704448743017, + "loss": 3.1374967098236084, + "step": 4714, + "token_acc": 0.2831088194939622 + }, + { + "epoch": 2.7639988273233653, + "grad_norm": 0.3127751302166687, + "learning_rate": 0.0002936662655691199, + "loss": 3.1238365173339844, + "step": 4715, + "token_acc": 0.2840411002616632 + }, + { + "epoch": 2.7645851656405744, + "grad_norm": 0.3025758184607278, + "learning_rate": 0.0002936620849143872, + "loss": 3.1010937690734863, + "step": 4716, + "token_acc": 0.287815059612283 + }, + { + "epoch": 2.7651715039577835, + "grad_norm": 0.30843863847090003, + "learning_rate": 0.0002936579029101427, + "loss": 3.132065773010254, + "step": 4717, + "token_acc": 0.28480036571812906 + }, + { + "epoch": 2.7657578422749927, + "grad_norm": 0.3256695391131964, + "learning_rate": 0.0002936537195564259, + "loss": 3.1091110706329346, + "step": 4718, + "token_acc": 0.2887694879229047 + }, + { + "epoch": 2.7663441805922018, + "grad_norm": 0.2895605700306016, + "learning_rate": 0.00029364953485327587, + "loss": 3.1116220951080322, + "step": 4719, + "token_acc": 0.2867558742083909 + }, + { + "epoch": 2.766930518909411, + "grad_norm": 0.29863988416421283, + "learning_rate": 0.0002936453488007321, + "loss": 3.1028687953948975, + "step": 4720, + "token_acc": 0.28861646139508595 + }, + { + "epoch": 2.76751685722662, + "grad_norm": 0.2720282939457676, + "learning_rate": 0.00029364116139883384, + "loss": 3.0520944595336914, + "step": 4721, + "token_acc": 0.2949303692347004 + }, + { + "epoch": 2.7681031955438287, + "grad_norm": 0.28722570662591207, + "learning_rate": 0.0002936369726476204, + "loss": 3.0994036197662354, + "step": 4722, + "token_acc": 0.2882835377715889 + }, + { + "epoch": 2.7686895338610378, + "grad_norm": 0.2872564167044778, + "learning_rate": 0.00029363278254713115, + "loss": 3.086535930633545, + "step": 4723, + "token_acc": 0.29169510984237307 + }, + { + "epoch": 2.769275872178247, + "grad_norm": 0.2935175474865802, + "learning_rate": 0.0002936285910974055, + "loss": 3.1063809394836426, + "step": 4724, + "token_acc": 0.28766629857495923 + }, + { + "epoch": 2.769862210495456, + "grad_norm": 0.3183247809477237, + "learning_rate": 0.0002936243982984827, + "loss": 3.1144137382507324, + "step": 4725, + "token_acc": 0.28615390039309896 + }, + { + "epoch": 2.7704485488126647, + "grad_norm": 0.27868157717593633, + "learning_rate": 0.0002936202041504023, + "loss": 3.1402106285095215, + "step": 4726, + "token_acc": 0.28274125978223885 + }, + { + "epoch": 2.771034887129874, + "grad_norm": 0.3002997259143058, + "learning_rate": 0.00029361600865320355, + "loss": 3.1407697200775146, + "step": 4727, + "token_acc": 0.2826597646667498 + }, + { + "epoch": 2.771621225447083, + "grad_norm": 0.35127357628587, + "learning_rate": 0.000293611811806926, + "loss": 3.1081745624542236, + "step": 4728, + "token_acc": 0.2871727064881066 + }, + { + "epoch": 2.772207563764292, + "grad_norm": 0.34780037211783155, + "learning_rate": 0.00029360761361160893, + "loss": 3.1781954765319824, + "step": 4729, + "token_acc": 0.2780793865629513 + }, + { + "epoch": 2.772793902081501, + "grad_norm": 0.3109802659415708, + "learning_rate": 0.0002936034140672918, + "loss": 3.1407785415649414, + "step": 4730, + "token_acc": 0.28216742668199385 + }, + { + "epoch": 2.7733802403987102, + "grad_norm": 0.2945505158065512, + "learning_rate": 0.00029359921317401416, + "loss": 3.0765645503997803, + "step": 4731, + "token_acc": 0.29188713894422313 + }, + { + "epoch": 2.7739665787159193, + "grad_norm": 0.2944579371613482, + "learning_rate": 0.00029359501093181547, + "loss": 3.100515842437744, + "step": 4732, + "token_acc": 0.2893097380107402 + }, + { + "epoch": 2.774552917033128, + "grad_norm": 0.30057316987428545, + "learning_rate": 0.0002935908073407351, + "loss": 3.1424355506896973, + "step": 4733, + "token_acc": 0.28182488831109875 + }, + { + "epoch": 2.775139255350337, + "grad_norm": 0.2758452216064872, + "learning_rate": 0.00029358660240081253, + "loss": 3.093410015106201, + "step": 4734, + "token_acc": 0.2886025613241068 + }, + { + "epoch": 2.7757255936675462, + "grad_norm": 0.27295858313573956, + "learning_rate": 0.00029358239611208744, + "loss": 3.1155290603637695, + "step": 4735, + "token_acc": 0.2855256120035591 + }, + { + "epoch": 2.7763119319847553, + "grad_norm": 0.3090229536327913, + "learning_rate": 0.0002935781884745991, + "loss": 3.1167690753936768, + "step": 4736, + "token_acc": 0.28563907620603424 + }, + { + "epoch": 2.776898270301964, + "grad_norm": 0.3131120023804289, + "learning_rate": 0.00029357397948838725, + "loss": 3.117898464202881, + "step": 4737, + "token_acc": 0.2858612347363679 + }, + { + "epoch": 2.777484608619173, + "grad_norm": 0.3249700613060513, + "learning_rate": 0.00029356976915349126, + "loss": 3.152148485183716, + "step": 4738, + "token_acc": 0.28047536017613073 + }, + { + "epoch": 2.7780709469363822, + "grad_norm": 0.2904561331351648, + "learning_rate": 0.00029356555746995076, + "loss": 3.0702338218688965, + "step": 4739, + "token_acc": 0.2927292264555936 + }, + { + "epoch": 2.7786572852535913, + "grad_norm": 0.2570788723343336, + "learning_rate": 0.00029356134443780533, + "loss": 3.0861129760742188, + "step": 4740, + "token_acc": 0.29051091618643665 + }, + { + "epoch": 2.7792436235708005, + "grad_norm": 0.3137862369768742, + "learning_rate": 0.0002935571300570945, + "loss": 3.151841163635254, + "step": 4741, + "token_acc": 0.2815171892867259 + }, + { + "epoch": 2.7798299618880096, + "grad_norm": 0.27598643946103335, + "learning_rate": 0.0002935529143278579, + "loss": 3.1331369876861572, + "step": 4742, + "token_acc": 0.2831885082321636 + }, + { + "epoch": 2.7804163002052187, + "grad_norm": 0.24798211082959642, + "learning_rate": 0.0002935486972501351, + "loss": 3.1088995933532715, + "step": 4743, + "token_acc": 0.28703560091406355 + }, + { + "epoch": 2.7810026385224274, + "grad_norm": 0.30227551263695224, + "learning_rate": 0.00029354447882396574, + "loss": 3.112123489379883, + "step": 4744, + "token_acc": 0.284384686878547 + }, + { + "epoch": 2.7815889768396365, + "grad_norm": 0.2636358798251168, + "learning_rate": 0.0002935402590493894, + "loss": 3.0898818969726562, + "step": 4745, + "token_acc": 0.29021375916803793 + }, + { + "epoch": 2.7821753151568456, + "grad_norm": 0.2796744194081281, + "learning_rate": 0.00029353603792644573, + "loss": 3.1446924209594727, + "step": 4746, + "token_acc": 0.28264945951908227 + }, + { + "epoch": 2.7827616534740547, + "grad_norm": 0.2943004441812503, + "learning_rate": 0.00029353181545517445, + "loss": 3.1216249465942383, + "step": 4747, + "token_acc": 0.2849114084084323 + }, + { + "epoch": 2.7833479917912634, + "grad_norm": 0.33636066193253317, + "learning_rate": 0.00029352759163561514, + "loss": 3.1289706230163574, + "step": 4748, + "token_acc": 0.28333978584587244 + }, + { + "epoch": 2.7839343301084725, + "grad_norm": 0.2893669039609805, + "learning_rate": 0.00029352336646780756, + "loss": 3.127286434173584, + "step": 4749, + "token_acc": 0.2824601788154205 + }, + { + "epoch": 2.7845206684256816, + "grad_norm": 0.3374730069499942, + "learning_rate": 0.0002935191399517913, + "loss": 3.092006206512451, + "step": 4750, + "token_acc": 0.2904185272882335 + }, + { + "epoch": 2.7851070067428907, + "grad_norm": 0.3057522873725431, + "learning_rate": 0.00029351491208760616, + "loss": 3.156360149383545, + "step": 4751, + "token_acc": 0.28291448413866166 + }, + { + "epoch": 2.7856933450601, + "grad_norm": 0.3643003397270055, + "learning_rate": 0.0002935106828752917, + "loss": 3.106804847717285, + "step": 4752, + "token_acc": 0.28813194141636955 + }, + { + "epoch": 2.786279683377309, + "grad_norm": 0.2996529395680403, + "learning_rate": 0.00029350645231488793, + "loss": 3.099357843399048, + "step": 4753, + "token_acc": 0.2861523795593194 + }, + { + "epoch": 2.7868660216945176, + "grad_norm": 0.3345417177081686, + "learning_rate": 0.0002935022204064343, + "loss": 3.0993685722351074, + "step": 4754, + "token_acc": 0.2868112464576929 + }, + { + "epoch": 2.7874523600117267, + "grad_norm": 0.3385383767789251, + "learning_rate": 0.0002934979871499707, + "loss": 3.1421968936920166, + "step": 4755, + "token_acc": 0.28173475794912967 + }, + { + "epoch": 2.788038698328936, + "grad_norm": 0.3466329116364996, + "learning_rate": 0.0002934937525455369, + "loss": 3.175919532775879, + "step": 4756, + "token_acc": 0.2766175439677685 + }, + { + "epoch": 2.788625036646145, + "grad_norm": 0.32674806265435374, + "learning_rate": 0.00029348951659317267, + "loss": 3.128875255584717, + "step": 4757, + "token_acc": 0.28340863858856435 + }, + { + "epoch": 2.7892113749633536, + "grad_norm": 0.3008083369200603, + "learning_rate": 0.00029348527929291775, + "loss": 3.107044219970703, + "step": 4758, + "token_acc": 0.28552801101205677 + }, + { + "epoch": 2.7897977132805627, + "grad_norm": 0.25271317538884214, + "learning_rate": 0.00029348104064481196, + "loss": 3.1241354942321777, + "step": 4759, + "token_acc": 0.28352744812962216 + }, + { + "epoch": 2.790384051597772, + "grad_norm": 0.3112187424202964, + "learning_rate": 0.0002934768006488952, + "loss": 3.0990638732910156, + "step": 4760, + "token_acc": 0.2885161603682938 + }, + { + "epoch": 2.790970389914981, + "grad_norm": 0.2902737715054049, + "learning_rate": 0.0002934725593052072, + "loss": 3.0783305168151855, + "step": 4761, + "token_acc": 0.2897023372812074 + }, + { + "epoch": 2.79155672823219, + "grad_norm": 0.25972198582048506, + "learning_rate": 0.0002934683166137878, + "loss": 3.1035685539245605, + "step": 4762, + "token_acc": 0.2892015099860235 + }, + { + "epoch": 2.792143066549399, + "grad_norm": 0.3057722291314643, + "learning_rate": 0.0002934640725746769, + "loss": 3.105126142501831, + "step": 4763, + "token_acc": 0.28782926816411086 + }, + { + "epoch": 2.7927294048666083, + "grad_norm": 0.3463992417814066, + "learning_rate": 0.00029345982718791445, + "loss": 3.0963521003723145, + "step": 4764, + "token_acc": 0.2886228188744163 + }, + { + "epoch": 2.793315743183817, + "grad_norm": 0.3477265486662584, + "learning_rate": 0.0002934555804535402, + "loss": 3.1557483673095703, + "step": 4765, + "token_acc": 0.28115957432674804 + }, + { + "epoch": 2.793902081501026, + "grad_norm": 0.3340291466908199, + "learning_rate": 0.000293451332371594, + "loss": 3.1081161499023438, + "step": 4766, + "token_acc": 0.2871892972932686 + }, + { + "epoch": 2.794488419818235, + "grad_norm": 0.2779540440360137, + "learning_rate": 0.0002934470829421159, + "loss": 3.1169052124023438, + "step": 4767, + "token_acc": 0.28599897630164284 + }, + { + "epoch": 2.7950747581354443, + "grad_norm": 0.3122197730224559, + "learning_rate": 0.00029344283216514575, + "loss": 3.1311275959014893, + "step": 4768, + "token_acc": 0.2826014770121008 + }, + { + "epoch": 2.795661096452653, + "grad_norm": 0.30956000606854917, + "learning_rate": 0.0002934385800407235, + "loss": 3.1180009841918945, + "step": 4769, + "token_acc": 0.28413777903718396 + }, + { + "epoch": 2.796247434769862, + "grad_norm": 0.3233400791004117, + "learning_rate": 0.00029343432656888903, + "loss": 3.1452269554138184, + "step": 4770, + "token_acc": 0.28249778293572153 + }, + { + "epoch": 2.796833773087071, + "grad_norm": 0.2903950503319801, + "learning_rate": 0.0002934300717496824, + "loss": 3.1585071086883545, + "step": 4771, + "token_acc": 0.27954393943785555 + }, + { + "epoch": 2.7974201114042803, + "grad_norm": 0.28406839651697097, + "learning_rate": 0.0002934258155831435, + "loss": 3.1356396675109863, + "step": 4772, + "token_acc": 0.2832308692734767 + }, + { + "epoch": 2.7980064497214894, + "grad_norm": 0.3049935977384799, + "learning_rate": 0.00029342155806931226, + "loss": 3.1156716346740723, + "step": 4773, + "token_acc": 0.286786067069376 + }, + { + "epoch": 2.7985927880386985, + "grad_norm": 0.29805189566895046, + "learning_rate": 0.0002934172992082288, + "loss": 3.1358954906463623, + "step": 4774, + "token_acc": 0.28252401427854723 + }, + { + "epoch": 2.7991791263559076, + "grad_norm": 0.2780179268006769, + "learning_rate": 0.00029341303899993313, + "loss": 3.1307790279388428, + "step": 4775, + "token_acc": 0.28324462258193395 + }, + { + "epoch": 2.7997654646731163, + "grad_norm": 0.25264320977355403, + "learning_rate": 0.00029340877744446514, + "loss": 3.109707832336426, + "step": 4776, + "token_acc": 0.2857253570999078 + }, + { + "epoch": 2.8003518029903254, + "grad_norm": 0.2633520661089611, + "learning_rate": 0.0002934045145418649, + "loss": 3.1307430267333984, + "step": 4777, + "token_acc": 0.283065705588884 + }, + { + "epoch": 2.8009381413075345, + "grad_norm": 0.30291166318612983, + "learning_rate": 0.00029340025029217254, + "loss": 3.1443891525268555, + "step": 4778, + "token_acc": 0.28039302540973704 + }, + { + "epoch": 2.8015244796247436, + "grad_norm": 0.27249199423193987, + "learning_rate": 0.00029339598469542807, + "loss": 3.100367546081543, + "step": 4779, + "token_acc": 0.28946103819826363 + }, + { + "epoch": 2.8021108179419523, + "grad_norm": 0.3247226014044503, + "learning_rate": 0.0002933917177516715, + "loss": 3.148219108581543, + "step": 4780, + "token_acc": 0.28245314892461926 + }, + { + "epoch": 2.8026971562591614, + "grad_norm": 0.32783183596603255, + "learning_rate": 0.00029338744946094306, + "loss": 3.1216397285461426, + "step": 4781, + "token_acc": 0.28523994568546046 + }, + { + "epoch": 2.8032834945763705, + "grad_norm": 0.27548878007703953, + "learning_rate": 0.00029338317982328265, + "loss": 3.0968270301818848, + "step": 4782, + "token_acc": 0.2869531156777397 + }, + { + "epoch": 2.8038698328935796, + "grad_norm": 0.31312100757531597, + "learning_rate": 0.0002933789088387306, + "loss": 3.1257262229919434, + "step": 4783, + "token_acc": 0.2835693752142838 + }, + { + "epoch": 2.8044561712107887, + "grad_norm": 0.2941055296799855, + "learning_rate": 0.00029337463650732677, + "loss": 3.1181437969207764, + "step": 4784, + "token_acc": 0.28493374896379303 + }, + { + "epoch": 2.805042509527998, + "grad_norm": 0.2901696878023032, + "learning_rate": 0.0002933703628291115, + "loss": 3.1176421642303467, + "step": 4785, + "token_acc": 0.2856336055203573 + }, + { + "epoch": 2.805628847845207, + "grad_norm": 0.3111190769110065, + "learning_rate": 0.00029336608780412485, + "loss": 3.090714454650879, + "step": 4786, + "token_acc": 0.2897794207255149 + }, + { + "epoch": 2.8062151861624156, + "grad_norm": 0.3001351590770675, + "learning_rate": 0.00029336181143240696, + "loss": 3.086665153503418, + "step": 4787, + "token_acc": 0.2895817036895571 + }, + { + "epoch": 2.8068015244796247, + "grad_norm": 0.2707266552105435, + "learning_rate": 0.0002933575337139981, + "loss": 3.123156785964966, + "step": 4788, + "token_acc": 0.2847980163348326 + }, + { + "epoch": 2.807387862796834, + "grad_norm": 0.30095325177062243, + "learning_rate": 0.0002933532546489383, + "loss": 3.124725341796875, + "step": 4789, + "token_acc": 0.28469981777954617 + }, + { + "epoch": 2.807974201114043, + "grad_norm": 0.3459558116493978, + "learning_rate": 0.000293348974237268, + "loss": 3.120318651199341, + "step": 4790, + "token_acc": 0.28479895492329105 + }, + { + "epoch": 2.8085605394312516, + "grad_norm": 0.3167117940720076, + "learning_rate": 0.0002933446924790271, + "loss": 3.1116929054260254, + "step": 4791, + "token_acc": 0.28677492521273684 + }, + { + "epoch": 2.8091468777484607, + "grad_norm": 0.2635443532595926, + "learning_rate": 0.000293340409374256, + "loss": 3.1197524070739746, + "step": 4792, + "token_acc": 0.2830235439900867 + }, + { + "epoch": 2.80973321606567, + "grad_norm": 0.26755544258369857, + "learning_rate": 0.00029333612492299496, + "loss": 3.120893716812134, + "step": 4793, + "token_acc": 0.2855528925686526 + }, + { + "epoch": 2.810319554382879, + "grad_norm": 0.2915483329710046, + "learning_rate": 0.0002933318391252841, + "loss": 3.1189095973968506, + "step": 4794, + "token_acc": 0.2845259044267203 + }, + { + "epoch": 2.810905892700088, + "grad_norm": 0.30296234677748024, + "learning_rate": 0.0002933275519811638, + "loss": 3.1456851959228516, + "step": 4795, + "token_acc": 0.28068083675356736 + }, + { + "epoch": 2.811492231017297, + "grad_norm": 0.27627042563219706, + "learning_rate": 0.00029332326349067433, + "loss": 3.095336437225342, + "step": 4796, + "token_acc": 0.28799177033443357 + }, + { + "epoch": 2.8120785693345063, + "grad_norm": 0.2902708852995741, + "learning_rate": 0.0002933189736538559, + "loss": 3.0547122955322266, + "step": 4797, + "token_acc": 0.2945795135364472 + }, + { + "epoch": 2.812664907651715, + "grad_norm": 0.3061769651257533, + "learning_rate": 0.0002933146824707488, + "loss": 3.1480655670166016, + "step": 4798, + "token_acc": 0.28096164514137933 + }, + { + "epoch": 2.813251245968924, + "grad_norm": 0.25348439848553617, + "learning_rate": 0.0002933103899413934, + "loss": 3.12548828125, + "step": 4799, + "token_acc": 0.28507425277273013 + }, + { + "epoch": 2.813837584286133, + "grad_norm": 0.27607788125427285, + "learning_rate": 0.00029330609606583, + "loss": 3.1160919666290283, + "step": 4800, + "token_acc": 0.28696695038225806 + }, + { + "epoch": 2.8144239226033423, + "grad_norm": 0.2884032026910037, + "learning_rate": 0.0002933018008440989, + "loss": 3.125800609588623, + "step": 4801, + "token_acc": 0.28430531841534074 + }, + { + "epoch": 2.815010260920551, + "grad_norm": 0.3033818339819906, + "learning_rate": 0.00029329750427624054, + "loss": 3.0998997688293457, + "step": 4802, + "token_acc": 0.2864337751157555 + }, + { + "epoch": 2.81559659923776, + "grad_norm": 0.2818880447259213, + "learning_rate": 0.00029329320636229517, + "loss": 3.116429328918457, + "step": 4803, + "token_acc": 0.2863871521777834 + }, + { + "epoch": 2.816182937554969, + "grad_norm": 0.3005505760779356, + "learning_rate": 0.00029328890710230327, + "loss": 3.072636604309082, + "step": 4804, + "token_acc": 0.29073063659580667 + }, + { + "epoch": 2.8167692758721783, + "grad_norm": 0.3096254094953349, + "learning_rate": 0.00029328460649630516, + "loss": 3.088721752166748, + "step": 4805, + "token_acc": 0.2900315601511565 + }, + { + "epoch": 2.8173556141893874, + "grad_norm": 0.3229874730198362, + "learning_rate": 0.0002932803045443412, + "loss": 3.1045455932617188, + "step": 4806, + "token_acc": 0.286118006886878 + }, + { + "epoch": 2.8179419525065965, + "grad_norm": 0.32704077078975075, + "learning_rate": 0.0002932760012464519, + "loss": 3.1075010299682617, + "step": 4807, + "token_acc": 0.2876784284526766 + }, + { + "epoch": 2.818528290823805, + "grad_norm": 0.2995854010589223, + "learning_rate": 0.0002932716966026776, + "loss": 3.160248279571533, + "step": 4808, + "token_acc": 0.27959627337537923 + }, + { + "epoch": 2.8191146291410143, + "grad_norm": 0.2938102615170772, + "learning_rate": 0.0002932673906130588, + "loss": 3.0863037109375, + "step": 4809, + "token_acc": 0.2891724337175678 + }, + { + "epoch": 2.8197009674582234, + "grad_norm": 0.3118335607623899, + "learning_rate": 0.0002932630832776359, + "loss": 3.0921542644500732, + "step": 4810, + "token_acc": 0.28992817631783746 + }, + { + "epoch": 2.8202873057754325, + "grad_norm": 0.3094380873333429, + "learning_rate": 0.00029325877459644944, + "loss": 3.1427063941955566, + "step": 4811, + "token_acc": 0.28129413438189005 + }, + { + "epoch": 2.820873644092641, + "grad_norm": 0.2870615448386523, + "learning_rate": 0.0002932544645695398, + "loss": 3.13639497756958, + "step": 4812, + "token_acc": 0.28281617872187675 + }, + { + "epoch": 2.8214599824098503, + "grad_norm": 0.34447847084531236, + "learning_rate": 0.00029325015319694753, + "loss": 3.123680591583252, + "step": 4813, + "token_acc": 0.28504291601681525 + }, + { + "epoch": 2.8220463207270594, + "grad_norm": 0.3116127655316353, + "learning_rate": 0.000293245840478713, + "loss": 3.1213512420654297, + "step": 4814, + "token_acc": 0.28631178804491025 + }, + { + "epoch": 2.8226326590442685, + "grad_norm": 0.25576737437872527, + "learning_rate": 0.00029324152641487693, + "loss": 3.138035297393799, + "step": 4815, + "token_acc": 0.2847776384496868 + }, + { + "epoch": 2.8232189973614776, + "grad_norm": 0.25787697324500075, + "learning_rate": 0.0002932372110054797, + "loss": 3.12288498878479, + "step": 4816, + "token_acc": 0.2847317122186495 + }, + { + "epoch": 2.8238053356786867, + "grad_norm": 0.2992994459623034, + "learning_rate": 0.0002932328942505619, + "loss": 3.1149678230285645, + "step": 4817, + "token_acc": 0.2863905972393057 + }, + { + "epoch": 2.824391673995896, + "grad_norm": 0.2731976209267566, + "learning_rate": 0.00029322857615016407, + "loss": 3.118638753890991, + "step": 4818, + "token_acc": 0.2851220271218267 + }, + { + "epoch": 2.8249780123131045, + "grad_norm": 0.2523618647440558, + "learning_rate": 0.00029322425670432676, + "loss": 3.1266844272613525, + "step": 4819, + "token_acc": 0.28406320305551197 + }, + { + "epoch": 2.8255643506303136, + "grad_norm": 0.2741901247567916, + "learning_rate": 0.0002932199359130905, + "loss": 3.121201992034912, + "step": 4820, + "token_acc": 0.2859980293522792 + }, + { + "epoch": 2.8261506889475227, + "grad_norm": 0.2512626865801087, + "learning_rate": 0.00029321561377649604, + "loss": 3.0935726165771484, + "step": 4821, + "token_acc": 0.2865146230211967 + }, + { + "epoch": 2.826737027264732, + "grad_norm": 0.27293136031161014, + "learning_rate": 0.0002932112902945838, + "loss": 3.071885585784912, + "step": 4822, + "token_acc": 0.2919371286632777 + }, + { + "epoch": 2.8273233655819405, + "grad_norm": 0.3274905247227663, + "learning_rate": 0.0002932069654673945, + "loss": 3.0848751068115234, + "step": 4823, + "token_acc": 0.2890657331818529 + }, + { + "epoch": 2.8279097038991496, + "grad_norm": 0.2800857854942314, + "learning_rate": 0.00029320263929496874, + "loss": 3.117919445037842, + "step": 4824, + "token_acc": 0.28667495596311265 + }, + { + "epoch": 2.8284960422163588, + "grad_norm": 0.29551091979071137, + "learning_rate": 0.0002931983117773471, + "loss": 3.14457106590271, + "step": 4825, + "token_acc": 0.2818612118657492 + }, + { + "epoch": 2.829082380533568, + "grad_norm": 0.2992764607681541, + "learning_rate": 0.00029319398291457034, + "loss": 3.126473903656006, + "step": 4826, + "token_acc": 0.28361635593555795 + }, + { + "epoch": 2.829668718850777, + "grad_norm": 0.27993474524943124, + "learning_rate": 0.00029318965270667903, + "loss": 3.081585645675659, + "step": 4827, + "token_acc": 0.2912369465640949 + }, + { + "epoch": 2.830255057167986, + "grad_norm": 0.28834165412562, + "learning_rate": 0.0002931853211537139, + "loss": 3.1001791954040527, + "step": 4828, + "token_acc": 0.28575067024128686 + }, + { + "epoch": 2.830841395485195, + "grad_norm": 0.30523317654819143, + "learning_rate": 0.00029318098825571563, + "loss": 3.1548798084259033, + "step": 4829, + "token_acc": 0.2802185968180981 + }, + { + "epoch": 2.831427733802404, + "grad_norm": 0.26778805689058516, + "learning_rate": 0.0002931766540127249, + "loss": 3.068540096282959, + "step": 4830, + "token_acc": 0.292444714336319 + }, + { + "epoch": 2.832014072119613, + "grad_norm": 0.26983028423714406, + "learning_rate": 0.00029317231842478244, + "loss": 3.1479101181030273, + "step": 4831, + "token_acc": 0.2824498268030677 + }, + { + "epoch": 2.832600410436822, + "grad_norm": 0.29022877944336695, + "learning_rate": 0.00029316798149192896, + "loss": 3.0813260078430176, + "step": 4832, + "token_acc": 0.29194514465437893 + }, + { + "epoch": 2.833186748754031, + "grad_norm": 0.2827107637912602, + "learning_rate": 0.00029316364321420524, + "loss": 3.094301223754883, + "step": 4833, + "token_acc": 0.28942929357174063 + }, + { + "epoch": 2.83377308707124, + "grad_norm": 0.2914796846949472, + "learning_rate": 0.000293159303591652, + "loss": 3.1143198013305664, + "step": 4834, + "token_acc": 0.28593511842347064 + }, + { + "epoch": 2.834359425388449, + "grad_norm": 0.3298816808976584, + "learning_rate": 0.00029315496262431, + "loss": 3.13633394241333, + "step": 4835, + "token_acc": 0.2843331502574184 + }, + { + "epoch": 2.834945763705658, + "grad_norm": 0.27119629383261534, + "learning_rate": 0.0002931506203122201, + "loss": 3.0696592330932617, + "step": 4836, + "token_acc": 0.29377861143539225 + }, + { + "epoch": 2.835532102022867, + "grad_norm": 0.30175032629848947, + "learning_rate": 0.00029314627665542295, + "loss": 3.1369216442108154, + "step": 4837, + "token_acc": 0.28312175486753405 + }, + { + "epoch": 2.8361184403400763, + "grad_norm": 0.302949311866685, + "learning_rate": 0.00029314193165395946, + "loss": 3.1368658542633057, + "step": 4838, + "token_acc": 0.28322248389016846 + }, + { + "epoch": 2.8367047786572854, + "grad_norm": 0.30883963838799494, + "learning_rate": 0.0002931375853078703, + "loss": 3.1040472984313965, + "step": 4839, + "token_acc": 0.2867527979782351 + }, + { + "epoch": 2.8372911169744945, + "grad_norm": 0.2827985693863901, + "learning_rate": 0.00029313323761719654, + "loss": 3.1218366622924805, + "step": 4840, + "token_acc": 0.2836967814379112 + }, + { + "epoch": 2.837877455291703, + "grad_norm": 0.2681569708886454, + "learning_rate": 0.00029312888858197886, + "loss": 3.1114094257354736, + "step": 4841, + "token_acc": 0.28511330755505787 + }, + { + "epoch": 2.8384637936089123, + "grad_norm": 0.3264772174141351, + "learning_rate": 0.0002931245382022581, + "loss": 3.1159815788269043, + "step": 4842, + "token_acc": 0.28625545539829506 + }, + { + "epoch": 2.8390501319261214, + "grad_norm": 0.319322879288859, + "learning_rate": 0.0002931201864780752, + "loss": 3.1065726280212402, + "step": 4843, + "token_acc": 0.285871300141679 + }, + { + "epoch": 2.8396364702433305, + "grad_norm": 0.2650891938940965, + "learning_rate": 0.000293115833409471, + "loss": 3.0474016666412354, + "step": 4844, + "token_acc": 0.295455126206745 + }, + { + "epoch": 2.840222808560539, + "grad_norm": 0.3119062335788396, + "learning_rate": 0.00029311147899648633, + "loss": 3.1173930168151855, + "step": 4845, + "token_acc": 0.28416118143890845 + }, + { + "epoch": 2.8408091468777483, + "grad_norm": 0.28850440755766965, + "learning_rate": 0.0002931071232391623, + "loss": 3.1064767837524414, + "step": 4846, + "token_acc": 0.28648632904554866 + }, + { + "epoch": 2.8413954851949574, + "grad_norm": 0.34849386978601515, + "learning_rate": 0.00029310276613753953, + "loss": 3.077314853668213, + "step": 4847, + "token_acc": 0.2921700015028788 + }, + { + "epoch": 2.8419818235121665, + "grad_norm": 0.31798818534025625, + "learning_rate": 0.0002930984076916592, + "loss": 3.1515605449676514, + "step": 4848, + "token_acc": 0.2810243967207225 + }, + { + "epoch": 2.8425681618293757, + "grad_norm": 0.29246374304269834, + "learning_rate": 0.00029309404790156215, + "loss": 3.1486268043518066, + "step": 4849, + "token_acc": 0.2804683665229301 + }, + { + "epoch": 2.8431545001465848, + "grad_norm": 0.3309723062237501, + "learning_rate": 0.0002930896867672893, + "loss": 3.158156394958496, + "step": 4850, + "token_acc": 0.27954230235783634 + }, + { + "epoch": 2.843740838463794, + "grad_norm": 0.3424735959826253, + "learning_rate": 0.00029308532428888167, + "loss": 3.109546184539795, + "step": 4851, + "token_acc": 0.28682783827206987 + }, + { + "epoch": 2.8443271767810026, + "grad_norm": 0.3242010601750907, + "learning_rate": 0.0002930809604663803, + "loss": 3.1344964504241943, + "step": 4852, + "token_acc": 0.28407193895159333 + }, + { + "epoch": 2.8449135150982117, + "grad_norm": 0.2900361692375067, + "learning_rate": 0.000293076595299826, + "loss": 3.083594560623169, + "step": 4853, + "token_acc": 0.29146301464143803 + }, + { + "epoch": 2.8454998534154208, + "grad_norm": 0.31932065749919647, + "learning_rate": 0.00029307222878925996, + "loss": 3.1174509525299072, + "step": 4854, + "token_acc": 0.28473392024594435 + }, + { + "epoch": 2.84608619173263, + "grad_norm": 0.31590827138962746, + "learning_rate": 0.0002930678609347231, + "loss": 3.141587972640991, + "step": 4855, + "token_acc": 0.2844984834387304 + }, + { + "epoch": 2.8466725300498386, + "grad_norm": 0.27427109296086205, + "learning_rate": 0.00029306349173625646, + "loss": 3.107347011566162, + "step": 4856, + "token_acc": 0.28620114738818214 + }, + { + "epoch": 2.8472588683670477, + "grad_norm": 0.3118988287136445, + "learning_rate": 0.00029305912119390113, + "loss": 3.106328248977661, + "step": 4857, + "token_acc": 0.286497204558962 + }, + { + "epoch": 2.847845206684257, + "grad_norm": 0.318315578909619, + "learning_rate": 0.00029305474930769814, + "loss": 3.148857593536377, + "step": 4858, + "token_acc": 0.2800166847672653 + }, + { + "epoch": 2.848431545001466, + "grad_norm": 0.298690396112282, + "learning_rate": 0.00029305037607768846, + "loss": 3.1197948455810547, + "step": 4859, + "token_acc": 0.28621684219726434 + }, + { + "epoch": 2.849017883318675, + "grad_norm": 0.29569161439209757, + "learning_rate": 0.00029304600150391335, + "loss": 3.128751277923584, + "step": 4860, + "token_acc": 0.286149257280512 + }, + { + "epoch": 2.849604221635884, + "grad_norm": 0.2940780428381767, + "learning_rate": 0.00029304162558641374, + "loss": 3.104583740234375, + "step": 4861, + "token_acc": 0.28515424587252514 + }, + { + "epoch": 2.850190559953093, + "grad_norm": 0.30763720723307464, + "learning_rate": 0.0002930372483252309, + "loss": 3.1395905017852783, + "step": 4862, + "token_acc": 0.2815394177975759 + }, + { + "epoch": 2.850776898270302, + "grad_norm": 0.2969169871594266, + "learning_rate": 0.00029303286972040576, + "loss": 3.1214871406555176, + "step": 4863, + "token_acc": 0.2862974108393171 + }, + { + "epoch": 2.851363236587511, + "grad_norm": 0.312502708456956, + "learning_rate": 0.0002930284897719796, + "loss": 3.07090163230896, + "step": 4864, + "token_acc": 0.29187167578669854 + }, + { + "epoch": 2.85194957490472, + "grad_norm": 0.29267313294868036, + "learning_rate": 0.00029302410847999356, + "loss": 3.1061668395996094, + "step": 4865, + "token_acc": 0.2863634445581425 + }, + { + "epoch": 2.852535913221929, + "grad_norm": 0.24357207731341954, + "learning_rate": 0.0002930197258444887, + "loss": 3.1159253120422363, + "step": 4866, + "token_acc": 0.28773236923753065 + }, + { + "epoch": 2.853122251539138, + "grad_norm": 0.2783362557811744, + "learning_rate": 0.0002930153418655062, + "loss": 3.059241533279419, + "step": 4867, + "token_acc": 0.2928322504190838 + }, + { + "epoch": 2.853708589856347, + "grad_norm": 0.2838960647409691, + "learning_rate": 0.0002930109565430873, + "loss": 3.144043445587158, + "step": 4868, + "token_acc": 0.2822934491125016 + }, + { + "epoch": 2.854294928173556, + "grad_norm": 0.2565551470454247, + "learning_rate": 0.0002930065698772732, + "loss": 3.130599021911621, + "step": 4869, + "token_acc": 0.283473805006662 + }, + { + "epoch": 2.8548812664907652, + "grad_norm": 0.2591378229642998, + "learning_rate": 0.00029300218186810505, + "loss": 3.1789751052856445, + "step": 4870, + "token_acc": 0.2761540380081097 + }, + { + "epoch": 2.8554676048079743, + "grad_norm": 0.2521603663592778, + "learning_rate": 0.0002929977925156241, + "loss": 3.1506099700927734, + "step": 4871, + "token_acc": 0.2807923634991304 + }, + { + "epoch": 2.8560539431251835, + "grad_norm": 0.2964112174558061, + "learning_rate": 0.0002929934018198716, + "loss": 3.0899569988250732, + "step": 4872, + "token_acc": 0.287917724771943 + }, + { + "epoch": 2.856640281442392, + "grad_norm": 0.3105701653327218, + "learning_rate": 0.0002929890097808888, + "loss": 3.1177895069122314, + "step": 4873, + "token_acc": 0.28626774354034573 + }, + { + "epoch": 2.8572266197596012, + "grad_norm": 0.2950936350932301, + "learning_rate": 0.0002929846163987169, + "loss": 3.09696102142334, + "step": 4874, + "token_acc": 0.2871042463205912 + }, + { + "epoch": 2.8578129580768104, + "grad_norm": 0.32728913303465995, + "learning_rate": 0.00029298022167339717, + "loss": 3.1100754737854004, + "step": 4875, + "token_acc": 0.2867621918182918 + }, + { + "epoch": 2.8583992963940195, + "grad_norm": 0.28055158717238665, + "learning_rate": 0.000292975825604971, + "loss": 3.1094837188720703, + "step": 4876, + "token_acc": 0.2866343581867925 + }, + { + "epoch": 2.858985634711228, + "grad_norm": 0.28321546417950155, + "learning_rate": 0.0002929714281934796, + "loss": 3.133733034133911, + "step": 4877, + "token_acc": 0.2830215885576091 + }, + { + "epoch": 2.8595719730284372, + "grad_norm": 0.3164221953206162, + "learning_rate": 0.0002929670294389643, + "loss": 3.140878200531006, + "step": 4878, + "token_acc": 0.2818786657806192 + }, + { + "epoch": 2.8601583113456464, + "grad_norm": 0.32065432651004266, + "learning_rate": 0.00029296262934146633, + "loss": 3.1077091693878174, + "step": 4879, + "token_acc": 0.28559167609557956 + }, + { + "epoch": 2.8607446496628555, + "grad_norm": 0.2732841849190749, + "learning_rate": 0.0002929582279010271, + "loss": 3.107861042022705, + "step": 4880, + "token_acc": 0.2858103313039256 + }, + { + "epoch": 2.8613309879800646, + "grad_norm": 0.3052649279221656, + "learning_rate": 0.000292953825117688, + "loss": 3.1658401489257812, + "step": 4881, + "token_acc": 0.2781068206727932 + }, + { + "epoch": 2.8619173262972737, + "grad_norm": 0.2970572470124204, + "learning_rate": 0.0002929494209914904, + "loss": 3.118779182434082, + "step": 4882, + "token_acc": 0.2856068330795206 + }, + { + "epoch": 2.862503664614483, + "grad_norm": 0.31198031160947043, + "learning_rate": 0.0002929450155224756, + "loss": 3.1081762313842773, + "step": 4883, + "token_acc": 0.28829772360410283 + }, + { + "epoch": 2.8630900029316915, + "grad_norm": 0.31601388176329565, + "learning_rate": 0.0002929406087106849, + "loss": 3.1330833435058594, + "step": 4884, + "token_acc": 0.28466973679439295 + }, + { + "epoch": 2.8636763412489006, + "grad_norm": 0.2830113195935163, + "learning_rate": 0.0002929362005561599, + "loss": 3.100548505783081, + "step": 4885, + "token_acc": 0.2873959428923845 + }, + { + "epoch": 2.8642626795661097, + "grad_norm": 0.29921316637741135, + "learning_rate": 0.00029293179105894184, + "loss": 3.125476837158203, + "step": 4886, + "token_acc": 0.2838808884679541 + }, + { + "epoch": 2.864849017883319, + "grad_norm": 0.3335859726263728, + "learning_rate": 0.0002929273802190722, + "loss": 3.0884318351745605, + "step": 4887, + "token_acc": 0.2890766528081912 + }, + { + "epoch": 2.8654353562005275, + "grad_norm": 0.3089060955370666, + "learning_rate": 0.00029292296803659244, + "loss": 3.101276397705078, + "step": 4888, + "token_acc": 0.28810263558303095 + }, + { + "epoch": 2.8660216945177366, + "grad_norm": 0.30047636215590856, + "learning_rate": 0.000292918554511544, + "loss": 3.0985074043273926, + "step": 4889, + "token_acc": 0.28903013283218726 + }, + { + "epoch": 2.8666080328349457, + "grad_norm": 0.3134857563913906, + "learning_rate": 0.00029291413964396834, + "loss": 3.1573214530944824, + "step": 4890, + "token_acc": 0.2812182511341575 + }, + { + "epoch": 2.867194371152155, + "grad_norm": 0.2629266766148198, + "learning_rate": 0.0002929097234339069, + "loss": 3.0910210609436035, + "step": 4891, + "token_acc": 0.29000242822527733 + }, + { + "epoch": 2.867780709469364, + "grad_norm": 0.2915704342228827, + "learning_rate": 0.0002929053058814012, + "loss": 3.0773186683654785, + "step": 4892, + "token_acc": 0.29068148750147466 + }, + { + "epoch": 2.868367047786573, + "grad_norm": 0.3076166870050994, + "learning_rate": 0.0002929008869864927, + "loss": 3.143329620361328, + "step": 4893, + "token_acc": 0.28330985753470883 + }, + { + "epoch": 2.868953386103782, + "grad_norm": 0.27639408230821655, + "learning_rate": 0.0002928964667492229, + "loss": 3.1065673828125, + "step": 4894, + "token_acc": 0.2855755184027919 + }, + { + "epoch": 2.869539724420991, + "grad_norm": 0.2946722825662996, + "learning_rate": 0.0002928920451696334, + "loss": 3.151127338409424, + "step": 4895, + "token_acc": 0.28159459273053317 + }, + { + "epoch": 2.8701260627382, + "grad_norm": 0.2798906964861311, + "learning_rate": 0.00029288762224776566, + "loss": 3.058100938796997, + "step": 4896, + "token_acc": 0.2925107516570819 + }, + { + "epoch": 2.870712401055409, + "grad_norm": 0.3006552208621793, + "learning_rate": 0.0002928831979836612, + "loss": 3.095743179321289, + "step": 4897, + "token_acc": 0.29008229617252523 + }, + { + "epoch": 2.871298739372618, + "grad_norm": 0.2940403268361965, + "learning_rate": 0.00029287877237736177, + "loss": 3.1036224365234375, + "step": 4898, + "token_acc": 0.2869403492618208 + }, + { + "epoch": 2.871885077689827, + "grad_norm": 0.32453748369769375, + "learning_rate": 0.00029287434542890866, + "loss": 3.117668867111206, + "step": 4899, + "token_acc": 0.28508433106306025 + }, + { + "epoch": 2.872471416007036, + "grad_norm": 0.31176107990279756, + "learning_rate": 0.0002928699171383437, + "loss": 3.1080455780029297, + "step": 4900, + "token_acc": 0.28614597007088477 + }, + { + "epoch": 2.873057754324245, + "grad_norm": 0.30313176922850993, + "learning_rate": 0.00029286548750570834, + "loss": 3.0947012901306152, + "step": 4901, + "token_acc": 0.289301459440749 + }, + { + "epoch": 2.873644092641454, + "grad_norm": 0.2965457736440195, + "learning_rate": 0.0002928610565310442, + "loss": 3.1441471576690674, + "step": 4902, + "token_acc": 0.28153570012101736 + }, + { + "epoch": 2.8742304309586633, + "grad_norm": 0.31130153828995155, + "learning_rate": 0.00029285662421439304, + "loss": 3.0831656455993652, + "step": 4903, + "token_acc": 0.28918876510277736 + }, + { + "epoch": 2.8748167692758724, + "grad_norm": 0.31569128621032455, + "learning_rate": 0.00029285219055579637, + "loss": 3.1388185024261475, + "step": 4904, + "token_acc": 0.2811086659383922 + }, + { + "epoch": 2.875403107593081, + "grad_norm": 0.30562116025444946, + "learning_rate": 0.0002928477555552958, + "loss": 3.1299424171447754, + "step": 4905, + "token_acc": 0.2831447828456722 + }, + { + "epoch": 2.87598944591029, + "grad_norm": 0.3040373356176108, + "learning_rate": 0.00029284331921293315, + "loss": 3.138782024383545, + "step": 4906, + "token_acc": 0.2822918825836695 + }, + { + "epoch": 2.8765757842274993, + "grad_norm": 0.27788813694937736, + "learning_rate": 0.0002928388815287499, + "loss": 3.1150121688842773, + "step": 4907, + "token_acc": 0.28705276437461796 + }, + { + "epoch": 2.8771621225447084, + "grad_norm": 0.2913365583270703, + "learning_rate": 0.0002928344425027879, + "loss": 3.093143939971924, + "step": 4908, + "token_acc": 0.2889180873095509 + }, + { + "epoch": 2.8777484608619175, + "grad_norm": 0.29617297110367347, + "learning_rate": 0.00029283000213508876, + "loss": 3.1062145233154297, + "step": 4909, + "token_acc": 0.2859172128492038 + }, + { + "epoch": 2.878334799179126, + "grad_norm": 0.27817404090336395, + "learning_rate": 0.00029282556042569427, + "loss": 3.0848326683044434, + "step": 4910, + "token_acc": 0.28965601502802485 + }, + { + "epoch": 2.8789211374963353, + "grad_norm": 0.2909815980452108, + "learning_rate": 0.00029282111737464605, + "loss": 3.1346516609191895, + "step": 4911, + "token_acc": 0.28332517668355883 + }, + { + "epoch": 2.8795074758135444, + "grad_norm": 0.30011979876552053, + "learning_rate": 0.0002928166729819859, + "loss": 3.1066339015960693, + "step": 4912, + "token_acc": 0.28799454261054014 + }, + { + "epoch": 2.8800938141307535, + "grad_norm": 0.30858597954703165, + "learning_rate": 0.00029281222724775554, + "loss": 3.1208858489990234, + "step": 4913, + "token_acc": 0.28421937001563974 + }, + { + "epoch": 2.8806801524479626, + "grad_norm": 0.3048810946956163, + "learning_rate": 0.00029280778017199674, + "loss": 3.1051111221313477, + "step": 4914, + "token_acc": 0.2843144238107352 + }, + { + "epoch": 2.8812664907651717, + "grad_norm": 0.2503148079871603, + "learning_rate": 0.0002928033317547513, + "loss": 3.0708112716674805, + "step": 4915, + "token_acc": 0.2932530108032425 + }, + { + "epoch": 2.8818528290823804, + "grad_norm": 0.3278497290402753, + "learning_rate": 0.00029279888199606097, + "loss": 3.116889238357544, + "step": 4916, + "token_acc": 0.2858832006207623 + }, + { + "epoch": 2.8824391673995895, + "grad_norm": 0.3178528102144469, + "learning_rate": 0.0002927944308959676, + "loss": 3.134352207183838, + "step": 4917, + "token_acc": 0.2837621180631753 + }, + { + "epoch": 2.8830255057167986, + "grad_norm": 0.26289329138763, + "learning_rate": 0.0002927899784545129, + "loss": 3.1432456970214844, + "step": 4918, + "token_acc": 0.28338165052924186 + }, + { + "epoch": 2.8836118440340077, + "grad_norm": 0.2735276987729094, + "learning_rate": 0.00029278552467173883, + "loss": 3.1087403297424316, + "step": 4919, + "token_acc": 0.28726494740485153 + }, + { + "epoch": 2.8841981823512164, + "grad_norm": 0.2562728647980269, + "learning_rate": 0.00029278106954768715, + "loss": 3.088632583618164, + "step": 4920, + "token_acc": 0.28892174418946004 + }, + { + "epoch": 2.8847845206684255, + "grad_norm": 0.2840800829442021, + "learning_rate": 0.00029277661308239975, + "loss": 3.1328256130218506, + "step": 4921, + "token_acc": 0.2862577270565858 + }, + { + "epoch": 2.8853708589856346, + "grad_norm": 0.24464763726085684, + "learning_rate": 0.00029277215527591843, + "loss": 3.0734317302703857, + "step": 4922, + "token_acc": 0.2924629215942844 + }, + { + "epoch": 2.8859571973028437, + "grad_norm": 0.2597953911110076, + "learning_rate": 0.0002927676961282851, + "loss": 3.1095104217529297, + "step": 4923, + "token_acc": 0.2863353310608518 + }, + { + "epoch": 2.886543535620053, + "grad_norm": 0.2426772338052482, + "learning_rate": 0.0002927632356395416, + "loss": 3.0974559783935547, + "step": 4924, + "token_acc": 0.28842860201241705 + }, + { + "epoch": 2.887129873937262, + "grad_norm": 0.26898672966739984, + "learning_rate": 0.00029275877380972995, + "loss": 3.0966944694519043, + "step": 4925, + "token_acc": 0.28738041862090247 + }, + { + "epoch": 2.887716212254471, + "grad_norm": 0.28125507556368123, + "learning_rate": 0.00029275431063889194, + "loss": 3.1200003623962402, + "step": 4926, + "token_acc": 0.2860851974365373 + }, + { + "epoch": 2.8883025505716797, + "grad_norm": 0.26108841549529993, + "learning_rate": 0.0002927498461270696, + "loss": 3.1195297241210938, + "step": 4927, + "token_acc": 0.28657675552504575 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.26447303883420087, + "learning_rate": 0.0002927453802743048, + "loss": 3.100036382675171, + "step": 4928, + "token_acc": 0.28860276486705777 + }, + { + "epoch": 2.889475227206098, + "grad_norm": 0.30022872527426225, + "learning_rate": 0.00029274091308063946, + "loss": 3.0976741313934326, + "step": 4929, + "token_acc": 0.2867228712944708 + }, + { + "epoch": 2.890061565523307, + "grad_norm": 0.38994051818640874, + "learning_rate": 0.0002927364445461156, + "loss": 3.0919029712677, + "step": 4930, + "token_acc": 0.2888243114217998 + }, + { + "epoch": 2.8906479038405157, + "grad_norm": 0.33524114510752634, + "learning_rate": 0.0002927319746707752, + "loss": 3.1277573108673096, + "step": 4931, + "token_acc": 0.28400379177292084 + }, + { + "epoch": 2.891234242157725, + "grad_norm": 0.33577874867564966, + "learning_rate": 0.00029272750345466024, + "loss": 3.1251015663146973, + "step": 4932, + "token_acc": 0.283281615820988 + }, + { + "epoch": 2.891820580474934, + "grad_norm": 0.317200780972459, + "learning_rate": 0.0002927230308978127, + "loss": 3.088057518005371, + "step": 4933, + "token_acc": 0.29048793931599254 + }, + { + "epoch": 2.892406918792143, + "grad_norm": 0.26743418629055843, + "learning_rate": 0.0002927185570002746, + "loss": 3.1403372287750244, + "step": 4934, + "token_acc": 0.2835100154402545 + }, + { + "epoch": 2.892993257109352, + "grad_norm": 0.30650204487135085, + "learning_rate": 0.000292714081762088, + "loss": 3.112123966217041, + "step": 4935, + "token_acc": 0.28606729166331657 + }, + { + "epoch": 2.8935795954265613, + "grad_norm": 0.2642426748034353, + "learning_rate": 0.0002927096051832949, + "loss": 3.1048831939697266, + "step": 4936, + "token_acc": 0.28852212297933566 + }, + { + "epoch": 2.8941659337437704, + "grad_norm": 0.2721247602872461, + "learning_rate": 0.00029270512726393733, + "loss": 3.145318031311035, + "step": 4937, + "token_acc": 0.2815633919241387 + }, + { + "epoch": 2.894752272060979, + "grad_norm": 0.32184859082025835, + "learning_rate": 0.00029270064800405744, + "loss": 3.104903221130371, + "step": 4938, + "token_acc": 0.2861115642453761 + }, + { + "epoch": 2.895338610378188, + "grad_norm": 0.2910367872232558, + "learning_rate": 0.00029269616740369725, + "loss": 3.131070613861084, + "step": 4939, + "token_acc": 0.2843429870593367 + }, + { + "epoch": 2.8959249486953973, + "grad_norm": 0.2727329077051126, + "learning_rate": 0.00029269168546289877, + "loss": 3.078613758087158, + "step": 4940, + "token_acc": 0.29349566703911767 + }, + { + "epoch": 2.8965112870126064, + "grad_norm": 0.27612225416491676, + "learning_rate": 0.0002926872021817043, + "loss": 3.0929245948791504, + "step": 4941, + "token_acc": 0.28985038028565213 + }, + { + "epoch": 2.897097625329815, + "grad_norm": 0.26991637512739985, + "learning_rate": 0.00029268271756015577, + "loss": 3.121178388595581, + "step": 4942, + "token_acc": 0.285773841091929 + }, + { + "epoch": 2.897683963647024, + "grad_norm": 0.2866764291857337, + "learning_rate": 0.0002926782315982954, + "loss": 3.06008243560791, + "step": 4943, + "token_acc": 0.29359372775693177 + }, + { + "epoch": 2.8982703019642333, + "grad_norm": 0.25828128337205913, + "learning_rate": 0.00029267374429616525, + "loss": 3.1210179328918457, + "step": 4944, + "token_acc": 0.2872359198982266 + }, + { + "epoch": 2.8988566402814424, + "grad_norm": 0.2711878668036536, + "learning_rate": 0.0002926692556538076, + "loss": 3.124605655670166, + "step": 4945, + "token_acc": 0.2849804202282201 + }, + { + "epoch": 2.8994429785986515, + "grad_norm": 0.2801277060597001, + "learning_rate": 0.0002926647656712645, + "loss": 3.1145777702331543, + "step": 4946, + "token_acc": 0.2870258988515674 + }, + { + "epoch": 2.9000293169158606, + "grad_norm": 0.27688706534969365, + "learning_rate": 0.0002926602743485782, + "loss": 3.107100486755371, + "step": 4947, + "token_acc": 0.28582339104677096 + }, + { + "epoch": 2.9006156552330697, + "grad_norm": 0.3092188678568817, + "learning_rate": 0.00029265578168579087, + "loss": 3.0880823135375977, + "step": 4948, + "token_acc": 0.29001466968348294 + }, + { + "epoch": 2.9012019935502784, + "grad_norm": 0.30680723722334424, + "learning_rate": 0.00029265128768294463, + "loss": 3.162522315979004, + "step": 4949, + "token_acc": 0.27888485168791705 + }, + { + "epoch": 2.9017883318674875, + "grad_norm": 0.2786024667163245, + "learning_rate": 0.0002926467923400818, + "loss": 3.164005994796753, + "step": 4950, + "token_acc": 0.2801493339770458 + }, + { + "epoch": 2.9023746701846966, + "grad_norm": 0.29592536937037517, + "learning_rate": 0.00029264229565724464, + "loss": 3.1397287845611572, + "step": 4951, + "token_acc": 0.28402484680337126 + }, + { + "epoch": 2.9029610085019057, + "grad_norm": 0.2894062453717159, + "learning_rate": 0.00029263779763447523, + "loss": 3.1011850833892822, + "step": 4952, + "token_acc": 0.2874964825063315 + }, + { + "epoch": 2.9035473468191144, + "grad_norm": 0.2785809159898299, + "learning_rate": 0.000292633298271816, + "loss": 3.144193649291992, + "step": 4953, + "token_acc": 0.28225386417333237 + }, + { + "epoch": 2.9041336851363235, + "grad_norm": 0.29062543122106116, + "learning_rate": 0.00029262879756930906, + "loss": 3.0774881839752197, + "step": 4954, + "token_acc": 0.2922983673970577 + }, + { + "epoch": 2.9047200234535326, + "grad_norm": 0.33999937328734076, + "learning_rate": 0.0002926242955269968, + "loss": 3.132823944091797, + "step": 4955, + "token_acc": 0.2824032670784978 + }, + { + "epoch": 2.9053063617707418, + "grad_norm": 0.3348172070761862, + "learning_rate": 0.0002926197921449215, + "loss": 3.140523672103882, + "step": 4956, + "token_acc": 0.2835831732128086 + }, + { + "epoch": 2.905892700087951, + "grad_norm": 0.37377312262468865, + "learning_rate": 0.00029261528742312537, + "loss": 3.127758502960205, + "step": 4957, + "token_acc": 0.2835527008529009 + }, + { + "epoch": 2.90647903840516, + "grad_norm": 0.3119879299794423, + "learning_rate": 0.00029261078136165084, + "loss": 3.130155086517334, + "step": 4958, + "token_acc": 0.2832016281044804 + }, + { + "epoch": 2.9070653767223686, + "grad_norm": 0.27538770872274165, + "learning_rate": 0.0002926062739605401, + "loss": 3.1095874309539795, + "step": 4959, + "token_acc": 0.286663998282338 + }, + { + "epoch": 2.9076517150395778, + "grad_norm": 0.328417701159959, + "learning_rate": 0.0002926017652198357, + "loss": 3.1445393562316895, + "step": 4960, + "token_acc": 0.28257410691664553 + }, + { + "epoch": 2.908238053356787, + "grad_norm": 0.27324969970569907, + "learning_rate": 0.00029259725513957984, + "loss": 3.1246590614318848, + "step": 4961, + "token_acc": 0.2841856860561025 + }, + { + "epoch": 2.908824391673996, + "grad_norm": 0.3133821787387028, + "learning_rate": 0.00029259274371981495, + "loss": 3.1496641635894775, + "step": 4962, + "token_acc": 0.2809899605283491 + }, + { + "epoch": 2.909410729991205, + "grad_norm": 0.33152757421254747, + "learning_rate": 0.0002925882309605833, + "loss": 3.0999300479888916, + "step": 4963, + "token_acc": 0.2875515031707073 + }, + { + "epoch": 2.9099970683084138, + "grad_norm": 0.29249392061176943, + "learning_rate": 0.0002925837168619274, + "loss": 3.1073200702667236, + "step": 4964, + "token_acc": 0.2869334363561618 + }, + { + "epoch": 2.910583406625623, + "grad_norm": 0.34548501849537905, + "learning_rate": 0.0002925792014238896, + "loss": 3.110381603240967, + "step": 4965, + "token_acc": 0.2880349123831474 + }, + { + "epoch": 2.911169744942832, + "grad_norm": 0.2650746599302164, + "learning_rate": 0.00029257468464651237, + "loss": 3.109809398651123, + "step": 4966, + "token_acc": 0.28615309809006445 + }, + { + "epoch": 2.911756083260041, + "grad_norm": 0.29624180585907084, + "learning_rate": 0.00029257016652983807, + "loss": 3.0853514671325684, + "step": 4967, + "token_acc": 0.2868831126170857 + }, + { + "epoch": 2.91234242157725, + "grad_norm": 0.3126753364793521, + "learning_rate": 0.00029256564707390916, + "loss": 3.1212120056152344, + "step": 4968, + "token_acc": 0.2846807918198553 + }, + { + "epoch": 2.9129287598944593, + "grad_norm": 0.3143575070828216, + "learning_rate": 0.0002925611262787681, + "loss": 3.1368916034698486, + "step": 4969, + "token_acc": 0.28324516866477045 + }, + { + "epoch": 2.913515098211668, + "grad_norm": 0.29151622212375655, + "learning_rate": 0.0002925566041444574, + "loss": 3.078115463256836, + "step": 4970, + "token_acc": 0.29077776789870347 + }, + { + "epoch": 2.914101436528877, + "grad_norm": 0.3090904460134409, + "learning_rate": 0.00029255208067101947, + "loss": 3.0871689319610596, + "step": 4971, + "token_acc": 0.2905621194720199 + }, + { + "epoch": 2.914687774846086, + "grad_norm": 0.2683446758484812, + "learning_rate": 0.00029254755585849686, + "loss": 3.050039768218994, + "step": 4972, + "token_acc": 0.2954593043545698 + }, + { + "epoch": 2.9152741131632953, + "grad_norm": 0.3099663899704349, + "learning_rate": 0.00029254302970693204, + "loss": 3.0935826301574707, + "step": 4973, + "token_acc": 0.2887233305936049 + }, + { + "epoch": 2.915860451480504, + "grad_norm": 0.34269777317285155, + "learning_rate": 0.00029253850221636757, + "loss": 3.1380667686462402, + "step": 4974, + "token_acc": 0.28417702935411426 + }, + { + "epoch": 2.916446789797713, + "grad_norm": 0.3022665232041975, + "learning_rate": 0.0002925339733868459, + "loss": 3.1303019523620605, + "step": 4975, + "token_acc": 0.2833759126454073 + }, + { + "epoch": 2.917033128114922, + "grad_norm": 0.31008959576603373, + "learning_rate": 0.00029252944321840954, + "loss": 3.124453544616699, + "step": 4976, + "token_acc": 0.2833341752967921 + }, + { + "epoch": 2.9176194664321313, + "grad_norm": 0.31857098831353664, + "learning_rate": 0.00029252491171110126, + "loss": 3.108292579650879, + "step": 4977, + "token_acc": 0.28709394671837235 + }, + { + "epoch": 2.9182058047493404, + "grad_norm": 0.28319523898828003, + "learning_rate": 0.0002925203788649634, + "loss": 3.046318531036377, + "step": 4978, + "token_acc": 0.2957385453380327 + }, + { + "epoch": 2.9187921430665495, + "grad_norm": 0.2951248402759101, + "learning_rate": 0.00029251584468003867, + "loss": 3.0906453132629395, + "step": 4979, + "token_acc": 0.28703728494091063 + }, + { + "epoch": 2.9193784813837587, + "grad_norm": 0.2798656693356633, + "learning_rate": 0.00029251130915636963, + "loss": 3.0820045471191406, + "step": 4980, + "token_acc": 0.2912473430243686 + }, + { + "epoch": 2.9199648197009673, + "grad_norm": 0.33480404943122455, + "learning_rate": 0.0002925067722939989, + "loss": 3.1203885078430176, + "step": 4981, + "token_acc": 0.2862848715742903 + }, + { + "epoch": 2.9205511580181764, + "grad_norm": 0.2635372628344961, + "learning_rate": 0.000292502234092969, + "loss": 3.09853458404541, + "step": 4982, + "token_acc": 0.28715111557531203 + }, + { + "epoch": 2.9211374963353856, + "grad_norm": 0.29815940073322733, + "learning_rate": 0.00029249769455332264, + "loss": 3.0711936950683594, + "step": 4983, + "token_acc": 0.29259683787275076 + }, + { + "epoch": 2.9217238346525947, + "grad_norm": 0.33360922274189964, + "learning_rate": 0.0002924931536751025, + "loss": 3.112743854522705, + "step": 4984, + "token_acc": 0.2861688634947954 + }, + { + "epoch": 2.9223101729698033, + "grad_norm": 0.29455893712412884, + "learning_rate": 0.00029248861145835116, + "loss": 3.1136324405670166, + "step": 4985, + "token_acc": 0.2875326592909465 + }, + { + "epoch": 2.9228965112870124, + "grad_norm": 0.26459221464377275, + "learning_rate": 0.0002924840679031114, + "loss": 3.1287808418273926, + "step": 4986, + "token_acc": 0.285265152413555 + }, + { + "epoch": 2.9234828496042216, + "grad_norm": 0.3051166536420172, + "learning_rate": 0.0002924795230094257, + "loss": 3.1437859535217285, + "step": 4987, + "token_acc": 0.28179214742259484 + }, + { + "epoch": 2.9240691879214307, + "grad_norm": 0.32042602870288234, + "learning_rate": 0.000292474976777337, + "loss": 3.1082262992858887, + "step": 4988, + "token_acc": 0.28618188511536213 + }, + { + "epoch": 2.92465552623864, + "grad_norm": 0.3065393347162595, + "learning_rate": 0.0002924704292068878, + "loss": 3.1299118995666504, + "step": 4989, + "token_acc": 0.284432351068141 + }, + { + "epoch": 2.925241864555849, + "grad_norm": 0.33469868278724907, + "learning_rate": 0.0002924658802981209, + "loss": 3.1187663078308105, + "step": 4990, + "token_acc": 0.2827876358180947 + }, + { + "epoch": 2.925828202873058, + "grad_norm": 0.31055680710044126, + "learning_rate": 0.00029246133005107907, + "loss": 3.102005958557129, + "step": 4991, + "token_acc": 0.2883572534195369 + }, + { + "epoch": 2.9264145411902667, + "grad_norm": 0.2837105422550072, + "learning_rate": 0.00029245677846580497, + "loss": 3.1385719776153564, + "step": 4992, + "token_acc": 0.2826013687602887 + }, + { + "epoch": 2.927000879507476, + "grad_norm": 0.32902865942065007, + "learning_rate": 0.00029245222554234143, + "loss": 3.0618696212768555, + "step": 4993, + "token_acc": 0.29262519535685183 + }, + { + "epoch": 2.927587217824685, + "grad_norm": 0.30300505921728804, + "learning_rate": 0.00029244767128073113, + "loss": 3.0951180458068848, + "step": 4994, + "token_acc": 0.2880493842867215 + }, + { + "epoch": 2.928173556141894, + "grad_norm": 0.28681627713445695, + "learning_rate": 0.000292443115681017, + "loss": 3.1173095703125, + "step": 4995, + "token_acc": 0.28474962197803305 + }, + { + "epoch": 2.9287598944591027, + "grad_norm": 0.2910267767491318, + "learning_rate": 0.0002924385587432417, + "loss": 3.106464385986328, + "step": 4996, + "token_acc": 0.28874393660164205 + }, + { + "epoch": 2.929346232776312, + "grad_norm": 0.27645558923889724, + "learning_rate": 0.0002924340004674481, + "loss": 3.0502796173095703, + "step": 4997, + "token_acc": 0.2948054248266788 + }, + { + "epoch": 2.929932571093521, + "grad_norm": 0.31975705283015143, + "learning_rate": 0.00029242944085367895, + "loss": 3.101111888885498, + "step": 4998, + "token_acc": 0.2882396662324512 + }, + { + "epoch": 2.93051890941073, + "grad_norm": 0.28387466329300026, + "learning_rate": 0.00029242487990197713, + "loss": 3.1161346435546875, + "step": 4999, + "token_acc": 0.2851972000195515 + }, + { + "epoch": 2.931105247727939, + "grad_norm": 0.26470443898442736, + "learning_rate": 0.00029242031761238555, + "loss": 3.092757225036621, + "step": 5000, + "token_acc": 0.28801156081233703 + }, + { + "epoch": 2.9316915860451482, + "grad_norm": 0.2935624392966686, + "learning_rate": 0.00029241575398494693, + "loss": 3.112367630004883, + "step": 5001, + "token_acc": 0.286848041894107 + }, + { + "epoch": 2.9322779243623573, + "grad_norm": 0.2717630801157289, + "learning_rate": 0.00029241118901970426, + "loss": 3.11051082611084, + "step": 5002, + "token_acc": 0.2873603647442136 + }, + { + "epoch": 2.932864262679566, + "grad_norm": 0.2965877518109553, + "learning_rate": 0.0002924066227167003, + "loss": 3.133275270462036, + "step": 5003, + "token_acc": 0.2820839928496618 + }, + { + "epoch": 2.933450600996775, + "grad_norm": 0.2945892762268756, + "learning_rate": 0.00029240205507597805, + "loss": 3.107689380645752, + "step": 5004, + "token_acc": 0.28653478982127767 + }, + { + "epoch": 2.9340369393139842, + "grad_norm": 0.270451132271406, + "learning_rate": 0.00029239748609758044, + "loss": 3.1071999073028564, + "step": 5005, + "token_acc": 0.28731238449225566 + }, + { + "epoch": 2.9346232776311933, + "grad_norm": 0.25780841343004535, + "learning_rate": 0.0002923929157815503, + "loss": 3.0937864780426025, + "step": 5006, + "token_acc": 0.28983774690247777 + }, + { + "epoch": 2.935209615948402, + "grad_norm": 0.27703455323971504, + "learning_rate": 0.00029238834412793056, + "loss": 3.156954288482666, + "step": 5007, + "token_acc": 0.2806017820251207 + }, + { + "epoch": 2.935795954265611, + "grad_norm": 0.2798221079752183, + "learning_rate": 0.0002923837711367642, + "loss": 3.123319625854492, + "step": 5008, + "token_acc": 0.2844411184434315 + }, + { + "epoch": 2.9363822925828202, + "grad_norm": 0.2879098214791004, + "learning_rate": 0.0002923791968080942, + "loss": 3.1486434936523438, + "step": 5009, + "token_acc": 0.28070470650138757 + }, + { + "epoch": 2.9369686309000294, + "grad_norm": 0.2808736609517171, + "learning_rate": 0.0002923746211419635, + "loss": 3.109757661819458, + "step": 5010, + "token_acc": 0.28527400533038855 + }, + { + "epoch": 2.9375549692172385, + "grad_norm": 0.2533734691125129, + "learning_rate": 0.00029237004413841506, + "loss": 3.1228911876678467, + "step": 5011, + "token_acc": 0.2854579006597439 + }, + { + "epoch": 2.9381413075344476, + "grad_norm": 0.2969630460629132, + "learning_rate": 0.00029236546579749194, + "loss": 3.106259346008301, + "step": 5012, + "token_acc": 0.28632474191989776 + }, + { + "epoch": 2.9387276458516562, + "grad_norm": 0.2842042215648192, + "learning_rate": 0.0002923608861192371, + "loss": 3.112988233566284, + "step": 5013, + "token_acc": 0.28704147436632044 + }, + { + "epoch": 2.9393139841688654, + "grad_norm": 0.299648245266583, + "learning_rate": 0.0002923563051036936, + "loss": 3.117884874343872, + "step": 5014, + "token_acc": 0.2857277759770505 + }, + { + "epoch": 2.9399003224860745, + "grad_norm": 0.2954187035281139, + "learning_rate": 0.00029235172275090437, + "loss": 3.1189539432525635, + "step": 5015, + "token_acc": 0.28490329394679514 + }, + { + "epoch": 2.9404866608032836, + "grad_norm": 0.284827769950012, + "learning_rate": 0.0002923471390609125, + "loss": 3.149946928024292, + "step": 5016, + "token_acc": 0.27916704880111687 + }, + { + "epoch": 2.9410729991204922, + "grad_norm": 0.28029557699564717, + "learning_rate": 0.00029234255403376116, + "loss": 3.0718367099761963, + "step": 5017, + "token_acc": 0.2931087110247397 + }, + { + "epoch": 2.9416593374377014, + "grad_norm": 0.2898818814431397, + "learning_rate": 0.0002923379676694933, + "loss": 3.095771074295044, + "step": 5018, + "token_acc": 0.28719309602021986 + }, + { + "epoch": 2.9422456757549105, + "grad_norm": 0.30172115280909606, + "learning_rate": 0.00029233337996815203, + "loss": 3.1002144813537598, + "step": 5019, + "token_acc": 0.28762030680907 + }, + { + "epoch": 2.9428320140721196, + "grad_norm": 0.26771476175976017, + "learning_rate": 0.0002923287909297805, + "loss": 3.1250195503234863, + "step": 5020, + "token_acc": 0.28476341286668994 + }, + { + "epoch": 2.9434183523893287, + "grad_norm": 0.27568986992191624, + "learning_rate": 0.0002923242005544217, + "loss": 3.116955280303955, + "step": 5021, + "token_acc": 0.2868338807498634 + }, + { + "epoch": 2.944004690706538, + "grad_norm": 0.283915465374956, + "learning_rate": 0.00029231960884211884, + "loss": 3.095149040222168, + "step": 5022, + "token_acc": 0.2887512456798762 + }, + { + "epoch": 2.944591029023747, + "grad_norm": 0.3039138594946964, + "learning_rate": 0.00029231501579291507, + "loss": 3.133984088897705, + "step": 5023, + "token_acc": 0.28458802485602325 + }, + { + "epoch": 2.9451773673409556, + "grad_norm": 0.3267981077909956, + "learning_rate": 0.0002923104214068535, + "loss": 3.113675117492676, + "step": 5024, + "token_acc": 0.2880666222518321 + }, + { + "epoch": 2.9457637056581647, + "grad_norm": 0.26727105241606786, + "learning_rate": 0.00029230582568397727, + "loss": 3.1102640628814697, + "step": 5025, + "token_acc": 0.2853624773427761 + }, + { + "epoch": 2.946350043975374, + "grad_norm": 0.30394459750023173, + "learning_rate": 0.00029230122862432956, + "loss": 3.132255792617798, + "step": 5026, + "token_acc": 0.28444027348089 + }, + { + "epoch": 2.946936382292583, + "grad_norm": 0.31726410101999647, + "learning_rate": 0.00029229663022795353, + "loss": 3.1948556900024414, + "step": 5027, + "token_acc": 0.27442031538158684 + }, + { + "epoch": 2.9475227206097916, + "grad_norm": 0.2914379381217216, + "learning_rate": 0.00029229203049489246, + "loss": 3.1268067359924316, + "step": 5028, + "token_acc": 0.28450594536607005 + }, + { + "epoch": 2.9481090589270007, + "grad_norm": 0.26689470541833427, + "learning_rate": 0.00029228742942518943, + "loss": 3.0843310356140137, + "step": 5029, + "token_acc": 0.2893726158338455 + }, + { + "epoch": 2.94869539724421, + "grad_norm": 0.3141608036130159, + "learning_rate": 0.0002922828270188878, + "loss": 3.1054859161376953, + "step": 5030, + "token_acc": 0.28667206195055384 + }, + { + "epoch": 2.949281735561419, + "grad_norm": 0.2909470277842254, + "learning_rate": 0.00029227822327603073, + "loss": 3.0878162384033203, + "step": 5031, + "token_acc": 0.2888544265390061 + }, + { + "epoch": 2.949868073878628, + "grad_norm": 0.2754065362476325, + "learning_rate": 0.00029227361819666146, + "loss": 3.1084535121917725, + "step": 5032, + "token_acc": 0.2874672435696403 + }, + { + "epoch": 2.950454412195837, + "grad_norm": 0.33081130899280703, + "learning_rate": 0.0002922690117808233, + "loss": 3.1709115505218506, + "step": 5033, + "token_acc": 0.2792268895848166 + }, + { + "epoch": 2.9510407505130463, + "grad_norm": 0.3024505638941856, + "learning_rate": 0.00029226440402855945, + "loss": 3.084867000579834, + "step": 5034, + "token_acc": 0.2897134553477198 + }, + { + "epoch": 2.951627088830255, + "grad_norm": 0.3196573221832568, + "learning_rate": 0.0002922597949399132, + "loss": 3.124715805053711, + "step": 5035, + "token_acc": 0.28550095539867154 + }, + { + "epoch": 2.952213427147464, + "grad_norm": 0.3125207643151057, + "learning_rate": 0.0002922551845149279, + "loss": 3.11138916015625, + "step": 5036, + "token_acc": 0.28565198767604727 + }, + { + "epoch": 2.952799765464673, + "grad_norm": 0.2841398455259827, + "learning_rate": 0.0002922505727536469, + "loss": 3.1248779296875, + "step": 5037, + "token_acc": 0.2834396002982374 + }, + { + "epoch": 2.9533861037818823, + "grad_norm": 0.27615971932154265, + "learning_rate": 0.00029224595965611337, + "loss": 3.082575798034668, + "step": 5038, + "token_acc": 0.29088427728495164 + }, + { + "epoch": 2.953972442099091, + "grad_norm": 0.2727563166715515, + "learning_rate": 0.0002922413452223707, + "loss": 3.1098690032958984, + "step": 5039, + "token_acc": 0.28775291805886616 + }, + { + "epoch": 2.9545587804163, + "grad_norm": 0.30534677443510827, + "learning_rate": 0.0002922367294524624, + "loss": 3.1228795051574707, + "step": 5040, + "token_acc": 0.28585293000359063 + }, + { + "epoch": 2.955145118733509, + "grad_norm": 0.3164136291629472, + "learning_rate": 0.00029223211234643155, + "loss": 3.166372060775757, + "step": 5041, + "token_acc": 0.2790833258588214 + }, + { + "epoch": 2.9557314570507183, + "grad_norm": 0.2424803757181179, + "learning_rate": 0.00029222749390432173, + "loss": 3.1350111961364746, + "step": 5042, + "token_acc": 0.28367948644653346 + }, + { + "epoch": 2.9563177953679274, + "grad_norm": 0.29425809889211346, + "learning_rate": 0.00029222287412617625, + "loss": 3.1102657318115234, + "step": 5043, + "token_acc": 0.286075106808941 + }, + { + "epoch": 2.9569041336851365, + "grad_norm": 0.2686714713181295, + "learning_rate": 0.0002922182530120385, + "loss": 3.094804048538208, + "step": 5044, + "token_acc": 0.28902560683057194 + }, + { + "epoch": 2.9574904720023456, + "grad_norm": 0.2552977709804675, + "learning_rate": 0.0002922136305619519, + "loss": 3.1175918579101562, + "step": 5045, + "token_acc": 0.2844833977651856 + }, + { + "epoch": 2.9580768103195543, + "grad_norm": 0.26328315869907826, + "learning_rate": 0.00029220900677595993, + "loss": 3.119746208190918, + "step": 5046, + "token_acc": 0.2870929242158172 + }, + { + "epoch": 2.9586631486367634, + "grad_norm": 0.2660971791319296, + "learning_rate": 0.00029220438165410595, + "loss": 3.0971240997314453, + "step": 5047, + "token_acc": 0.28673313307410203 + }, + { + "epoch": 2.9592494869539725, + "grad_norm": 0.27713531753903836, + "learning_rate": 0.0002921997551964334, + "loss": 3.147752285003662, + "step": 5048, + "token_acc": 0.2824378183390302 + }, + { + "epoch": 2.9598358252711816, + "grad_norm": 0.2885836531869793, + "learning_rate": 0.0002921951274029858, + "loss": 3.13508939743042, + "step": 5049, + "token_acc": 0.28274861131268886 + }, + { + "epoch": 2.9604221635883903, + "grad_norm": 0.2486620316568557, + "learning_rate": 0.0002921904982738066, + "loss": 3.0889055728912354, + "step": 5050, + "token_acc": 0.2899389512409693 + }, + { + "epoch": 2.9610085019055994, + "grad_norm": 0.25213031562449134, + "learning_rate": 0.0002921858678089392, + "loss": 3.1143767833709717, + "step": 5051, + "token_acc": 0.28602060744831304 + }, + { + "epoch": 2.9615948402228085, + "grad_norm": 0.2823484359610412, + "learning_rate": 0.00029218123600842724, + "loss": 3.1275830268859863, + "step": 5052, + "token_acc": 0.2854069470991948 + }, + { + "epoch": 2.9621811785400176, + "grad_norm": 0.2681195545528729, + "learning_rate": 0.00029217660287231414, + "loss": 3.061953544616699, + "step": 5053, + "token_acc": 0.2934716827206343 + }, + { + "epoch": 2.9627675168572267, + "grad_norm": 0.2738560149521086, + "learning_rate": 0.0002921719684006434, + "loss": 3.1011276245117188, + "step": 5054, + "token_acc": 0.2873449538416601 + }, + { + "epoch": 2.963353855174436, + "grad_norm": 0.28788269346238926, + "learning_rate": 0.00029216733259345866, + "loss": 3.0788917541503906, + "step": 5055, + "token_acc": 0.2921459164865474 + }, + { + "epoch": 2.963940193491645, + "grad_norm": 0.2684540787524686, + "learning_rate": 0.00029216269545080334, + "loss": 3.1130213737487793, + "step": 5056, + "token_acc": 0.28596392017810657 + }, + { + "epoch": 2.9645265318088536, + "grad_norm": 0.29280314792644013, + "learning_rate": 0.0002921580569727211, + "loss": 3.08811354637146, + "step": 5057, + "token_acc": 0.2899706701977346 + }, + { + "epoch": 2.9651128701260627, + "grad_norm": 0.2736859729891114, + "learning_rate": 0.00029215341715925546, + "loss": 3.064713478088379, + "step": 5058, + "token_acc": 0.2930078083595379 + }, + { + "epoch": 2.965699208443272, + "grad_norm": 0.3119371502553631, + "learning_rate": 0.00029214877601045007, + "loss": 3.075654983520508, + "step": 5059, + "token_acc": 0.29036854419788266 + }, + { + "epoch": 2.966285546760481, + "grad_norm": 0.30838257067779073, + "learning_rate": 0.00029214413352634844, + "loss": 3.1679680347442627, + "step": 5060, + "token_acc": 0.27908365695281995 + }, + { + "epoch": 2.9668718850776896, + "grad_norm": 0.28086665075064576, + "learning_rate": 0.0002921394897069942, + "loss": 3.1128907203674316, + "step": 5061, + "token_acc": 0.2856346497299925 + }, + { + "epoch": 2.9674582233948987, + "grad_norm": 0.33922836876725826, + "learning_rate": 0.00029213484455243097, + "loss": 3.125619888305664, + "step": 5062, + "token_acc": 0.2834312597959061 + }, + { + "epoch": 2.968044561712108, + "grad_norm": 0.29732833116121965, + "learning_rate": 0.0002921301980627025, + "loss": 3.076028347015381, + "step": 5063, + "token_acc": 0.2908988888402705 + }, + { + "epoch": 2.968630900029317, + "grad_norm": 0.28312112236743636, + "learning_rate": 0.00029212555023785226, + "loss": 3.0619237422943115, + "step": 5064, + "token_acc": 0.2926577472663208 + }, + { + "epoch": 2.969217238346526, + "grad_norm": 0.2896989396851186, + "learning_rate": 0.00029212090107792396, + "loss": 3.0955214500427246, + "step": 5065, + "token_acc": 0.2894506555220841 + }, + { + "epoch": 2.969803576663735, + "grad_norm": 0.25909225964845883, + "learning_rate": 0.0002921162505829614, + "loss": 3.1067535877227783, + "step": 5066, + "token_acc": 0.2882758070520662 + }, + { + "epoch": 2.970389914980944, + "grad_norm": 0.2901746444673933, + "learning_rate": 0.0002921115987530081, + "loss": 3.096381664276123, + "step": 5067, + "token_acc": 0.2878427485959696 + }, + { + "epoch": 2.970976253298153, + "grad_norm": 0.2826122913154138, + "learning_rate": 0.0002921069455881079, + "loss": 3.0496273040771484, + "step": 5068, + "token_acc": 0.29456771028496864 + }, + { + "epoch": 2.971562591615362, + "grad_norm": 0.30532828055029776, + "learning_rate": 0.00029210229108830437, + "loss": 3.0849170684814453, + "step": 5069, + "token_acc": 0.2910914693957188 + }, + { + "epoch": 2.972148929932571, + "grad_norm": 0.30650939828861495, + "learning_rate": 0.0002920976352536413, + "loss": 3.1432456970214844, + "step": 5070, + "token_acc": 0.28116453203025066 + }, + { + "epoch": 2.97273526824978, + "grad_norm": 0.28854117470965823, + "learning_rate": 0.00029209297808416247, + "loss": 3.1159439086914062, + "step": 5071, + "token_acc": 0.2846584576546575 + }, + { + "epoch": 2.973321606566989, + "grad_norm": 0.2880506681741183, + "learning_rate": 0.00029208831957991155, + "loss": 3.112246036529541, + "step": 5072, + "token_acc": 0.28675354218454874 + }, + { + "epoch": 2.973907944884198, + "grad_norm": 0.3210323389304767, + "learning_rate": 0.00029208365974093235, + "loss": 3.1248300075531006, + "step": 5073, + "token_acc": 0.28411261349532957 + }, + { + "epoch": 2.974494283201407, + "grad_norm": 0.2823815269355905, + "learning_rate": 0.0002920789985672686, + "loss": 3.0823240280151367, + "step": 5074, + "token_acc": 0.28983230023844686 + }, + { + "epoch": 2.9750806215186163, + "grad_norm": 0.3019603247716682, + "learning_rate": 0.0002920743360589642, + "loss": 3.134385585784912, + "step": 5075, + "token_acc": 0.28310445927303346 + }, + { + "epoch": 2.9756669598358254, + "grad_norm": 0.277972235832804, + "learning_rate": 0.0002920696722160628, + "loss": 3.099012613296509, + "step": 5076, + "token_acc": 0.2894199174319656 + }, + { + "epoch": 2.9762532981530345, + "grad_norm": 0.31332694927412935, + "learning_rate": 0.00029206500703860824, + "loss": 3.102360248565674, + "step": 5077, + "token_acc": 0.28587114342256875 + }, + { + "epoch": 2.976839636470243, + "grad_norm": 0.3085274597774751, + "learning_rate": 0.0002920603405266444, + "loss": 3.0702714920043945, + "step": 5078, + "token_acc": 0.2932083927694195 + }, + { + "epoch": 2.9774259747874523, + "grad_norm": 0.2934084141542044, + "learning_rate": 0.0002920556726802151, + "loss": 3.0628905296325684, + "step": 5079, + "token_acc": 0.2943366621361363 + }, + { + "epoch": 2.9780123131046614, + "grad_norm": 0.3034667084742859, + "learning_rate": 0.0002920510034993642, + "loss": 3.1041674613952637, + "step": 5080, + "token_acc": 0.28816404501400184 + }, + { + "epoch": 2.9785986514218705, + "grad_norm": 0.30504339649800805, + "learning_rate": 0.0002920463329841355, + "loss": 3.1233677864074707, + "step": 5081, + "token_acc": 0.283686965375702 + }, + { + "epoch": 2.979184989739079, + "grad_norm": 0.28168778645511033, + "learning_rate": 0.00029204166113457286, + "loss": 3.1091670989990234, + "step": 5082, + "token_acc": 0.2861745376247329 + }, + { + "epoch": 2.9797713280562883, + "grad_norm": 0.3086575599636707, + "learning_rate": 0.00029203698795072033, + "loss": 3.160965919494629, + "step": 5083, + "token_acc": 0.2802046260985791 + }, + { + "epoch": 2.9803576663734974, + "grad_norm": 0.29098702910844904, + "learning_rate": 0.0002920323134326216, + "loss": 3.107667922973633, + "step": 5084, + "token_acc": 0.2874566968713065 + }, + { + "epoch": 2.9809440046907065, + "grad_norm": 0.2878858555278777, + "learning_rate": 0.0002920276375803207, + "loss": 3.1095995903015137, + "step": 5085, + "token_acc": 0.2880353247858701 + }, + { + "epoch": 2.9815303430079156, + "grad_norm": 0.26777331084812295, + "learning_rate": 0.00029202296039386157, + "loss": 3.110292673110962, + "step": 5086, + "token_acc": 0.28547494406440355 + }, + { + "epoch": 2.9821166813251248, + "grad_norm": 0.2777794674932488, + "learning_rate": 0.00029201828187328807, + "loss": 3.1438937187194824, + "step": 5087, + "token_acc": 0.28034792786082885 + }, + { + "epoch": 2.982703019642334, + "grad_norm": 0.2610162108795578, + "learning_rate": 0.00029201360201864423, + "loss": 3.151064157485962, + "step": 5088, + "token_acc": 0.28043263727154305 + }, + { + "epoch": 2.9832893579595425, + "grad_norm": 0.3000237250262898, + "learning_rate": 0.0002920089208299739, + "loss": 3.118253469467163, + "step": 5089, + "token_acc": 0.28660861926415876 + }, + { + "epoch": 2.9838756962767516, + "grad_norm": 0.25633572554202444, + "learning_rate": 0.00029200423830732115, + "loss": 3.116499900817871, + "step": 5090, + "token_acc": 0.28607967143188967 + }, + { + "epoch": 2.9844620345939608, + "grad_norm": 0.3028180024406386, + "learning_rate": 0.00029199955445073, + "loss": 3.076850414276123, + "step": 5091, + "token_acc": 0.29044654049277685 + }, + { + "epoch": 2.98504837291117, + "grad_norm": 0.27873791987289087, + "learning_rate": 0.00029199486926024425, + "loss": 3.119007110595703, + "step": 5092, + "token_acc": 0.28424345311075483 + }, + { + "epoch": 2.9856347112283785, + "grad_norm": 0.30101612615759105, + "learning_rate": 0.0002919901827359081, + "loss": 3.091470956802368, + "step": 5093, + "token_acc": 0.28922672723300147 + }, + { + "epoch": 2.9862210495455876, + "grad_norm": 0.2969763761309888, + "learning_rate": 0.00029198549487776553, + "loss": 3.104546308517456, + "step": 5094, + "token_acc": 0.2867430364725637 + }, + { + "epoch": 2.9868073878627968, + "grad_norm": 0.3147234461507524, + "learning_rate": 0.0002919808056858606, + "loss": 3.0515449047088623, + "step": 5095, + "token_acc": 0.2945025510546417 + }, + { + "epoch": 2.987393726180006, + "grad_norm": 0.3041313650394808, + "learning_rate": 0.00029197611516023725, + "loss": 3.0847854614257812, + "step": 5096, + "token_acc": 0.28839045632812604 + }, + { + "epoch": 2.987980064497215, + "grad_norm": 0.33422917465186386, + "learning_rate": 0.0002919714233009397, + "loss": 3.138659954071045, + "step": 5097, + "token_acc": 0.2823014371090749 + }, + { + "epoch": 2.988566402814424, + "grad_norm": 0.30287992374803485, + "learning_rate": 0.00029196673010801187, + "loss": 3.091615676879883, + "step": 5098, + "token_acc": 0.28828833501466516 + }, + { + "epoch": 2.989152741131633, + "grad_norm": 0.2666360946851808, + "learning_rate": 0.00029196203558149787, + "loss": 3.109976053237915, + "step": 5099, + "token_acc": 0.2875342915272035 + }, + { + "epoch": 2.989739079448842, + "grad_norm": 0.31778316560697006, + "learning_rate": 0.000291957339721442, + "loss": 3.0775437355041504, + "step": 5100, + "token_acc": 0.28994236837021614 + }, + { + "epoch": 2.990325417766051, + "grad_norm": 0.31512748334759194, + "learning_rate": 0.00029195264252788804, + "loss": 3.1121773719787598, + "step": 5101, + "token_acc": 0.2870045693980587 + }, + { + "epoch": 2.99091175608326, + "grad_norm": 0.2814887995647331, + "learning_rate": 0.00029194794400088037, + "loss": 3.119619846343994, + "step": 5102, + "token_acc": 0.2846271129561171 + }, + { + "epoch": 2.991498094400469, + "grad_norm": 0.3123520536000268, + "learning_rate": 0.000291943244140463, + "loss": 3.1075708866119385, + "step": 5103, + "token_acc": 0.2888442975487829 + }, + { + "epoch": 2.992084432717678, + "grad_norm": 0.2926123203573768, + "learning_rate": 0.0002919385429466802, + "loss": 3.129755735397339, + "step": 5104, + "token_acc": 0.2822806339389254 + }, + { + "epoch": 2.992670771034887, + "grad_norm": 0.3460480775163618, + "learning_rate": 0.00029193384041957597, + "loss": 3.1306610107421875, + "step": 5105, + "token_acc": 0.2846267210132901 + }, + { + "epoch": 2.993257109352096, + "grad_norm": 0.25453685638867674, + "learning_rate": 0.00029192913655919463, + "loss": 3.141019105911255, + "step": 5106, + "token_acc": 0.2818223104708392 + }, + { + "epoch": 2.993843447669305, + "grad_norm": 0.2919845637776699, + "learning_rate": 0.0002919244313655803, + "loss": 3.102055072784424, + "step": 5107, + "token_acc": 0.2882983138300825 + }, + { + "epoch": 2.9944297859865143, + "grad_norm": 0.2736581917853656, + "learning_rate": 0.00029191972483877713, + "loss": 3.109133720397949, + "step": 5108, + "token_acc": 0.2873338356385555 + }, + { + "epoch": 2.9950161243037234, + "grad_norm": 0.29908852519117934, + "learning_rate": 0.00029191501697882943, + "loss": 3.110525608062744, + "step": 5109, + "token_acc": 0.28685337080487566 + }, + { + "epoch": 2.9956024626209325, + "grad_norm": 0.2631826908241675, + "learning_rate": 0.00029191030778578133, + "loss": 3.1174607276916504, + "step": 5110, + "token_acc": 0.28643832580522605 + }, + { + "epoch": 2.996188800938141, + "grad_norm": 0.3026834790418186, + "learning_rate": 0.00029190559725967717, + "loss": 3.1329574584960938, + "step": 5111, + "token_acc": 0.2830717802580965 + }, + { + "epoch": 2.9967751392553503, + "grad_norm": 0.28455824840030103, + "learning_rate": 0.00029190088540056113, + "loss": 3.110084056854248, + "step": 5112, + "token_acc": 0.2869272159975831 + }, + { + "epoch": 2.9973614775725594, + "grad_norm": 0.29720405646404147, + "learning_rate": 0.00029189617220847744, + "loss": 3.104649305343628, + "step": 5113, + "token_acc": 0.28822385817145874 + }, + { + "epoch": 2.9979478158897686, + "grad_norm": 0.31455821757449737, + "learning_rate": 0.00029189145768347046, + "loss": 3.071871519088745, + "step": 5114, + "token_acc": 0.2922248326150431 + }, + { + "epoch": 2.998534154206977, + "grad_norm": 0.267812225104246, + "learning_rate": 0.00029188674182558446, + "loss": 3.153928279876709, + "step": 5115, + "token_acc": 0.2805709009281518 + }, + { + "epoch": 2.9991204925241863, + "grad_norm": 0.3374159442242459, + "learning_rate": 0.0002918820246348637, + "loss": 3.042898654937744, + "step": 5116, + "token_acc": 0.2956119249724393 + }, + { + "epoch": 2.9997068308413954, + "grad_norm": 0.3105184009673184, + "learning_rate": 0.0002918773061113525, + "loss": 3.1127207279205322, + "step": 5117, + "token_acc": 0.28413267548240184 + }, + { + "epoch": 3.0, + "grad_norm": 0.35649660795366334, + "learning_rate": 0.00029187258625509513, + "loss": 3.1052021980285645, + "step": 5118, + "token_acc": 0.28912331011880377 + }, + { + "epoch": 3.0, + "eval_loss": 3.106259822845459, + "eval_runtime": 8.79, + "eval_samples_per_second": 29.124, + "eval_steps_per_second": 3.64, + "eval_token_acc": 0.28690646098030675, + "step": 5118 + }, + { + "epoch": 3.000586338317209, + "grad_norm": 0.3428987835507253, + "learning_rate": 0.0002918678650661361, + "loss": 3.0772767066955566, + "step": 5119, + "token_acc": 0.2890505350217869 + }, + { + "epoch": 3.0011726766344182, + "grad_norm": 0.38204917548378226, + "learning_rate": 0.0002918631425445196, + "loss": 3.009969711303711, + "step": 5120, + "token_acc": 0.29887828570279346 + }, + { + "epoch": 3.001759014951627, + "grad_norm": 0.325726653851143, + "learning_rate": 0.00029185841869029005, + "loss": 3.0065507888793945, + "step": 5121, + "token_acc": 0.30044527895572937 + }, + { + "epoch": 3.002345353268836, + "grad_norm": 0.32029728512926536, + "learning_rate": 0.00029185369350349173, + "loss": 3.053997278213501, + "step": 5122, + "token_acc": 0.29280360891745716 + }, + { + "epoch": 3.002931691586045, + "grad_norm": 0.3591779446246009, + "learning_rate": 0.0002918489669841692, + "loss": 3.019418954849243, + "step": 5123, + "token_acc": 0.2965564756197434 + }, + { + "epoch": 3.0035180299032542, + "grad_norm": 0.2876695986586287, + "learning_rate": 0.0002918442391323667, + "loss": 3.0437543392181396, + "step": 5124, + "token_acc": 0.2947273066368263 + }, + { + "epoch": 3.0041043682204633, + "grad_norm": 0.38647439184998883, + "learning_rate": 0.0002918395099481288, + "loss": 3.0338940620422363, + "step": 5125, + "token_acc": 0.29668929727025944 + }, + { + "epoch": 3.0046907065376725, + "grad_norm": 0.320748907096521, + "learning_rate": 0.0002918347794314998, + "loss": 3.040165901184082, + "step": 5126, + "token_acc": 0.2942833930652958 + }, + { + "epoch": 3.005277044854881, + "grad_norm": 0.3135189448896224, + "learning_rate": 0.00029183004758252416, + "loss": 3.0427966117858887, + "step": 5127, + "token_acc": 0.2937558193822025 + }, + { + "epoch": 3.0058633831720902, + "grad_norm": 0.35892905103197015, + "learning_rate": 0.00029182531440124636, + "loss": 3.0228395462036133, + "step": 5128, + "token_acc": 0.29631644082893877 + }, + { + "epoch": 3.0064497214892993, + "grad_norm": 0.2586295660541764, + "learning_rate": 0.0002918205798877108, + "loss": 3.024183750152588, + "step": 5129, + "token_acc": 0.29765954011026896 + }, + { + "epoch": 3.0070360598065085, + "grad_norm": 0.2976298905463469, + "learning_rate": 0.00029181584404196204, + "loss": 3.006627082824707, + "step": 5130, + "token_acc": 0.29926706247044604 + }, + { + "epoch": 3.0076223981237176, + "grad_norm": 0.27477264899715503, + "learning_rate": 0.00029181110686404447, + "loss": 3.019460678100586, + "step": 5131, + "token_acc": 0.2980581051158939 + }, + { + "epoch": 3.0082087364409262, + "grad_norm": 0.32687387686689073, + "learning_rate": 0.00029180636835400266, + "loss": 3.0451433658599854, + "step": 5132, + "token_acc": 0.2937275121682832 + }, + { + "epoch": 3.0087950747581353, + "grad_norm": 0.27219474441838104, + "learning_rate": 0.00029180162851188116, + "loss": 3.0167179107666016, + "step": 5133, + "token_acc": 0.2985784461989341 + }, + { + "epoch": 3.0093814130753445, + "grad_norm": 0.2992250666438191, + "learning_rate": 0.00029179688733772444, + "loss": 2.9692487716674805, + "step": 5134, + "token_acc": 0.3051114350734359 + }, + { + "epoch": 3.0099677513925536, + "grad_norm": 0.23907920148708325, + "learning_rate": 0.000291792144831577, + "loss": 3.0530848503112793, + "step": 5135, + "token_acc": 0.2912823978887682 + }, + { + "epoch": 3.0105540897097627, + "grad_norm": 0.2851943167981261, + "learning_rate": 0.00029178740099348343, + "loss": 2.986829996109009, + "step": 5136, + "token_acc": 0.30258522641490443 + }, + { + "epoch": 3.0111404280269713, + "grad_norm": 0.2609894454355711, + "learning_rate": 0.00029178265582348827, + "loss": 3.0341172218322754, + "step": 5137, + "token_acc": 0.2964714276966168 + }, + { + "epoch": 3.0117267663441805, + "grad_norm": 0.262371070548386, + "learning_rate": 0.00029177790932163617, + "loss": 3.008013963699341, + "step": 5138, + "token_acc": 0.300569405971068 + }, + { + "epoch": 3.0123131046613896, + "grad_norm": 0.2899917737805946, + "learning_rate": 0.0002917731614879716, + "loss": 2.9970388412475586, + "step": 5139, + "token_acc": 0.29985284266824197 + }, + { + "epoch": 3.0128994429785987, + "grad_norm": 0.31385162365275776, + "learning_rate": 0.00029176841232253926, + "loss": 3.005070209503174, + "step": 5140, + "token_acc": 0.29971071687089934 + }, + { + "epoch": 3.013485781295808, + "grad_norm": 0.28444976093957813, + "learning_rate": 0.00029176366182538367, + "loss": 3.01517391204834, + "step": 5141, + "token_acc": 0.29762677952252753 + }, + { + "epoch": 3.014072119613017, + "grad_norm": 0.2628632016505637, + "learning_rate": 0.00029175890999654956, + "loss": 3.0234503746032715, + "step": 5142, + "token_acc": 0.29601649572497646 + }, + { + "epoch": 3.0146584579302256, + "grad_norm": 0.2764732814873224, + "learning_rate": 0.0002917541568360815, + "loss": 3.0196003913879395, + "step": 5143, + "token_acc": 0.2971501463043987 + }, + { + "epoch": 3.0152447962474347, + "grad_norm": 0.26490755174116515, + "learning_rate": 0.00029174940234402415, + "loss": 3.019275665283203, + "step": 5144, + "token_acc": 0.2972331600777025 + }, + { + "epoch": 3.015831134564644, + "grad_norm": 0.31653988455340526, + "learning_rate": 0.0002917446465204222, + "loss": 3.069411516189575, + "step": 5145, + "token_acc": 0.2883319037287451 + }, + { + "epoch": 3.016417472881853, + "grad_norm": 0.29913382677093603, + "learning_rate": 0.0002917398893653202, + "loss": 3.0390093326568604, + "step": 5146, + "token_acc": 0.2941605265796441 + }, + { + "epoch": 3.017003811199062, + "grad_norm": 0.2550690008646368, + "learning_rate": 0.000291735130878763, + "loss": 3.0008602142333984, + "step": 5147, + "token_acc": 0.3000978681317543 + }, + { + "epoch": 3.0175901495162707, + "grad_norm": 0.25606412007552953, + "learning_rate": 0.0002917303710607953, + "loss": 2.9886553287506104, + "step": 5148, + "token_acc": 0.30346013160045776 + }, + { + "epoch": 3.01817648783348, + "grad_norm": 0.2496773079359478, + "learning_rate": 0.00029172560991146167, + "loss": 3.069382905960083, + "step": 5149, + "token_acc": 0.2914038602441986 + }, + { + "epoch": 3.018762826150689, + "grad_norm": 0.2616090078057608, + "learning_rate": 0.00029172084743080693, + "loss": 3.0578813552856445, + "step": 5150, + "token_acc": 0.2928235595936915 + }, + { + "epoch": 3.019349164467898, + "grad_norm": 0.27315518524246335, + "learning_rate": 0.0002917160836188758, + "loss": 3.0412020683288574, + "step": 5151, + "token_acc": 0.29490910109241625 + }, + { + "epoch": 3.019935502785107, + "grad_norm": 0.23878646080769525, + "learning_rate": 0.00029171131847571303, + "loss": 3.0304746627807617, + "step": 5152, + "token_acc": 0.2951688818932877 + }, + { + "epoch": 3.0205218411023163, + "grad_norm": 0.2876082097503468, + "learning_rate": 0.0002917065520013634, + "loss": 2.981696844100952, + "step": 5153, + "token_acc": 0.3020214885880395 + }, + { + "epoch": 3.021108179419525, + "grad_norm": 0.28207747030239555, + "learning_rate": 0.00029170178419587166, + "loss": 3.0600028038024902, + "step": 5154, + "token_acc": 0.29314351574484776 + }, + { + "epoch": 3.021694517736734, + "grad_norm": 0.2817992677196652, + "learning_rate": 0.00029169701505928254, + "loss": 3.0093984603881836, + "step": 5155, + "token_acc": 0.29840652288663994 + }, + { + "epoch": 3.022280856053943, + "grad_norm": 0.26918558903648926, + "learning_rate": 0.00029169224459164097, + "loss": 3.035729169845581, + "step": 5156, + "token_acc": 0.2949751625497862 + }, + { + "epoch": 3.0228671943711523, + "grad_norm": 0.2569778488638999, + "learning_rate": 0.0002916874727929917, + "loss": 3.046964168548584, + "step": 5157, + "token_acc": 0.29369545610447023 + }, + { + "epoch": 3.0234535326883614, + "grad_norm": 0.24400015281268123, + "learning_rate": 0.00029168269966337956, + "loss": 3.0505614280700684, + "step": 5158, + "token_acc": 0.2927835603959866 + }, + { + "epoch": 3.02403987100557, + "grad_norm": 0.2669409104658442, + "learning_rate": 0.0002916779252028493, + "loss": 3.012430191040039, + "step": 5159, + "token_acc": 0.298516996080007 + }, + { + "epoch": 3.024626209322779, + "grad_norm": 0.31446285633272164, + "learning_rate": 0.0002916731494114459, + "loss": 3.041745662689209, + "step": 5160, + "token_acc": 0.2932763768602614 + }, + { + "epoch": 3.0252125476399883, + "grad_norm": 0.3328351220803896, + "learning_rate": 0.00029166837228921413, + "loss": 3.0514984130859375, + "step": 5161, + "token_acc": 0.2918695738820728 + }, + { + "epoch": 3.0257988859571974, + "grad_norm": 0.266987277661329, + "learning_rate": 0.00029166359383619897, + "loss": 3.0127437114715576, + "step": 5162, + "token_acc": 0.2983614997558438 + }, + { + "epoch": 3.0263852242744065, + "grad_norm": 0.2816894640610194, + "learning_rate": 0.0002916588140524452, + "loss": 2.994537353515625, + "step": 5163, + "token_acc": 0.301362910838862 + }, + { + "epoch": 3.026971562591615, + "grad_norm": 0.2696814467394109, + "learning_rate": 0.00029165403293799773, + "loss": 3.0565550327301025, + "step": 5164, + "token_acc": 0.292368946714895 + }, + { + "epoch": 3.0275579009088243, + "grad_norm": 0.2804114869835539, + "learning_rate": 0.0002916492504929016, + "loss": 3.0026986598968506, + "step": 5165, + "token_acc": 0.30045732103416917 + }, + { + "epoch": 3.0281442392260334, + "grad_norm": 0.2556176234800986, + "learning_rate": 0.00029164446671720154, + "loss": 3.011725425720215, + "step": 5166, + "token_acc": 0.298342469226808 + }, + { + "epoch": 3.0287305775432425, + "grad_norm": 0.2677426948848304, + "learning_rate": 0.00029163968161094255, + "loss": 3.010127544403076, + "step": 5167, + "token_acc": 0.2979048952447622 + }, + { + "epoch": 3.0293169158604516, + "grad_norm": 0.28163923321410006, + "learning_rate": 0.0002916348951741697, + "loss": 3.051567792892456, + "step": 5168, + "token_acc": 0.2928791623143359 + }, + { + "epoch": 3.0299032541776607, + "grad_norm": 0.23808396751546573, + "learning_rate": 0.00029163010740692783, + "loss": 3.0435824394226074, + "step": 5169, + "token_acc": 0.29455579528387477 + }, + { + "epoch": 3.0304895924948694, + "grad_norm": 0.2833773570574727, + "learning_rate": 0.00029162531830926203, + "loss": 2.9934639930725098, + "step": 5170, + "token_acc": 0.3011256439064719 + }, + { + "epoch": 3.0310759308120785, + "grad_norm": 0.24085181271272585, + "learning_rate": 0.0002916205278812171, + "loss": 3.0210280418395996, + "step": 5171, + "token_acc": 0.295643903564126 + }, + { + "epoch": 3.0316622691292876, + "grad_norm": 0.25357470192780635, + "learning_rate": 0.0002916157361228382, + "loss": 3.0010781288146973, + "step": 5172, + "token_acc": 0.2988520627204396 + }, + { + "epoch": 3.0322486074464967, + "grad_norm": 0.25228637212539484, + "learning_rate": 0.00029161094303417027, + "loss": 3.048628330230713, + "step": 5173, + "token_acc": 0.293258409239119 + }, + { + "epoch": 3.032834945763706, + "grad_norm": 0.24885748925346615, + "learning_rate": 0.00029160614861525836, + "loss": 3.0450143814086914, + "step": 5174, + "token_acc": 0.2933806134115949 + }, + { + "epoch": 3.0334212840809145, + "grad_norm": 0.23571978509644365, + "learning_rate": 0.0002916013528661475, + "loss": 2.995908498764038, + "step": 5175, + "token_acc": 0.2998466958384519 + }, + { + "epoch": 3.0340076223981236, + "grad_norm": 0.2552105548561712, + "learning_rate": 0.00029159655578688275, + "loss": 3.0059757232666016, + "step": 5176, + "token_acc": 0.3004024242921667 + }, + { + "epoch": 3.0345939607153327, + "grad_norm": 0.24088933008471156, + "learning_rate": 0.00029159175737750913, + "loss": 3.0323877334594727, + "step": 5177, + "token_acc": 0.29646869942113674 + }, + { + "epoch": 3.035180299032542, + "grad_norm": 0.28513270486128606, + "learning_rate": 0.0002915869576380718, + "loss": 3.0714263916015625, + "step": 5178, + "token_acc": 0.2894802222948676 + }, + { + "epoch": 3.035766637349751, + "grad_norm": 0.30758702171737035, + "learning_rate": 0.00029158215656861574, + "loss": 3.0113954544067383, + "step": 5179, + "token_acc": 0.29699933341665624 + }, + { + "epoch": 3.03635297566696, + "grad_norm": 0.323033706133563, + "learning_rate": 0.0002915773541691861, + "loss": 3.0249106884002686, + "step": 5180, + "token_acc": 0.2972941289867116 + }, + { + "epoch": 3.0369393139841687, + "grad_norm": 0.28729023130252634, + "learning_rate": 0.00029157255043982803, + "loss": 3.077242136001587, + "step": 5181, + "token_acc": 0.2900023214006144 + }, + { + "epoch": 3.037525652301378, + "grad_norm": 0.3060869474858443, + "learning_rate": 0.0002915677453805866, + "loss": 3.0337257385253906, + "step": 5182, + "token_acc": 0.29562530501251366 + }, + { + "epoch": 3.038111990618587, + "grad_norm": 0.3344903339440123, + "learning_rate": 0.000291562938991507, + "loss": 3.0713930130004883, + "step": 5183, + "token_acc": 0.28970109492038265 + }, + { + "epoch": 3.038698328935796, + "grad_norm": 0.28792158311649885, + "learning_rate": 0.00029155813127263434, + "loss": 3.0177316665649414, + "step": 5184, + "token_acc": 0.2985738763674707 + }, + { + "epoch": 3.039284667253005, + "grad_norm": 0.2875059348409001, + "learning_rate": 0.00029155332222401375, + "loss": 2.99538516998291, + "step": 5185, + "token_acc": 0.30071428949920254 + }, + { + "epoch": 3.039871005570214, + "grad_norm": 0.246600797635135, + "learning_rate": 0.00029154851184569043, + "loss": 3.019031524658203, + "step": 5186, + "token_acc": 0.29730581416122437 + }, + { + "epoch": 3.040457343887423, + "grad_norm": 0.2584191600695276, + "learning_rate": 0.00029154370013770965, + "loss": 3.0655641555786133, + "step": 5187, + "token_acc": 0.2925974691965461 + }, + { + "epoch": 3.041043682204632, + "grad_norm": 0.27371519022451346, + "learning_rate": 0.00029153888710011655, + "loss": 3.0783467292785645, + "step": 5188, + "token_acc": 0.29066810747693467 + }, + { + "epoch": 3.041630020521841, + "grad_norm": 0.2441482242471838, + "learning_rate": 0.00029153407273295625, + "loss": 3.028428554534912, + "step": 5189, + "token_acc": 0.2975527092776646 + }, + { + "epoch": 3.0422163588390503, + "grad_norm": 0.27158319958110794, + "learning_rate": 0.0002915292570362741, + "loss": 3.0171897411346436, + "step": 5190, + "token_acc": 0.29848974529887745 + }, + { + "epoch": 3.042802697156259, + "grad_norm": 0.26310857194812015, + "learning_rate": 0.0002915244400101153, + "loss": 3.032041072845459, + "step": 5191, + "token_acc": 0.29355104894745365 + }, + { + "epoch": 3.043389035473468, + "grad_norm": 0.2601856517982112, + "learning_rate": 0.00029151962165452507, + "loss": 3.0440006256103516, + "step": 5192, + "token_acc": 0.2937117779191607 + }, + { + "epoch": 3.043975373790677, + "grad_norm": 0.24458705158506702, + "learning_rate": 0.0002915148019695487, + "loss": 3.008754253387451, + "step": 5193, + "token_acc": 0.2994218575158523 + }, + { + "epoch": 3.0445617121078863, + "grad_norm": 0.30684553523996044, + "learning_rate": 0.00029150998095523145, + "loss": 3.0330638885498047, + "step": 5194, + "token_acc": 0.294830181760904 + }, + { + "epoch": 3.0451480504250954, + "grad_norm": 0.28298543103930274, + "learning_rate": 0.00029150515861161864, + "loss": 3.0463099479675293, + "step": 5195, + "token_acc": 0.29323223870849235 + }, + { + "epoch": 3.0457343887423045, + "grad_norm": 0.28390459720668926, + "learning_rate": 0.00029150033493875553, + "loss": 2.9751009941101074, + "step": 5196, + "token_acc": 0.30497469340331956 + }, + { + "epoch": 3.046320727059513, + "grad_norm": 0.3215378811248207, + "learning_rate": 0.0002914955099366874, + "loss": 3.0451245307922363, + "step": 5197, + "token_acc": 0.29320234903226133 + }, + { + "epoch": 3.0469070653767223, + "grad_norm": 0.2798016452920836, + "learning_rate": 0.0002914906836054597, + "loss": 3.027647018432617, + "step": 5198, + "token_acc": 0.29544133727383654 + }, + { + "epoch": 3.0474934036939314, + "grad_norm": 0.2888236980541533, + "learning_rate": 0.00029148585594511765, + "loss": 3.015082359313965, + "step": 5199, + "token_acc": 0.29608200141198704 + }, + { + "epoch": 3.0480797420111405, + "grad_norm": 0.26177040561509235, + "learning_rate": 0.00029148102695570667, + "loss": 3.013658046722412, + "step": 5200, + "token_acc": 0.29756072628767843 + }, + { + "epoch": 3.0486660803283496, + "grad_norm": 0.27437900107557617, + "learning_rate": 0.0002914761966372721, + "loss": 3.0065486431121826, + "step": 5201, + "token_acc": 0.2994768684409191 + }, + { + "epoch": 3.0492524186455583, + "grad_norm": 0.2734905893590825, + "learning_rate": 0.00029147136498985926, + "loss": 3.011589527130127, + "step": 5202, + "token_acc": 0.29777078570054916 + }, + { + "epoch": 3.0498387569627674, + "grad_norm": 0.27222738910048155, + "learning_rate": 0.0002914665320135136, + "loss": 3.0344481468200684, + "step": 5203, + "token_acc": 0.29587463816161846 + }, + { + "epoch": 3.0504250952799765, + "grad_norm": 0.27026541942071214, + "learning_rate": 0.0002914616977082805, + "loss": 3.020756483078003, + "step": 5204, + "token_acc": 0.29807481616991355 + }, + { + "epoch": 3.0510114335971856, + "grad_norm": 0.28178055624345033, + "learning_rate": 0.00029145686207420537, + "loss": 3.0360846519470215, + "step": 5205, + "token_acc": 0.2954580074483005 + }, + { + "epoch": 3.0515977719143947, + "grad_norm": 0.2773927270643848, + "learning_rate": 0.0002914520251113336, + "loss": 3.049901247024536, + "step": 5206, + "token_acc": 0.2930996613424732 + }, + { + "epoch": 3.052184110231604, + "grad_norm": 0.3081820774148254, + "learning_rate": 0.0002914471868197107, + "loss": 3.057295083999634, + "step": 5207, + "token_acc": 0.29189094614204997 + }, + { + "epoch": 3.0527704485488125, + "grad_norm": 0.2827348820056371, + "learning_rate": 0.0002914423471993821, + "loss": 3.0319340229034424, + "step": 5208, + "token_acc": 0.2952012650733338 + }, + { + "epoch": 3.0533567868660216, + "grad_norm": 0.26348809936882417, + "learning_rate": 0.0002914375062503932, + "loss": 3.078206777572632, + "step": 5209, + "token_acc": 0.2915844021109129 + }, + { + "epoch": 3.0539431251832307, + "grad_norm": 0.3251125014485965, + "learning_rate": 0.00029143266397278956, + "loss": 3.0280237197875977, + "step": 5210, + "token_acc": 0.29541854781177646 + }, + { + "epoch": 3.05452946350044, + "grad_norm": 0.3262754941828055, + "learning_rate": 0.0002914278203666166, + "loss": 3.058741331100464, + "step": 5211, + "token_acc": 0.29180403818389467 + }, + { + "epoch": 3.055115801817649, + "grad_norm": 0.27571426230826795, + "learning_rate": 0.0002914229754319198, + "loss": 2.995025396347046, + "step": 5212, + "token_acc": 0.29915258526761096 + }, + { + "epoch": 3.0557021401348576, + "grad_norm": 0.3178771911569342, + "learning_rate": 0.0002914181291687448, + "loss": 3.016723394393921, + "step": 5213, + "token_acc": 0.2970398593200469 + }, + { + "epoch": 3.0562884784520667, + "grad_norm": 0.27403052071779477, + "learning_rate": 0.000291413281577137, + "loss": 3.040548801422119, + "step": 5214, + "token_acc": 0.2947568849053434 + }, + { + "epoch": 3.056874816769276, + "grad_norm": 0.2694084241339378, + "learning_rate": 0.00029140843265714205, + "loss": 3.008141040802002, + "step": 5215, + "token_acc": 0.2980059936949443 + }, + { + "epoch": 3.057461155086485, + "grad_norm": 0.2600415806676593, + "learning_rate": 0.00029140358240880535, + "loss": 3.0298898220062256, + "step": 5216, + "token_acc": 0.2959809264305177 + }, + { + "epoch": 3.058047493403694, + "grad_norm": 0.23645295401223948, + "learning_rate": 0.0002913987308321725, + "loss": 3.037425994873047, + "step": 5217, + "token_acc": 0.29331531247496506 + }, + { + "epoch": 3.0586338317209028, + "grad_norm": 0.2988990653137114, + "learning_rate": 0.00029139387792728917, + "loss": 3.02128005027771, + "step": 5218, + "token_acc": 0.2969729906260684 + }, + { + "epoch": 3.059220170038112, + "grad_norm": 0.2689150517806915, + "learning_rate": 0.00029138902369420087, + "loss": 3.015134334564209, + "step": 5219, + "token_acc": 0.29755498583187157 + }, + { + "epoch": 3.059806508355321, + "grad_norm": 0.2712522978119572, + "learning_rate": 0.0002913841681329532, + "loss": 2.978469133377075, + "step": 5220, + "token_acc": 0.3040158257022521 + }, + { + "epoch": 3.06039284667253, + "grad_norm": 0.2825418468138627, + "learning_rate": 0.0002913793112435918, + "loss": 3.059384822845459, + "step": 5221, + "token_acc": 0.29446063335430533 + }, + { + "epoch": 3.060979184989739, + "grad_norm": 0.288214756304783, + "learning_rate": 0.0002913744530261623, + "loss": 3.016928195953369, + "step": 5222, + "token_acc": 0.30014532288327195 + }, + { + "epoch": 3.0615655233069483, + "grad_norm": 0.2924105011258072, + "learning_rate": 0.0002913695934807103, + "loss": 3.0282325744628906, + "step": 5223, + "token_acc": 0.29591279787611746 + }, + { + "epoch": 3.062151861624157, + "grad_norm": 0.264473777104705, + "learning_rate": 0.0002913647326072815, + "loss": 3.0511207580566406, + "step": 5224, + "token_acc": 0.2938636938620616 + }, + { + "epoch": 3.062738199941366, + "grad_norm": 0.30071511551175406, + "learning_rate": 0.0002913598704059215, + "loss": 2.998683452606201, + "step": 5225, + "token_acc": 0.29979520593456505 + }, + { + "epoch": 3.063324538258575, + "grad_norm": 0.28618236494985083, + "learning_rate": 0.00029135500687667596, + "loss": 3.031513214111328, + "step": 5226, + "token_acc": 0.29687933369775665 + }, + { + "epoch": 3.0639108765757843, + "grad_norm": 0.29239835351191285, + "learning_rate": 0.0002913501420195907, + "loss": 3.032356023788452, + "step": 5227, + "token_acc": 0.29617430090924685 + }, + { + "epoch": 3.0644972148929934, + "grad_norm": 0.290236886561205, + "learning_rate": 0.0002913452758347113, + "loss": 3.036712646484375, + "step": 5228, + "token_acc": 0.2952040930338912 + }, + { + "epoch": 3.065083553210202, + "grad_norm": 0.2551781744286322, + "learning_rate": 0.00029134040832208346, + "loss": 3.0097084045410156, + "step": 5229, + "token_acc": 0.2986012970031567 + }, + { + "epoch": 3.065669891527411, + "grad_norm": 0.29453987930318065, + "learning_rate": 0.000291335539481753, + "loss": 3.0628161430358887, + "step": 5230, + "token_acc": 0.2914844453727758 + }, + { + "epoch": 3.0662562298446203, + "grad_norm": 0.2835681300026734, + "learning_rate": 0.0002913306693137655, + "loss": 3.051583766937256, + "step": 5231, + "token_acc": 0.2919014337469339 + }, + { + "epoch": 3.0668425681618294, + "grad_norm": 0.27537926074896424, + "learning_rate": 0.0002913257978181669, + "loss": 3.0277092456817627, + "step": 5232, + "token_acc": 0.2972079969001016 + }, + { + "epoch": 3.0674289064790385, + "grad_norm": 0.27716796763703283, + "learning_rate": 0.0002913209249950028, + "loss": 2.994946002960205, + "step": 5233, + "token_acc": 0.3012121707390156 + }, + { + "epoch": 3.068015244796247, + "grad_norm": 0.2788195107814617, + "learning_rate": 0.0002913160508443191, + "loss": 3.039104461669922, + "step": 5234, + "token_acc": 0.29692723116259684 + }, + { + "epoch": 3.0686015831134563, + "grad_norm": 0.2805907582201766, + "learning_rate": 0.0002913111753661615, + "loss": 3.0080413818359375, + "step": 5235, + "token_acc": 0.29870630636302115 + }, + { + "epoch": 3.0691879214306654, + "grad_norm": 0.2638732534983754, + "learning_rate": 0.00029130629856057586, + "loss": 3.0364460945129395, + "step": 5236, + "token_acc": 0.29493502112415865 + }, + { + "epoch": 3.0697742597478745, + "grad_norm": 0.2866275494992333, + "learning_rate": 0.0002913014204276079, + "loss": 3.0305275917053223, + "step": 5237, + "token_acc": 0.2953641710686169 + }, + { + "epoch": 3.0703605980650837, + "grad_norm": 0.3076917137824521, + "learning_rate": 0.00029129654096730353, + "loss": 3.06573486328125, + "step": 5238, + "token_acc": 0.29007896611219824 + }, + { + "epoch": 3.0709469363822928, + "grad_norm": 0.289842673824468, + "learning_rate": 0.0002912916601797086, + "loss": 3.041769027709961, + "step": 5239, + "token_acc": 0.2942454567595154 + }, + { + "epoch": 3.0715332746995014, + "grad_norm": 0.3215543516365585, + "learning_rate": 0.00029128677806486886, + "loss": 3.0553603172302246, + "step": 5240, + "token_acc": 0.29310713014592477 + }, + { + "epoch": 3.0721196130167105, + "grad_norm": 0.27710120859539134, + "learning_rate": 0.00029128189462283025, + "loss": 3.0448684692382812, + "step": 5241, + "token_acc": 0.2955300075240294 + }, + { + "epoch": 3.0727059513339197, + "grad_norm": 0.30535787732976594, + "learning_rate": 0.00029127700985363857, + "loss": 3.0393378734588623, + "step": 5242, + "token_acc": 0.2949774273624098 + }, + { + "epoch": 3.0732922896511288, + "grad_norm": 0.32328940982525484, + "learning_rate": 0.0002912721237573398, + "loss": 3.026825428009033, + "step": 5243, + "token_acc": 0.2962305732066103 + }, + { + "epoch": 3.073878627968338, + "grad_norm": 0.2694907499946927, + "learning_rate": 0.0002912672363339798, + "loss": 3.0590291023254395, + "step": 5244, + "token_acc": 0.29266625032302523 + }, + { + "epoch": 3.0744649662855466, + "grad_norm": 0.3130075470677598, + "learning_rate": 0.00029126234758360445, + "loss": 3.01023006439209, + "step": 5245, + "token_acc": 0.29972973895777466 + }, + { + "epoch": 3.0750513046027557, + "grad_norm": 0.30088267606481633, + "learning_rate": 0.0002912574575062597, + "loss": 3.0530476570129395, + "step": 5246, + "token_acc": 0.29454358333486486 + }, + { + "epoch": 3.0756376429199648, + "grad_norm": 0.2728116851746608, + "learning_rate": 0.00029125256610199155, + "loss": 3.008101463317871, + "step": 5247, + "token_acc": 0.29781640254144454 + }, + { + "epoch": 3.076223981237174, + "grad_norm": 0.25518764771562163, + "learning_rate": 0.0002912476733708458, + "loss": 3.0320041179656982, + "step": 5248, + "token_acc": 0.29377881977042736 + }, + { + "epoch": 3.076810319554383, + "grad_norm": 0.2559697073960544, + "learning_rate": 0.0002912427793128685, + "loss": 2.984266996383667, + "step": 5249, + "token_acc": 0.301332083517329 + }, + { + "epoch": 3.077396657871592, + "grad_norm": 0.2866211764693689, + "learning_rate": 0.00029123788392810564, + "loss": 3.0712506771087646, + "step": 5250, + "token_acc": 0.2891027880743031 + }, + { + "epoch": 3.077982996188801, + "grad_norm": 0.3019561852962493, + "learning_rate": 0.0002912329872166032, + "loss": 3.045159339904785, + "step": 5251, + "token_acc": 0.29561285428885825 + }, + { + "epoch": 3.07856933450601, + "grad_norm": 0.2787327847685759, + "learning_rate": 0.0002912280891784071, + "loss": 3.012904405593872, + "step": 5252, + "token_acc": 0.2982584519960475 + }, + { + "epoch": 3.079155672823219, + "grad_norm": 0.3018055043127577, + "learning_rate": 0.0002912231898135635, + "loss": 3.0492429733276367, + "step": 5253, + "token_acc": 0.2929207409780381 + }, + { + "epoch": 3.079742011140428, + "grad_norm": 0.2712480711430383, + "learning_rate": 0.0002912182891221182, + "loss": 3.022808313369751, + "step": 5254, + "token_acc": 0.2956509827372281 + }, + { + "epoch": 3.0803283494576372, + "grad_norm": 0.28303728685351587, + "learning_rate": 0.0002912133871041175, + "loss": 3.073479413986206, + "step": 5255, + "token_acc": 0.2903218991229576 + }, + { + "epoch": 3.080914687774846, + "grad_norm": 0.26367324493965505, + "learning_rate": 0.0002912084837596073, + "loss": 3.027346611022949, + "step": 5256, + "token_acc": 0.29459692581038366 + }, + { + "epoch": 3.081501026092055, + "grad_norm": 0.29592893911711987, + "learning_rate": 0.0002912035790886336, + "loss": 3.021733045578003, + "step": 5257, + "token_acc": 0.2973739174600542 + }, + { + "epoch": 3.082087364409264, + "grad_norm": 0.2728082061170481, + "learning_rate": 0.0002911986730912426, + "loss": 3.0271127223968506, + "step": 5258, + "token_acc": 0.2964456058722925 + }, + { + "epoch": 3.0826737027264732, + "grad_norm": 0.2637405929146654, + "learning_rate": 0.0002911937657674803, + "loss": 3.0387301445007324, + "step": 5259, + "token_acc": 0.2964996362586133 + }, + { + "epoch": 3.0832600410436823, + "grad_norm": 0.3012274951073473, + "learning_rate": 0.00029118885711739285, + "loss": 3.0389609336853027, + "step": 5260, + "token_acc": 0.2937640715262504 + }, + { + "epoch": 3.0838463793608915, + "grad_norm": 0.28951819611741003, + "learning_rate": 0.0002911839471410264, + "loss": 3.0748915672302246, + "step": 5261, + "token_acc": 0.28973866062573617 + }, + { + "epoch": 3.0844327176781, + "grad_norm": 0.27851112935665323, + "learning_rate": 0.0002911790358384269, + "loss": 3.0424728393554688, + "step": 5262, + "token_acc": 0.2955294857131081 + }, + { + "epoch": 3.0850190559953092, + "grad_norm": 0.3395531145577602, + "learning_rate": 0.0002911741232096407, + "loss": 3.101142168045044, + "step": 5263, + "token_acc": 0.2880601241923303 + }, + { + "epoch": 3.0856053943125183, + "grad_norm": 0.28791130132430853, + "learning_rate": 0.00029116920925471374, + "loss": 3.0443780422210693, + "step": 5264, + "token_acc": 0.2961017472025128 + }, + { + "epoch": 3.0861917326297275, + "grad_norm": 0.2624137148651111, + "learning_rate": 0.00029116429397369235, + "loss": 3.0252060890197754, + "step": 5265, + "token_acc": 0.2978307861998472 + }, + { + "epoch": 3.0867780709469366, + "grad_norm": 0.27412093286040606, + "learning_rate": 0.00029115937736662263, + "loss": 3.0254530906677246, + "step": 5266, + "token_acc": 0.2970956477745991 + }, + { + "epoch": 3.0873644092641452, + "grad_norm": 0.26802666604916614, + "learning_rate": 0.00029115445943355084, + "loss": 3.106532096862793, + "step": 5267, + "token_acc": 0.28511577964272805 + }, + { + "epoch": 3.0879507475813543, + "grad_norm": 0.2834502545045432, + "learning_rate": 0.00029114954017452305, + "loss": 3.0601377487182617, + "step": 5268, + "token_acc": 0.29177304414180216 + }, + { + "epoch": 3.0885370858985635, + "grad_norm": 0.2665283934430523, + "learning_rate": 0.00029114461958958555, + "loss": 3.115663528442383, + "step": 5269, + "token_acc": 0.2850559699535245 + }, + { + "epoch": 3.0891234242157726, + "grad_norm": 0.32352971403250863, + "learning_rate": 0.0002911396976787845, + "loss": 3.04937481880188, + "step": 5270, + "token_acc": 0.2925367943468707 + }, + { + "epoch": 3.0897097625329817, + "grad_norm": 0.29724575038427375, + "learning_rate": 0.00029113477444216623, + "loss": 2.998047351837158, + "step": 5271, + "token_acc": 0.2993163006378029 + }, + { + "epoch": 3.0902961008501904, + "grad_norm": 0.2792337655021091, + "learning_rate": 0.0002911298498797769, + "loss": 3.008059501647949, + "step": 5272, + "token_acc": 0.2987994047144763 + }, + { + "epoch": 3.0908824391673995, + "grad_norm": 0.28302037125129015, + "learning_rate": 0.00029112492399166283, + "loss": 3.004110336303711, + "step": 5273, + "token_acc": 0.29970166759475214 + }, + { + "epoch": 3.0914687774846086, + "grad_norm": 0.2861043248853114, + "learning_rate": 0.00029111999677787026, + "loss": 3.030120849609375, + "step": 5274, + "token_acc": 0.29603712064208676 + }, + { + "epoch": 3.0920551158018177, + "grad_norm": 0.2898510173222872, + "learning_rate": 0.0002911150682384455, + "loss": 3.0446724891662598, + "step": 5275, + "token_acc": 0.29339746165912595 + }, + { + "epoch": 3.092641454119027, + "grad_norm": 0.2616553193854271, + "learning_rate": 0.0002911101383734348, + "loss": 3.023768186569214, + "step": 5276, + "token_acc": 0.29770671681861555 + }, + { + "epoch": 3.093227792436236, + "grad_norm": 0.26604392068591975, + "learning_rate": 0.0002911052071828845, + "loss": 3.068734884262085, + "step": 5277, + "token_acc": 0.29075208582568723 + }, + { + "epoch": 3.0938141307534446, + "grad_norm": 0.28024541318550394, + "learning_rate": 0.0002911002746668409, + "loss": 3.006502628326416, + "step": 5278, + "token_acc": 0.30001069938300223 + }, + { + "epoch": 3.0944004690706537, + "grad_norm": 0.23853992026649376, + "learning_rate": 0.0002910953408253504, + "loss": 3.0279572010040283, + "step": 5279, + "token_acc": 0.2963340877889503 + }, + { + "epoch": 3.094986807387863, + "grad_norm": 0.2683110507176322, + "learning_rate": 0.0002910904056584592, + "loss": 3.064157485961914, + "step": 5280, + "token_acc": 0.2924125614086543 + }, + { + "epoch": 3.095573145705072, + "grad_norm": 0.2580392770864522, + "learning_rate": 0.0002910854691662139, + "loss": 3.02544903755188, + "step": 5281, + "token_acc": 0.2980517705852497 + }, + { + "epoch": 3.096159484022281, + "grad_norm": 0.25705252987444965, + "learning_rate": 0.0002910805313486607, + "loss": 3.047605037689209, + "step": 5282, + "token_acc": 0.2936258405464214 + }, + { + "epoch": 3.0967458223394897, + "grad_norm": 0.2686810867968365, + "learning_rate": 0.000291075592205846, + "loss": 3.029672622680664, + "step": 5283, + "token_acc": 0.2948142980965197 + }, + { + "epoch": 3.097332160656699, + "grad_norm": 0.25798674993215, + "learning_rate": 0.0002910706517378162, + "loss": 3.0323455333709717, + "step": 5284, + "token_acc": 0.2957712142993576 + }, + { + "epoch": 3.097918498973908, + "grad_norm": 0.26529552182636706, + "learning_rate": 0.0002910657099446177, + "loss": 3.0670180320739746, + "step": 5285, + "token_acc": 0.29174760107759645 + }, + { + "epoch": 3.098504837291117, + "grad_norm": 0.26072530967769364, + "learning_rate": 0.000291060766826297, + "loss": 3.051422595977783, + "step": 5286, + "token_acc": 0.2943143118737793 + }, + { + "epoch": 3.099091175608326, + "grad_norm": 0.2662723923015808, + "learning_rate": 0.00029105582238290046, + "loss": 2.9574272632598877, + "step": 5287, + "token_acc": 0.3073538383462097 + }, + { + "epoch": 3.099677513925535, + "grad_norm": 0.25998451113702425, + "learning_rate": 0.0002910508766144745, + "loss": 3.0511386394500732, + "step": 5288, + "token_acc": 0.2935157178351518 + }, + { + "epoch": 3.100263852242744, + "grad_norm": 0.2653944095854091, + "learning_rate": 0.00029104592952106567, + "loss": 3.054295063018799, + "step": 5289, + "token_acc": 0.29177011494252875 + }, + { + "epoch": 3.100850190559953, + "grad_norm": 0.25216494244240106, + "learning_rate": 0.00029104098110272034, + "loss": 3.0196685791015625, + "step": 5290, + "token_acc": 0.2969948857123508 + }, + { + "epoch": 3.101436528877162, + "grad_norm": 0.2670381508079933, + "learning_rate": 0.0002910360313594851, + "loss": 3.029458522796631, + "step": 5291, + "token_acc": 0.2967890237007583 + }, + { + "epoch": 3.1020228671943713, + "grad_norm": 0.2648730170671892, + "learning_rate": 0.00029103108029140636, + "loss": 3.054354429244995, + "step": 5292, + "token_acc": 0.2946121846661607 + }, + { + "epoch": 3.1026092055115804, + "grad_norm": 0.2635941475656157, + "learning_rate": 0.0002910261278985307, + "loss": 3.018207311630249, + "step": 5293, + "token_acc": 0.29725975567407315 + }, + { + "epoch": 3.103195543828789, + "grad_norm": 0.2500654426960767, + "learning_rate": 0.0002910211741809046, + "loss": 3.0683445930480957, + "step": 5294, + "token_acc": 0.2904388587232831 + }, + { + "epoch": 3.103781882145998, + "grad_norm": 0.2602962232334179, + "learning_rate": 0.00029101621913857454, + "loss": 3.0397167205810547, + "step": 5295, + "token_acc": 0.29478745005567564 + }, + { + "epoch": 3.1043682204632073, + "grad_norm": 0.2633219573744176, + "learning_rate": 0.0002910112627715872, + "loss": 3.0127696990966797, + "step": 5296, + "token_acc": 0.2986773426262554 + }, + { + "epoch": 3.1049545587804164, + "grad_norm": 0.24200877709083898, + "learning_rate": 0.000291006305079989, + "loss": 3.0220041275024414, + "step": 5297, + "token_acc": 0.29768044812218947 + }, + { + "epoch": 3.1055408970976255, + "grad_norm": 0.2534617035429079, + "learning_rate": 0.0002910013460638266, + "loss": 3.021932601928711, + "step": 5298, + "token_acc": 0.29764065335753176 + }, + { + "epoch": 3.106127235414834, + "grad_norm": 0.2659963699826802, + "learning_rate": 0.0002909963857231466, + "loss": 3.060288906097412, + "step": 5299, + "token_acc": 0.29008916495516707 + }, + { + "epoch": 3.1067135737320433, + "grad_norm": 0.29034979606634165, + "learning_rate": 0.00029099142405799547, + "loss": 3.0164363384246826, + "step": 5300, + "token_acc": 0.29796215544183086 + }, + { + "epoch": 3.1072999120492524, + "grad_norm": 0.2914038929001619, + "learning_rate": 0.00029098646106841996, + "loss": 3.094759464263916, + "step": 5301, + "token_acc": 0.2892385325247455 + }, + { + "epoch": 3.1078862503664615, + "grad_norm": 0.29332768010568017, + "learning_rate": 0.0002909814967544666, + "loss": 3.0512495040893555, + "step": 5302, + "token_acc": 0.29191861970321875 + }, + { + "epoch": 3.1084725886836706, + "grad_norm": 0.2653524462581572, + "learning_rate": 0.00029097653111618204, + "loss": 3.033874988555908, + "step": 5303, + "token_acc": 0.2954474218328495 + }, + { + "epoch": 3.1090589270008797, + "grad_norm": 0.3216990128984195, + "learning_rate": 0.000290971564153613, + "loss": 3.034513473510742, + "step": 5304, + "token_acc": 0.29418865820494344 + }, + { + "epoch": 3.1096452653180884, + "grad_norm": 0.29215720013142954, + "learning_rate": 0.00029096659586680596, + "loss": 3.055514335632324, + "step": 5305, + "token_acc": 0.29384500569543187 + }, + { + "epoch": 3.1102316036352975, + "grad_norm": 0.2661209457879693, + "learning_rate": 0.0002909616262558078, + "loss": 3.0203256607055664, + "step": 5306, + "token_acc": 0.2977842580130602 + }, + { + "epoch": 3.1108179419525066, + "grad_norm": 0.31991968140128063, + "learning_rate": 0.00029095665532066507, + "loss": 3.0420079231262207, + "step": 5307, + "token_acc": 0.2950226499984592 + }, + { + "epoch": 3.1114042802697157, + "grad_norm": 0.27564936949255975, + "learning_rate": 0.00029095168306142455, + "loss": 3.016251564025879, + "step": 5308, + "token_acc": 0.2988576704920195 + }, + { + "epoch": 3.111990618586925, + "grad_norm": 0.26903559222941487, + "learning_rate": 0.00029094670947813286, + "loss": 3.068302631378174, + "step": 5309, + "token_acc": 0.2918855142908841 + }, + { + "epoch": 3.1125769569041335, + "grad_norm": 0.2669738442698242, + "learning_rate": 0.0002909417345708368, + "loss": 3.0516302585601807, + "step": 5310, + "token_acc": 0.2929250729808628 + }, + { + "epoch": 3.1131632952213426, + "grad_norm": 0.23858634997733813, + "learning_rate": 0.000290936758339583, + "loss": 3.031249761581421, + "step": 5311, + "token_acc": 0.2978399611177066 + }, + { + "epoch": 3.1137496335385517, + "grad_norm": 0.26892259920206113, + "learning_rate": 0.00029093178078441837, + "loss": 2.981503963470459, + "step": 5312, + "token_acc": 0.30284342524703056 + }, + { + "epoch": 3.114335971855761, + "grad_norm": 0.2714404817560093, + "learning_rate": 0.00029092680190538953, + "loss": 3.0349693298339844, + "step": 5313, + "token_acc": 0.2954997051667666 + }, + { + "epoch": 3.11492231017297, + "grad_norm": 0.2846947492250074, + "learning_rate": 0.0002909218217025433, + "loss": 3.0137877464294434, + "step": 5314, + "token_acc": 0.2975225900219336 + }, + { + "epoch": 3.115508648490179, + "grad_norm": 0.28823721445773864, + "learning_rate": 0.00029091684017592634, + "loss": 3.0339722633361816, + "step": 5315, + "token_acc": 0.29658804957721824 + }, + { + "epoch": 3.1160949868073877, + "grad_norm": 0.31741437808371786, + "learning_rate": 0.00029091185732558567, + "loss": 3.025500774383545, + "step": 5316, + "token_acc": 0.29732495172529594 + }, + { + "epoch": 3.116681325124597, + "grad_norm": 0.2757561053419513, + "learning_rate": 0.00029090687315156793, + "loss": 3.0023460388183594, + "step": 5317, + "token_acc": 0.2985872099123052 + }, + { + "epoch": 3.117267663441806, + "grad_norm": 0.28865608109861973, + "learning_rate": 0.00029090188765391997, + "loss": 3.043158769607544, + "step": 5318, + "token_acc": 0.29525406057384673 + }, + { + "epoch": 3.117854001759015, + "grad_norm": 0.28475799256125384, + "learning_rate": 0.0002908969008326887, + "loss": 3.01466965675354, + "step": 5319, + "token_acc": 0.2975835688828807 + }, + { + "epoch": 3.118440340076224, + "grad_norm": 0.27167308381708766, + "learning_rate": 0.0002908919126879209, + "loss": 2.966885566711426, + "step": 5320, + "token_acc": 0.30620411875527836 + }, + { + "epoch": 3.119026678393433, + "grad_norm": 0.2877389023490949, + "learning_rate": 0.0002908869232196634, + "loss": 3.0306015014648438, + "step": 5321, + "token_acc": 0.2959798101495928 + }, + { + "epoch": 3.119613016710642, + "grad_norm": 0.28857422370392966, + "learning_rate": 0.0002908819324279631, + "loss": 3.001605987548828, + "step": 5322, + "token_acc": 0.2997729818412726 + }, + { + "epoch": 3.120199355027851, + "grad_norm": 0.28685639084763337, + "learning_rate": 0.00029087694031286693, + "loss": 3.020561695098877, + "step": 5323, + "token_acc": 0.29820995626246816 + }, + { + "epoch": 3.12078569334506, + "grad_norm": 0.26485751034770716, + "learning_rate": 0.0002908719468744217, + "loss": 2.9896717071533203, + "step": 5324, + "token_acc": 0.30070886220727266 + }, + { + "epoch": 3.1213720316622693, + "grad_norm": 0.25036735665863324, + "learning_rate": 0.0002908669521126744, + "loss": 2.986374616622925, + "step": 5325, + "token_acc": 0.30080679760995366 + }, + { + "epoch": 3.121958369979478, + "grad_norm": 0.2798110967066586, + "learning_rate": 0.0002908619560276719, + "loss": 3.0329840183258057, + "step": 5326, + "token_acc": 0.29525755181230645 + }, + { + "epoch": 3.122544708296687, + "grad_norm": 0.28123915862052506, + "learning_rate": 0.0002908569586194611, + "loss": 3.0159850120544434, + "step": 5327, + "token_acc": 0.297900702384756 + }, + { + "epoch": 3.123131046613896, + "grad_norm": 0.27559481323307433, + "learning_rate": 0.000290851959888089, + "loss": 3.0007190704345703, + "step": 5328, + "token_acc": 0.2997485753468777 + }, + { + "epoch": 3.1237173849311053, + "grad_norm": 0.2814679782227896, + "learning_rate": 0.00029084695983360256, + "loss": 3.042935371398926, + "step": 5329, + "token_acc": 0.29395242234630453 + }, + { + "epoch": 3.1243037232483144, + "grad_norm": 0.2765345249596503, + "learning_rate": 0.00029084195845604865, + "loss": 3.067530393600464, + "step": 5330, + "token_acc": 0.29082318147729047 + }, + { + "epoch": 3.1248900615655235, + "grad_norm": 0.3066915918381733, + "learning_rate": 0.0002908369557554744, + "loss": 3.0139434337615967, + "step": 5331, + "token_acc": 0.2991293405608086 + }, + { + "epoch": 3.125476399882732, + "grad_norm": 0.29974822109018, + "learning_rate": 0.00029083195173192674, + "loss": 2.991257667541504, + "step": 5332, + "token_acc": 0.3010812893283678 + }, + { + "epoch": 3.1260627381999413, + "grad_norm": 0.30507686688699515, + "learning_rate": 0.00029082694638545264, + "loss": 3.0258185863494873, + "step": 5333, + "token_acc": 0.29595493441460025 + }, + { + "epoch": 3.1266490765171504, + "grad_norm": 0.2899025326024655, + "learning_rate": 0.0002908219397160991, + "loss": 3.0256729125976562, + "step": 5334, + "token_acc": 0.29662606406955117 + }, + { + "epoch": 3.1272354148343595, + "grad_norm": 0.2621229607588633, + "learning_rate": 0.00029081693172391325, + "loss": 3.0674326419830322, + "step": 5335, + "token_acc": 0.29262342173825906 + }, + { + "epoch": 3.1278217531515686, + "grad_norm": 0.2716514394437105, + "learning_rate": 0.00029081192240894207, + "loss": 3.0238611698150635, + "step": 5336, + "token_acc": 0.29641827912721286 + }, + { + "epoch": 3.1284080914687773, + "grad_norm": 0.2909894649130366, + "learning_rate": 0.00029080691177123263, + "loss": 3.021062135696411, + "step": 5337, + "token_acc": 0.2970664539070797 + }, + { + "epoch": 3.1289944297859864, + "grad_norm": 0.28317424269579244, + "learning_rate": 0.00029080189981083195, + "loss": 3.0518617630004883, + "step": 5338, + "token_acc": 0.2933875592185718 + }, + { + "epoch": 3.1295807681031955, + "grad_norm": 0.2773546387978314, + "learning_rate": 0.00029079688652778723, + "loss": 3.0442910194396973, + "step": 5339, + "token_acc": 0.2930961020210436 + }, + { + "epoch": 3.1301671064204046, + "grad_norm": 0.2861116317668745, + "learning_rate": 0.0002907918719221454, + "loss": 3.005298614501953, + "step": 5340, + "token_acc": 0.30018897169671105 + }, + { + "epoch": 3.1307534447376137, + "grad_norm": 0.29064232662099465, + "learning_rate": 0.00029078685599395374, + "loss": 3.017728567123413, + "step": 5341, + "token_acc": 0.2976612708255292 + }, + { + "epoch": 3.1313397830548224, + "grad_norm": 0.26764454260736853, + "learning_rate": 0.00029078183874325925, + "loss": 3.065646171569824, + "step": 5342, + "token_acc": 0.291617418831002 + }, + { + "epoch": 3.1319261213720315, + "grad_norm": 0.2583612376154264, + "learning_rate": 0.00029077682017010904, + "loss": 3.0032882690429688, + "step": 5343, + "token_acc": 0.29911191546254734 + }, + { + "epoch": 3.1325124596892406, + "grad_norm": 0.24419268896215868, + "learning_rate": 0.0002907718002745504, + "loss": 3.0323257446289062, + "step": 5344, + "token_acc": 0.2963721050588692 + }, + { + "epoch": 3.1330987980064497, + "grad_norm": 0.2692959304495686, + "learning_rate": 0.0002907667790566303, + "loss": 3.0436573028564453, + "step": 5345, + "token_acc": 0.2934602085894925 + }, + { + "epoch": 3.133685136323659, + "grad_norm": 0.2879031454054452, + "learning_rate": 0.0002907617565163961, + "loss": 3.056293487548828, + "step": 5346, + "token_acc": 0.291785800823843 + }, + { + "epoch": 3.134271474640868, + "grad_norm": 0.26252872156172335, + "learning_rate": 0.0002907567326538948, + "loss": 3.0544304847717285, + "step": 5347, + "token_acc": 0.29200598664642247 + }, + { + "epoch": 3.1348578129580766, + "grad_norm": 0.3001944020911754, + "learning_rate": 0.00029075170746917364, + "loss": 3.0910696983337402, + "step": 5348, + "token_acc": 0.28824532313146173 + }, + { + "epoch": 3.1354441512752858, + "grad_norm": 0.26686152772090627, + "learning_rate": 0.0002907466809622799, + "loss": 3.033529758453369, + "step": 5349, + "token_acc": 0.29546787343467 + }, + { + "epoch": 3.136030489592495, + "grad_norm": 0.26608855188672215, + "learning_rate": 0.00029074165313326076, + "loss": 3.018584966659546, + "step": 5350, + "token_acc": 0.29791833551720664 + }, + { + "epoch": 3.136616827909704, + "grad_norm": 0.31266874620828916, + "learning_rate": 0.0002907366239821634, + "loss": 2.984137773513794, + "step": 5351, + "token_acc": 0.3014846072999794 + }, + { + "epoch": 3.137203166226913, + "grad_norm": 0.2853613582308391, + "learning_rate": 0.0002907315935090351, + "loss": 3.0114269256591797, + "step": 5352, + "token_acc": 0.29845401389875087 + }, + { + "epoch": 3.1377895045441218, + "grad_norm": 0.26666826878229777, + "learning_rate": 0.00029072656171392315, + "loss": 3.0533699989318848, + "step": 5353, + "token_acc": 0.2928744930606829 + }, + { + "epoch": 3.138375842861331, + "grad_norm": 0.2514837645318631, + "learning_rate": 0.0002907215285968748, + "loss": 3.0496621131896973, + "step": 5354, + "token_acc": 0.2920772891936726 + }, + { + "epoch": 3.13896218117854, + "grad_norm": 0.31350385719612367, + "learning_rate": 0.0002907164941579373, + "loss": 3.0495119094848633, + "step": 5355, + "token_acc": 0.29239103796096844 + }, + { + "epoch": 3.139548519495749, + "grad_norm": 0.28469027868181157, + "learning_rate": 0.0002907114583971579, + "loss": 3.0614070892333984, + "step": 5356, + "token_acc": 0.2933501997336884 + }, + { + "epoch": 3.140134857812958, + "grad_norm": 0.26943323416109544, + "learning_rate": 0.00029070642131458403, + "loss": 3.0570504665374756, + "step": 5357, + "token_acc": 0.2939087709699989 + }, + { + "epoch": 3.1407211961301673, + "grad_norm": 0.31653381839963846, + "learning_rate": 0.0002907013829102629, + "loss": 3.0413155555725098, + "step": 5358, + "token_acc": 0.2930761388280819 + }, + { + "epoch": 3.141307534447376, + "grad_norm": 0.24777844895424422, + "learning_rate": 0.0002906963431842419, + "loss": 3.0301499366760254, + "step": 5359, + "token_acc": 0.2943010461051783 + }, + { + "epoch": 3.141893872764585, + "grad_norm": 0.2584934993644365, + "learning_rate": 0.0002906913021365683, + "loss": 3.0104033946990967, + "step": 5360, + "token_acc": 0.29835608790004847 + }, + { + "epoch": 3.142480211081794, + "grad_norm": 0.27211892418997763, + "learning_rate": 0.00029068625976728956, + "loss": 3.04984450340271, + "step": 5361, + "token_acc": 0.2933301036036718 + }, + { + "epoch": 3.1430665493990033, + "grad_norm": 0.3064652902989057, + "learning_rate": 0.00029068121607645294, + "loss": 3.0345640182495117, + "step": 5362, + "token_acc": 0.2956272379448095 + }, + { + "epoch": 3.1436528877162124, + "grad_norm": 0.28620528785672883, + "learning_rate": 0.00029067617106410593, + "loss": 3.050757646560669, + "step": 5363, + "token_acc": 0.2953637671599308 + }, + { + "epoch": 3.144239226033421, + "grad_norm": 0.25219912609311596, + "learning_rate": 0.00029067112473029575, + "loss": 3.0174055099487305, + "step": 5364, + "token_acc": 0.3003174721051557 + }, + { + "epoch": 3.14482556435063, + "grad_norm": 0.26622879607773675, + "learning_rate": 0.00029066607707507, + "loss": 3.0639500617980957, + "step": 5365, + "token_acc": 0.28998510419058093 + }, + { + "epoch": 3.1454119026678393, + "grad_norm": 0.26544682544027964, + "learning_rate": 0.00029066102809847597, + "loss": 3.0324273109436035, + "step": 5366, + "token_acc": 0.2955882390471597 + }, + { + "epoch": 3.1459982409850484, + "grad_norm": 0.2614754878789053, + "learning_rate": 0.0002906559778005612, + "loss": 3.0420727729797363, + "step": 5367, + "token_acc": 0.2936059087727026 + }, + { + "epoch": 3.1465845793022575, + "grad_norm": 0.27349963247134323, + "learning_rate": 0.00029065092618137296, + "loss": 3.0477142333984375, + "step": 5368, + "token_acc": 0.29385631100022963 + }, + { + "epoch": 3.1471709176194667, + "grad_norm": 0.2943206293677598, + "learning_rate": 0.0002906458732409588, + "loss": 3.0173418521881104, + "step": 5369, + "token_acc": 0.29720723033798574 + }, + { + "epoch": 3.1477572559366753, + "grad_norm": 0.27339741580279997, + "learning_rate": 0.0002906408189793662, + "loss": 3.0467071533203125, + "step": 5370, + "token_acc": 0.2931114446216219 + }, + { + "epoch": 3.1483435942538844, + "grad_norm": 0.31017195464839714, + "learning_rate": 0.0002906357633966426, + "loss": 2.979586601257324, + "step": 5371, + "token_acc": 0.3030140313005936 + }, + { + "epoch": 3.1489299325710935, + "grad_norm": 0.27546916774469465, + "learning_rate": 0.0002906307064928356, + "loss": 3.0591816902160645, + "step": 5372, + "token_acc": 0.29251704221576136 + }, + { + "epoch": 3.1495162708883027, + "grad_norm": 0.28983717663959646, + "learning_rate": 0.00029062564826799254, + "loss": 3.0315098762512207, + "step": 5373, + "token_acc": 0.29758352662620463 + }, + { + "epoch": 3.1501026092055118, + "grad_norm": 0.27898695138237106, + "learning_rate": 0.000290620588722161, + "loss": 3.0246715545654297, + "step": 5374, + "token_acc": 0.2970512762112999 + }, + { + "epoch": 3.1506889475227204, + "grad_norm": 0.2996184907547014, + "learning_rate": 0.00029061552785538856, + "loss": 3.080068588256836, + "step": 5375, + "token_acc": 0.28924052097068964 + }, + { + "epoch": 3.1512752858399296, + "grad_norm": 0.2409592820298718, + "learning_rate": 0.0002906104656677227, + "loss": 3.025758743286133, + "step": 5376, + "token_acc": 0.29551797253903517 + }, + { + "epoch": 3.1518616241571387, + "grad_norm": 0.29808990099867594, + "learning_rate": 0.000290605402159211, + "loss": 3.0626211166381836, + "step": 5377, + "token_acc": 0.2912986619416453 + }, + { + "epoch": 3.1524479624743478, + "grad_norm": 0.28497633646817105, + "learning_rate": 0.000290600337329901, + "loss": 3.028475046157837, + "step": 5378, + "token_acc": 0.2958579881656805 + }, + { + "epoch": 3.153034300791557, + "grad_norm": 0.25883661850915174, + "learning_rate": 0.0002905952711798403, + "loss": 3.07020902633667, + "step": 5379, + "token_acc": 0.28953010316039046 + }, + { + "epoch": 3.1536206391087656, + "grad_norm": 0.284355941957499, + "learning_rate": 0.00029059020370907643, + "loss": 3.069153308868408, + "step": 5380, + "token_acc": 0.2911276728114763 + }, + { + "epoch": 3.1542069774259747, + "grad_norm": 0.27600335514483687, + "learning_rate": 0.0002905851349176571, + "loss": 3.044420003890991, + "step": 5381, + "token_acc": 0.2924093051639241 + }, + { + "epoch": 3.154793315743184, + "grad_norm": 0.26752599137582783, + "learning_rate": 0.00029058006480562986, + "loss": 3.0292701721191406, + "step": 5382, + "token_acc": 0.2969613188951181 + }, + { + "epoch": 3.155379654060393, + "grad_norm": 0.24568780454718595, + "learning_rate": 0.00029057499337304234, + "loss": 3.0193278789520264, + "step": 5383, + "token_acc": 0.2964344116228852 + }, + { + "epoch": 3.155965992377602, + "grad_norm": 0.3021540054641473, + "learning_rate": 0.0002905699206199422, + "loss": 3.0743165016174316, + "step": 5384, + "token_acc": 0.2901781703056538 + }, + { + "epoch": 3.1565523306948107, + "grad_norm": 0.2697057503939792, + "learning_rate": 0.0002905648465463771, + "loss": 3.041522264480591, + "step": 5385, + "token_acc": 0.2935100479904019 + }, + { + "epoch": 3.15713866901202, + "grad_norm": 0.2567815942260553, + "learning_rate": 0.0002905597711523946, + "loss": 3.0842158794403076, + "step": 5386, + "token_acc": 0.28950353870481665 + }, + { + "epoch": 3.157725007329229, + "grad_norm": 0.27295455417085845, + "learning_rate": 0.0002905546944380425, + "loss": 3.037230968475342, + "step": 5387, + "token_acc": 0.2934696672894184 + }, + { + "epoch": 3.158311345646438, + "grad_norm": 0.2759351507126541, + "learning_rate": 0.0002905496164033685, + "loss": 3.048790693283081, + "step": 5388, + "token_acc": 0.29569811244737215 + }, + { + "epoch": 3.158897683963647, + "grad_norm": 0.27960265665746, + "learning_rate": 0.00029054453704842017, + "loss": 3.076842784881592, + "step": 5389, + "token_acc": 0.2905825828302772 + }, + { + "epoch": 3.1594840222808562, + "grad_norm": 0.2719123286955516, + "learning_rate": 0.0002905394563732453, + "loss": 3.0441150665283203, + "step": 5390, + "token_acc": 0.2953043123555314 + }, + { + "epoch": 3.160070360598065, + "grad_norm": 0.2515209794493845, + "learning_rate": 0.00029053437437789165, + "loss": 3.0664401054382324, + "step": 5391, + "token_acc": 0.29121228421927725 + }, + { + "epoch": 3.160656698915274, + "grad_norm": 0.2598801143217189, + "learning_rate": 0.00029052929106240696, + "loss": 3.0633704662323, + "step": 5392, + "token_acc": 0.2923338071658049 + }, + { + "epoch": 3.161243037232483, + "grad_norm": 0.27793707679257335, + "learning_rate": 0.0002905242064268389, + "loss": 3.051159381866455, + "step": 5393, + "token_acc": 0.2919332587274288 + }, + { + "epoch": 3.1618293755496922, + "grad_norm": 0.29984634921823927, + "learning_rate": 0.00029051912047123524, + "loss": 3.047536849975586, + "step": 5394, + "token_acc": 0.29249922868913447 + }, + { + "epoch": 3.1624157138669013, + "grad_norm": 0.30388645272033243, + "learning_rate": 0.0002905140331956439, + "loss": 3.006138801574707, + "step": 5395, + "token_acc": 0.29989969401608946 + }, + { + "epoch": 3.16300205218411, + "grad_norm": 0.2651067217519387, + "learning_rate": 0.00029050894460011246, + "loss": 3.0379276275634766, + "step": 5396, + "token_acc": 0.29439573390834667 + }, + { + "epoch": 3.163588390501319, + "grad_norm": 0.23287997301780414, + "learning_rate": 0.00029050385468468886, + "loss": 3.059835910797119, + "step": 5397, + "token_acc": 0.2907920284523585 + }, + { + "epoch": 3.1641747288185282, + "grad_norm": 0.25642931909211886, + "learning_rate": 0.0002904987634494209, + "loss": 3.0495285987854004, + "step": 5398, + "token_acc": 0.29227074692234545 + }, + { + "epoch": 3.1647610671357373, + "grad_norm": 0.23633083670497035, + "learning_rate": 0.00029049367089435636, + "loss": 3.036954879760742, + "step": 5399, + "token_acc": 0.2954503284723548 + }, + { + "epoch": 3.1653474054529465, + "grad_norm": 0.23063904805409077, + "learning_rate": 0.00029048857701954314, + "loss": 3.043638229370117, + "step": 5400, + "token_acc": 0.2952816803746493 + }, + { + "epoch": 3.1659337437701556, + "grad_norm": 0.23850557502820416, + "learning_rate": 0.000290483481825029, + "loss": 3.064352035522461, + "step": 5401, + "token_acc": 0.2901427234856072 + }, + { + "epoch": 3.1665200820873642, + "grad_norm": 0.2399768774216624, + "learning_rate": 0.0002904783853108619, + "loss": 3.0597896575927734, + "step": 5402, + "token_acc": 0.29016302918748355 + }, + { + "epoch": 3.1671064204045734, + "grad_norm": 0.2673562749099399, + "learning_rate": 0.0002904732874770896, + "loss": 3.0342419147491455, + "step": 5403, + "token_acc": 0.2955557066962903 + }, + { + "epoch": 3.1676927587217825, + "grad_norm": 0.25975422664460696, + "learning_rate": 0.00029046818832376007, + "loss": 3.0449600219726562, + "step": 5404, + "token_acc": 0.2935151967849515 + }, + { + "epoch": 3.1682790970389916, + "grad_norm": 0.28275483028053316, + "learning_rate": 0.0002904630878509212, + "loss": 3.016857862472534, + "step": 5405, + "token_acc": 0.2983555227984339 + }, + { + "epoch": 3.1688654353562007, + "grad_norm": 0.28010196531031606, + "learning_rate": 0.0002904579860586209, + "loss": 3.0383315086364746, + "step": 5406, + "token_acc": 0.2967374595522771 + }, + { + "epoch": 3.1694517736734094, + "grad_norm": 0.29218044547206484, + "learning_rate": 0.0002904528829469071, + "loss": 3.0445313453674316, + "step": 5407, + "token_acc": 0.2930165890263696 + }, + { + "epoch": 3.1700381119906185, + "grad_norm": 0.27411002913094284, + "learning_rate": 0.00029044777851582775, + "loss": 3.0499987602233887, + "step": 5408, + "token_acc": 0.29396396745948605 + }, + { + "epoch": 3.1706244503078276, + "grad_norm": 0.2552897715355726, + "learning_rate": 0.00029044267276543074, + "loss": 3.037863254547119, + "step": 5409, + "token_acc": 0.29415527263795643 + }, + { + "epoch": 3.1712107886250367, + "grad_norm": 0.2627861453647873, + "learning_rate": 0.0002904375656957641, + "loss": 3.0614705085754395, + "step": 5410, + "token_acc": 0.2910051576913095 + }, + { + "epoch": 3.171797126942246, + "grad_norm": 0.2720362615129391, + "learning_rate": 0.0002904324573068757, + "loss": 3.0795886516571045, + "step": 5411, + "token_acc": 0.2877877661955273 + }, + { + "epoch": 3.172383465259455, + "grad_norm": 0.3035982086941271, + "learning_rate": 0.00029042734759881367, + "loss": 3.0455641746520996, + "step": 5412, + "token_acc": 0.2945114098203888 + }, + { + "epoch": 3.1729698035766636, + "grad_norm": 0.2615378869609572, + "learning_rate": 0.00029042223657162593, + "loss": 3.029334783554077, + "step": 5413, + "token_acc": 0.295146862804927 + }, + { + "epoch": 3.1735561418938727, + "grad_norm": 0.27318306972811945, + "learning_rate": 0.0002904171242253605, + "loss": 3.0196731090545654, + "step": 5414, + "token_acc": 0.29711179776993996 + }, + { + "epoch": 3.174142480211082, + "grad_norm": 0.31264625516040356, + "learning_rate": 0.0002904120105600654, + "loss": 3.062580108642578, + "step": 5415, + "token_acc": 0.29244153848902227 + }, + { + "epoch": 3.174728818528291, + "grad_norm": 0.30064617982550573, + "learning_rate": 0.00029040689557578866, + "loss": 3.0004348754882812, + "step": 5416, + "token_acc": 0.3005659283191981 + }, + { + "epoch": 3.1753151568455, + "grad_norm": 0.2450276221308275, + "learning_rate": 0.0002904017792725783, + "loss": 3.0122342109680176, + "step": 5417, + "token_acc": 0.2982122969593814 + }, + { + "epoch": 3.1759014951627087, + "grad_norm": 0.2697012010480371, + "learning_rate": 0.00029039666165048245, + "loss": 3.0764012336730957, + "step": 5418, + "token_acc": 0.29055975238694787 + }, + { + "epoch": 3.176487833479918, + "grad_norm": 0.29496868354150235, + "learning_rate": 0.00029039154270954915, + "loss": 2.9907400608062744, + "step": 5419, + "token_acc": 0.3018730969479114 + }, + { + "epoch": 3.177074171797127, + "grad_norm": 0.29589947118490356, + "learning_rate": 0.0002903864224498265, + "loss": 3.052560329437256, + "step": 5420, + "token_acc": 0.29297849137087323 + }, + { + "epoch": 3.177660510114336, + "grad_norm": 0.2890579235434179, + "learning_rate": 0.0002903813008713626, + "loss": 3.0558905601501465, + "step": 5421, + "token_acc": 0.29126228680212907 + }, + { + "epoch": 3.178246848431545, + "grad_norm": 0.30856421629343994, + "learning_rate": 0.00029037617797420554, + "loss": 3.02246356010437, + "step": 5422, + "token_acc": 0.2970672563868496 + }, + { + "epoch": 3.1788331867487543, + "grad_norm": 0.28135281418437047, + "learning_rate": 0.00029037105375840337, + "loss": 3.046564817428589, + "step": 5423, + "token_acc": 0.29374221421187946 + }, + { + "epoch": 3.179419525065963, + "grad_norm": 0.2935846657523766, + "learning_rate": 0.0002903659282240044, + "loss": 3.0378315448760986, + "step": 5424, + "token_acc": 0.2947460167860734 + }, + { + "epoch": 3.180005863383172, + "grad_norm": 0.29372605040584476, + "learning_rate": 0.0002903608013710566, + "loss": 3.066006660461426, + "step": 5425, + "token_acc": 0.28993622728553686 + }, + { + "epoch": 3.180592201700381, + "grad_norm": 0.25132738097446183, + "learning_rate": 0.00029035567319960826, + "loss": 3.0172410011291504, + "step": 5426, + "token_acc": 0.29805499523360596 + }, + { + "epoch": 3.1811785400175903, + "grad_norm": 0.2637122367677759, + "learning_rate": 0.0002903505437097075, + "loss": 3.0304558277130127, + "step": 5427, + "token_acc": 0.29510546213279343 + }, + { + "epoch": 3.1817648783347994, + "grad_norm": 0.28551598582098087, + "learning_rate": 0.0002903454129014025, + "loss": 3.064781427383423, + "step": 5428, + "token_acc": 0.29104862337556275 + }, + { + "epoch": 3.182351216652008, + "grad_norm": 0.31102106935712576, + "learning_rate": 0.00029034028077474144, + "loss": 3.0240018367767334, + "step": 5429, + "token_acc": 0.29745801616364353 + }, + { + "epoch": 3.182937554969217, + "grad_norm": 0.28845413054449426, + "learning_rate": 0.00029033514732977253, + "loss": 3.067995071411133, + "step": 5430, + "token_acc": 0.290314409407719 + }, + { + "epoch": 3.1835238932864263, + "grad_norm": 0.306178067736288, + "learning_rate": 0.0002903300125665441, + "loss": 3.012289524078369, + "step": 5431, + "token_acc": 0.2994647201946472 + }, + { + "epoch": 3.1841102316036354, + "grad_norm": 0.287390763312986, + "learning_rate": 0.00029032487648510423, + "loss": 3.053600311279297, + "step": 5432, + "token_acc": 0.2925086156332522 + }, + { + "epoch": 3.1846965699208445, + "grad_norm": 0.3108739635655479, + "learning_rate": 0.0002903197390855013, + "loss": 3.027193546295166, + "step": 5433, + "token_acc": 0.2958935040251895 + }, + { + "epoch": 3.185282908238053, + "grad_norm": 0.268665878807294, + "learning_rate": 0.00029031460036778344, + "loss": 3.0519514083862305, + "step": 5434, + "token_acc": 0.293075507105753 + }, + { + "epoch": 3.1858692465552623, + "grad_norm": 0.26836511236589805, + "learning_rate": 0.000290309460331999, + "loss": 3.021183967590332, + "step": 5435, + "token_acc": 0.29670797326375314 + }, + { + "epoch": 3.1864555848724714, + "grad_norm": 0.2763591767790281, + "learning_rate": 0.00029030431897819625, + "loss": 3.0452303886413574, + "step": 5436, + "token_acc": 0.29407998517884826 + }, + { + "epoch": 3.1870419231896805, + "grad_norm": 0.25012342390917064, + "learning_rate": 0.0002902991763064235, + "loss": 3.0219974517822266, + "step": 5437, + "token_acc": 0.29691974763678647 + }, + { + "epoch": 3.1876282615068896, + "grad_norm": 0.30717661013167247, + "learning_rate": 0.00029029403231672907, + "loss": 3.011460542678833, + "step": 5438, + "token_acc": 0.2992396965047956 + }, + { + "epoch": 3.1882145998240983, + "grad_norm": 0.23360642491833097, + "learning_rate": 0.0002902888870091612, + "loss": 3.031067371368408, + "step": 5439, + "token_acc": 0.29602434767803176 + }, + { + "epoch": 3.1888009381413074, + "grad_norm": 0.23975529012701033, + "learning_rate": 0.0002902837403837683, + "loss": 3.069235324859619, + "step": 5440, + "token_acc": 0.2912979569819972 + }, + { + "epoch": 3.1893872764585165, + "grad_norm": 0.23044013977346534, + "learning_rate": 0.00029027859244059874, + "loss": 3.056037425994873, + "step": 5441, + "token_acc": 0.29164073392600404 + }, + { + "epoch": 3.1899736147757256, + "grad_norm": 0.2672831892400742, + "learning_rate": 0.00029027344317970075, + "loss": 3.0441417694091797, + "step": 5442, + "token_acc": 0.2949424672196949 + }, + { + "epoch": 3.1905599530929347, + "grad_norm": 0.27655170021198183, + "learning_rate": 0.00029026829260112285, + "loss": 3.0650463104248047, + "step": 5443, + "token_acc": 0.29039545691340385 + }, + { + "epoch": 3.191146291410144, + "grad_norm": 0.2797173201953544, + "learning_rate": 0.00029026314070491335, + "loss": 3.0708415508270264, + "step": 5444, + "token_acc": 0.29222022439776185 + }, + { + "epoch": 3.1917326297273525, + "grad_norm": 0.26943236834101214, + "learning_rate": 0.0002902579874911206, + "loss": 3.0334792137145996, + "step": 5445, + "token_acc": 0.294558319969939 + }, + { + "epoch": 3.1923189680445616, + "grad_norm": 0.2597255027702492, + "learning_rate": 0.00029025283295979306, + "loss": 3.0589942932128906, + "step": 5446, + "token_acc": 0.2920280548200062 + }, + { + "epoch": 3.1929053063617707, + "grad_norm": 0.2537377800917062, + "learning_rate": 0.0002902476771109792, + "loss": 3.097214460372925, + "step": 5447, + "token_acc": 0.2865499248910522 + }, + { + "epoch": 3.19349164467898, + "grad_norm": 0.2777055243287423, + "learning_rate": 0.0002902425199447273, + "loss": 3.0852646827697754, + "step": 5448, + "token_acc": 0.28888888888888886 + }, + { + "epoch": 3.194077982996189, + "grad_norm": 0.2823010273558741, + "learning_rate": 0.00029023736146108604, + "loss": 3.0226335525512695, + "step": 5449, + "token_acc": 0.2968397406443049 + }, + { + "epoch": 3.1946643213133976, + "grad_norm": 0.25337305922484077, + "learning_rate": 0.0002902322016601037, + "loss": 3.025752544403076, + "step": 5450, + "token_acc": 0.2961723064925337 + }, + { + "epoch": 3.1952506596306067, + "grad_norm": 0.257736341736192, + "learning_rate": 0.00029022704054182874, + "loss": 3.093207836151123, + "step": 5451, + "token_acc": 0.28886961111444986 + }, + { + "epoch": 3.195836997947816, + "grad_norm": 0.2662523226579173, + "learning_rate": 0.00029022187810630974, + "loss": 3.0289978981018066, + "step": 5452, + "token_acc": 0.2967314233605952 + }, + { + "epoch": 3.196423336265025, + "grad_norm": 0.2610753546482197, + "learning_rate": 0.0002902167143535951, + "loss": 2.9915199279785156, + "step": 5453, + "token_acc": 0.3000393079673 + }, + { + "epoch": 3.197009674582234, + "grad_norm": 0.27860994031028113, + "learning_rate": 0.00029021154928373337, + "loss": 3.00730562210083, + "step": 5454, + "token_acc": 0.2992911254408967 + }, + { + "epoch": 3.197596012899443, + "grad_norm": 0.3059160783393794, + "learning_rate": 0.0002902063828967731, + "loss": 3.032785654067993, + "step": 5455, + "token_acc": 0.2969263627077723 + }, + { + "epoch": 3.198182351216652, + "grad_norm": 0.2895615831262765, + "learning_rate": 0.00029020121519276283, + "loss": 3.027360439300537, + "step": 5456, + "token_acc": 0.2975511336278184 + }, + { + "epoch": 3.198768689533861, + "grad_norm": 0.27258474032298347, + "learning_rate": 0.000290196046171751, + "loss": 3.0187621116638184, + "step": 5457, + "token_acc": 0.29648406166194996 + }, + { + "epoch": 3.19935502785107, + "grad_norm": 0.2994915424496726, + "learning_rate": 0.00029019087583378626, + "loss": 3.021726131439209, + "step": 5458, + "token_acc": 0.2963132861730124 + }, + { + "epoch": 3.199941366168279, + "grad_norm": 0.23911046501444816, + "learning_rate": 0.0002901857041789172, + "loss": 2.97152042388916, + "step": 5459, + "token_acc": 0.30634925170739563 + }, + { + "epoch": 3.2005277044854883, + "grad_norm": 0.26510796259876795, + "learning_rate": 0.0002901805312071923, + "loss": 3.0717005729675293, + "step": 5460, + "token_acc": 0.29043326244928314 + }, + { + "epoch": 3.201114042802697, + "grad_norm": 0.2628454817882755, + "learning_rate": 0.0002901753569186602, + "loss": 3.0305166244506836, + "step": 5461, + "token_acc": 0.2949253421008174 + }, + { + "epoch": 3.201700381119906, + "grad_norm": 0.2499596130301154, + "learning_rate": 0.0002901701813133695, + "loss": 3.029402256011963, + "step": 5462, + "token_acc": 0.29593944475153466 + }, + { + "epoch": 3.202286719437115, + "grad_norm": 0.25851004799773886, + "learning_rate": 0.0002901650043913689, + "loss": 3.0686545372009277, + "step": 5463, + "token_acc": 0.2911158703197696 + }, + { + "epoch": 3.2028730577543243, + "grad_norm": 0.24269600233604152, + "learning_rate": 0.00029015982615270686, + "loss": 2.9989731311798096, + "step": 5464, + "token_acc": 0.30219385449025604 + }, + { + "epoch": 3.2034593960715334, + "grad_norm": 0.27809831697534787, + "learning_rate": 0.00029015464659743216, + "loss": 3.047905445098877, + "step": 5465, + "token_acc": 0.2939025893113883 + }, + { + "epoch": 3.2040457343887425, + "grad_norm": 0.260624470803214, + "learning_rate": 0.00029014946572559347, + "loss": 3.066075325012207, + "step": 5466, + "token_acc": 0.2907085334968271 + }, + { + "epoch": 3.204632072705951, + "grad_norm": 0.3117444724017944, + "learning_rate": 0.00029014428353723936, + "loss": 3.0640780925750732, + "step": 5467, + "token_acc": 0.29116160313612216 + }, + { + "epoch": 3.2052184110231603, + "grad_norm": 0.28977588080943023, + "learning_rate": 0.0002901391000324185, + "loss": 3.0611000061035156, + "step": 5468, + "token_acc": 0.29149763536966333 + }, + { + "epoch": 3.2058047493403694, + "grad_norm": 0.256657011574282, + "learning_rate": 0.0002901339152111797, + "loss": 3.0344057083129883, + "step": 5469, + "token_acc": 0.2934286188785784 + }, + { + "epoch": 3.2063910876575785, + "grad_norm": 0.2521046874005232, + "learning_rate": 0.0002901287290735716, + "loss": 3.0613627433776855, + "step": 5470, + "token_acc": 0.291746213311208 + }, + { + "epoch": 3.2069774259747876, + "grad_norm": 0.2914426637231044, + "learning_rate": 0.0002901235416196429, + "loss": 3.0585129261016846, + "step": 5471, + "token_acc": 0.29366238433871295 + }, + { + "epoch": 3.2075637642919963, + "grad_norm": 0.26625924404058815, + "learning_rate": 0.00029011835284944233, + "loss": 3.0369839668273926, + "step": 5472, + "token_acc": 0.2958315352959131 + }, + { + "epoch": 3.2081501026092054, + "grad_norm": 0.24497887263317883, + "learning_rate": 0.00029011316276301866, + "loss": 3.039714813232422, + "step": 5473, + "token_acc": 0.294254735208495 + }, + { + "epoch": 3.2087364409264145, + "grad_norm": 0.28455192960044107, + "learning_rate": 0.00029010797136042065, + "loss": 3.0285723209381104, + "step": 5474, + "token_acc": 0.295212102267205 + }, + { + "epoch": 3.2093227792436236, + "grad_norm": 0.24803596456015764, + "learning_rate": 0.00029010277864169705, + "loss": 3.0835280418395996, + "step": 5475, + "token_acc": 0.2904488575746799 + }, + { + "epoch": 3.2099091175608327, + "grad_norm": 0.2714586783911179, + "learning_rate": 0.0002900975846068966, + "loss": 3.069371223449707, + "step": 5476, + "token_acc": 0.29053620045097706 + }, + { + "epoch": 3.210495455878042, + "grad_norm": 0.292598993112876, + "learning_rate": 0.0002900923892560682, + "loss": 3.039937973022461, + "step": 5477, + "token_acc": 0.2941823537655648 + }, + { + "epoch": 3.2110817941952505, + "grad_norm": 0.24353582760372317, + "learning_rate": 0.0002900871925892605, + "loss": 3.0645387172698975, + "step": 5478, + "token_acc": 0.294048816591178 + }, + { + "epoch": 3.2116681325124596, + "grad_norm": 0.26844071423364974, + "learning_rate": 0.00029008199460652244, + "loss": 3.0633792877197266, + "step": 5479, + "token_acc": 0.2911857414062841 + }, + { + "epoch": 3.2122544708296688, + "grad_norm": 0.2572926056219949, + "learning_rate": 0.00029007679530790277, + "loss": 3.0901787281036377, + "step": 5480, + "token_acc": 0.2859894814896913 + }, + { + "epoch": 3.212840809146878, + "grad_norm": 0.24949414812310355, + "learning_rate": 0.00029007159469345034, + "loss": 2.985351324081421, + "step": 5481, + "token_acc": 0.30219071815901666 + }, + { + "epoch": 3.213427147464087, + "grad_norm": 0.2637921694832076, + "learning_rate": 0.00029006639276321405, + "loss": 3.0297322273254395, + "step": 5482, + "token_acc": 0.29572955205704937 + }, + { + "epoch": 3.2140134857812956, + "grad_norm": 0.23107870206125952, + "learning_rate": 0.00029006118951724276, + "loss": 3.001009941101074, + "step": 5483, + "token_acc": 0.3009738023672367 + }, + { + "epoch": 3.2145998240985048, + "grad_norm": 0.28099929466254187, + "learning_rate": 0.00029005598495558535, + "loss": 3.0236196517944336, + "step": 5484, + "token_acc": 0.29652444870565675 + }, + { + "epoch": 3.215186162415714, + "grad_norm": 0.24376113620221973, + "learning_rate": 0.0002900507790782906, + "loss": 3.0315070152282715, + "step": 5485, + "token_acc": 0.2956924215271346 + }, + { + "epoch": 3.215772500732923, + "grad_norm": 0.27216546822972554, + "learning_rate": 0.0002900455718854076, + "loss": 3.0448055267333984, + "step": 5486, + "token_acc": 0.29287998315975183 + }, + { + "epoch": 3.216358839050132, + "grad_norm": 0.2722622024930363, + "learning_rate": 0.00029004036337698517, + "loss": 3.0293242931365967, + "step": 5487, + "token_acc": 0.2974669603524229 + }, + { + "epoch": 3.2169451773673408, + "grad_norm": 0.3047412288158636, + "learning_rate": 0.0002900351535530722, + "loss": 3.025803327560425, + "step": 5488, + "token_acc": 0.2983778335469446 + }, + { + "epoch": 3.21753151568455, + "grad_norm": 0.29046116637755764, + "learning_rate": 0.0002900299424137176, + "loss": 3.0587403774261475, + "step": 5489, + "token_acc": 0.2922288258800744 + }, + { + "epoch": 3.218117854001759, + "grad_norm": 0.31047108976155163, + "learning_rate": 0.0002900247299589705, + "loss": 3.032149314880371, + "step": 5490, + "token_acc": 0.29482897586374723 + }, + { + "epoch": 3.218704192318968, + "grad_norm": 0.2783179430004287, + "learning_rate": 0.00029001951618887965, + "loss": 3.0139455795288086, + "step": 5491, + "token_acc": 0.2985564744199761 + }, + { + "epoch": 3.219290530636177, + "grad_norm": 0.2880944611656997, + "learning_rate": 0.0002900143011034942, + "loss": 3.03299617767334, + "step": 5492, + "token_acc": 0.29511205589875145 + }, + { + "epoch": 3.219876868953386, + "grad_norm": 0.2628970147939955, + "learning_rate": 0.000290009084702863, + "loss": 3.0326945781707764, + "step": 5493, + "token_acc": 0.2954385532391357 + }, + { + "epoch": 3.220463207270595, + "grad_norm": 0.27945003544122965, + "learning_rate": 0.0002900038669870351, + "loss": 3.070260763168335, + "step": 5494, + "token_acc": 0.2910573232243124 + }, + { + "epoch": 3.221049545587804, + "grad_norm": 0.3093505490129998, + "learning_rate": 0.0002899986479560596, + "loss": 3.0959115028381348, + "step": 5495, + "token_acc": 0.28653825169265623 + }, + { + "epoch": 3.221635883905013, + "grad_norm": 0.31131706269182524, + "learning_rate": 0.0002899934276099854, + "loss": 3.0336928367614746, + "step": 5496, + "token_acc": 0.29768335646418276 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 0.2406284341023306, + "learning_rate": 0.0002899882059488616, + "loss": 3.0404491424560547, + "step": 5497, + "token_acc": 0.2937860835318438 + }, + { + "epoch": 3.2228085605394314, + "grad_norm": 0.30754568072435984, + "learning_rate": 0.0002899829829727373, + "loss": 3.0383620262145996, + "step": 5498, + "token_acc": 0.2954915519465415 + }, + { + "epoch": 3.22339489885664, + "grad_norm": 0.28308920880654004, + "learning_rate": 0.0002899777586816614, + "loss": 3.0325679779052734, + "step": 5499, + "token_acc": 0.29582267007890806 + }, + { + "epoch": 3.223981237173849, + "grad_norm": 0.2660949500121972, + "learning_rate": 0.00028997253307568315, + "loss": 3.052431344985962, + "step": 5500, + "token_acc": 0.2922578679049395 + }, + { + "epoch": 3.2245675754910583, + "grad_norm": 0.26247052812578126, + "learning_rate": 0.00028996730615485155, + "loss": 3.0427212715148926, + "step": 5501, + "token_acc": 0.2967759821524314 + }, + { + "epoch": 3.2251539138082674, + "grad_norm": 0.2831839948771978, + "learning_rate": 0.00028996207791921573, + "loss": 2.996189832687378, + "step": 5502, + "token_acc": 0.30126247144926516 + }, + { + "epoch": 3.2257402521254765, + "grad_norm": 0.2646212788919643, + "learning_rate": 0.0002899568483688248, + "loss": 3.0375609397888184, + "step": 5503, + "token_acc": 0.29427728643496476 + }, + { + "epoch": 3.226326590442685, + "grad_norm": 0.2601418899392246, + "learning_rate": 0.00028995161750372783, + "loss": 3.006164789199829, + "step": 5504, + "token_acc": 0.30018796543980997 + }, + { + "epoch": 3.2269129287598943, + "grad_norm": 0.2834035066983693, + "learning_rate": 0.00028994638532397403, + "loss": 3.057257890701294, + "step": 5505, + "token_acc": 0.2908360654009551 + }, + { + "epoch": 3.2274992670771034, + "grad_norm": 0.2655036459575535, + "learning_rate": 0.0002899411518296125, + "loss": 3.057617664337158, + "step": 5506, + "token_acc": 0.29160726287704436 + }, + { + "epoch": 3.2280856053943126, + "grad_norm": 0.26957184877541746, + "learning_rate": 0.0002899359170206924, + "loss": 2.9997286796569824, + "step": 5507, + "token_acc": 0.30154123171014746 + }, + { + "epoch": 3.2286719437115217, + "grad_norm": 0.2574211443116237, + "learning_rate": 0.000289930680897263, + "loss": 3.0693721771240234, + "step": 5508, + "token_acc": 0.2887575555452144 + }, + { + "epoch": 3.2292582820287308, + "grad_norm": 0.27231880374428336, + "learning_rate": 0.00028992544345937335, + "loss": 3.0254197120666504, + "step": 5509, + "token_acc": 0.29659363547681195 + }, + { + "epoch": 3.2298446203459394, + "grad_norm": 0.25761703798086216, + "learning_rate": 0.0002899202047070728, + "loss": 3.0041403770446777, + "step": 5510, + "token_acc": 0.3003474624278513 + }, + { + "epoch": 3.2304309586631486, + "grad_norm": 0.2710049634393407, + "learning_rate": 0.00028991496464041036, + "loss": 3.0407609939575195, + "step": 5511, + "token_acc": 0.29314395153252404 + }, + { + "epoch": 3.2310172969803577, + "grad_norm": 0.29333998688281687, + "learning_rate": 0.00028990972325943545, + "loss": 3.0918593406677246, + "step": 5512, + "token_acc": 0.2858426824748112 + }, + { + "epoch": 3.231603635297567, + "grad_norm": 0.2216722457733159, + "learning_rate": 0.00028990448056419717, + "loss": 3.0777058601379395, + "step": 5513, + "token_acc": 0.29001384199950075 + }, + { + "epoch": 3.232189973614776, + "grad_norm": 0.2759114810575503, + "learning_rate": 0.0002898992365547448, + "loss": 3.0397377014160156, + "step": 5514, + "token_acc": 0.29421461305122365 + }, + { + "epoch": 3.2327763119319846, + "grad_norm": 0.3048934127116464, + "learning_rate": 0.00028989399123112767, + "loss": 3.0330615043640137, + "step": 5515, + "token_acc": 0.2953765632658965 + }, + { + "epoch": 3.2333626502491937, + "grad_norm": 0.2706030388281219, + "learning_rate": 0.00028988874459339494, + "loss": 3.0417191982269287, + "step": 5516, + "token_acc": 0.29431646381201404 + }, + { + "epoch": 3.233948988566403, + "grad_norm": 0.2290258591512697, + "learning_rate": 0.000289883496641596, + "loss": 3.0254323482513428, + "step": 5517, + "token_acc": 0.2964872763335584 + }, + { + "epoch": 3.234535326883612, + "grad_norm": 0.24435024690518656, + "learning_rate": 0.00028987824737578016, + "loss": 3.0329482555389404, + "step": 5518, + "token_acc": 0.2959781621559375 + }, + { + "epoch": 3.235121665200821, + "grad_norm": 0.24804368325070508, + "learning_rate": 0.0002898729967959966, + "loss": 3.07688570022583, + "step": 5519, + "token_acc": 0.29049422728988505 + }, + { + "epoch": 3.23570800351803, + "grad_norm": 0.26034392240820975, + "learning_rate": 0.0002898677449022947, + "loss": 3.0242600440979004, + "step": 5520, + "token_acc": 0.295516451760309 + }, + { + "epoch": 3.236294341835239, + "grad_norm": 0.24695017229667232, + "learning_rate": 0.00028986249169472383, + "loss": 3.0386528968811035, + "step": 5521, + "token_acc": 0.29523492794863476 + }, + { + "epoch": 3.236880680152448, + "grad_norm": 0.271305842674611, + "learning_rate": 0.00028985723717333335, + "loss": 3.007998466491699, + "step": 5522, + "token_acc": 0.2995216234769178 + }, + { + "epoch": 3.237467018469657, + "grad_norm": 0.274615075010814, + "learning_rate": 0.00028985198133817255, + "loss": 3.0543458461761475, + "step": 5523, + "token_acc": 0.29289545472516176 + }, + { + "epoch": 3.238053356786866, + "grad_norm": 0.25477569345762213, + "learning_rate": 0.00028984672418929085, + "loss": 3.0879950523376465, + "step": 5524, + "token_acc": 0.2869741424354224 + }, + { + "epoch": 3.2386396951040752, + "grad_norm": 0.2529910242160972, + "learning_rate": 0.00028984146572673766, + "loss": 3.075887680053711, + "step": 5525, + "token_acc": 0.28980619528308427 + }, + { + "epoch": 3.239226033421284, + "grad_norm": 0.23379481591210036, + "learning_rate": 0.0002898362059505623, + "loss": 3.0306029319763184, + "step": 5526, + "token_acc": 0.29739750207475274 + }, + { + "epoch": 3.239812371738493, + "grad_norm": 0.2437474662186198, + "learning_rate": 0.00028983094486081425, + "loss": 3.070091724395752, + "step": 5527, + "token_acc": 0.2905064164841261 + }, + { + "epoch": 3.240398710055702, + "grad_norm": 0.2592720276562673, + "learning_rate": 0.00028982568245754285, + "loss": 3.0497727394104004, + "step": 5528, + "token_acc": 0.2925194424991707 + }, + { + "epoch": 3.2409850483729112, + "grad_norm": 0.2560044488766083, + "learning_rate": 0.0002898204187407976, + "loss": 3.033024787902832, + "step": 5529, + "token_acc": 0.29534485608997774 + }, + { + "epoch": 3.2415713866901203, + "grad_norm": 0.2775353210367765, + "learning_rate": 0.0002898151537106279, + "loss": 3.045614719390869, + "step": 5530, + "token_acc": 0.29309818660755876 + }, + { + "epoch": 3.2421577250073295, + "grad_norm": 0.2650649484386012, + "learning_rate": 0.00028980988736708327, + "loss": 2.987740993499756, + "step": 5531, + "token_acc": 0.3014742560943871 + }, + { + "epoch": 3.242744063324538, + "grad_norm": 0.281087962261792, + "learning_rate": 0.00028980461971021316, + "loss": 3.020624876022339, + "step": 5532, + "token_acc": 0.29760514469453375 + }, + { + "epoch": 3.2433304016417472, + "grad_norm": 0.22718387171114135, + "learning_rate": 0.000289799350740067, + "loss": 3.0307769775390625, + "step": 5533, + "token_acc": 0.2953111236984008 + }, + { + "epoch": 3.2439167399589564, + "grad_norm": 0.2743611903198156, + "learning_rate": 0.0002897940804566943, + "loss": 3.0270633697509766, + "step": 5534, + "token_acc": 0.2978946998738387 + }, + { + "epoch": 3.2445030782761655, + "grad_norm": 0.22540439950899302, + "learning_rate": 0.00028978880886014463, + "loss": 3.0013480186462402, + "step": 5535, + "token_acc": 0.30148696484866744 + }, + { + "epoch": 3.2450894165933746, + "grad_norm": 0.26447640159739955, + "learning_rate": 0.00028978353595046744, + "loss": 3.0681824684143066, + "step": 5536, + "token_acc": 0.29132485696776816 + }, + { + "epoch": 3.2456757549105832, + "grad_norm": 0.24351191133582292, + "learning_rate": 0.00028977826172771234, + "loss": 3.006533622741699, + "step": 5537, + "token_acc": 0.2985869881468925 + }, + { + "epoch": 3.2462620932277924, + "grad_norm": 0.28358950771167235, + "learning_rate": 0.0002897729861919288, + "loss": 3.02524995803833, + "step": 5538, + "token_acc": 0.29631564091809576 + }, + { + "epoch": 3.2468484315450015, + "grad_norm": 0.23711071388562208, + "learning_rate": 0.0002897677093431664, + "loss": 3.040681838989258, + "step": 5539, + "token_acc": 0.29385430768185405 + }, + { + "epoch": 3.2474347698622106, + "grad_norm": 0.29781931821894136, + "learning_rate": 0.0002897624311814747, + "loss": 3.041667938232422, + "step": 5540, + "token_acc": 0.29417273374670344 + }, + { + "epoch": 3.2480211081794197, + "grad_norm": 0.2459966849553532, + "learning_rate": 0.0002897571517069033, + "loss": 3.0244832038879395, + "step": 5541, + "token_acc": 0.29763461125248847 + }, + { + "epoch": 3.2486074464966284, + "grad_norm": 0.2511737806433179, + "learning_rate": 0.0002897518709195018, + "loss": 3.0318479537963867, + "step": 5542, + "token_acc": 0.2945540056932642 + }, + { + "epoch": 3.2491937848138375, + "grad_norm": 0.23754696734060896, + "learning_rate": 0.00028974658881931976, + "loss": 3.0464601516723633, + "step": 5543, + "token_acc": 0.29489593111368656 + }, + { + "epoch": 3.2497801231310466, + "grad_norm": 0.2585922764025102, + "learning_rate": 0.00028974130540640686, + "loss": 3.0354185104370117, + "step": 5544, + "token_acc": 0.29367609094746 + }, + { + "epoch": 3.2503664614482557, + "grad_norm": 0.2545077543639786, + "learning_rate": 0.00028973602068081266, + "loss": 3.0270252227783203, + "step": 5545, + "token_acc": 0.2954390108326485 + }, + { + "epoch": 3.250952799765465, + "grad_norm": 0.27002395286555086, + "learning_rate": 0.00028973073464258687, + "loss": 3.019052505493164, + "step": 5546, + "token_acc": 0.2970866500630819 + }, + { + "epoch": 3.2515391380826735, + "grad_norm": 0.24398926337427476, + "learning_rate": 0.00028972544729177914, + "loss": 3.0828499794006348, + "step": 5547, + "token_acc": 0.28769827047728275 + }, + { + "epoch": 3.2521254763998826, + "grad_norm": 0.2774191268910143, + "learning_rate": 0.0002897201586284391, + "loss": 3.0793557167053223, + "step": 5548, + "token_acc": 0.28889049938657296 + }, + { + "epoch": 3.2527118147170917, + "grad_norm": 0.2708361708467026, + "learning_rate": 0.0002897148686526164, + "loss": 3.0516977310180664, + "step": 5549, + "token_acc": 0.29326024795350614 + }, + { + "epoch": 3.253298153034301, + "grad_norm": 0.2741728670422611, + "learning_rate": 0.00028970957736436083, + "loss": 3.055417537689209, + "step": 5550, + "token_acc": 0.2926756195632178 + }, + { + "epoch": 3.25388449135151, + "grad_norm": 0.2424436953279273, + "learning_rate": 0.000289704284763722, + "loss": 3.04716420173645, + "step": 5551, + "token_acc": 0.29375208051851415 + }, + { + "epoch": 3.254470829668719, + "grad_norm": 0.27167684775524564, + "learning_rate": 0.00028969899085074967, + "loss": 3.092989683151245, + "step": 5552, + "token_acc": 0.28718912283524645 + }, + { + "epoch": 3.2550571679859277, + "grad_norm": 0.23928769940495204, + "learning_rate": 0.0002896936956254936, + "loss": 3.0554897785186768, + "step": 5553, + "token_acc": 0.29284548078429784 + }, + { + "epoch": 3.255643506303137, + "grad_norm": 0.257124270395592, + "learning_rate": 0.0002896883990880035, + "loss": 3.0346858501434326, + "step": 5554, + "token_acc": 0.2950895270807493 + }, + { + "epoch": 3.256229844620346, + "grad_norm": 0.25320612043741525, + "learning_rate": 0.00028968310123832913, + "loss": 3.0236854553222656, + "step": 5555, + "token_acc": 0.29637094695602867 + }, + { + "epoch": 3.256816182937555, + "grad_norm": 0.2656462721762266, + "learning_rate": 0.00028967780207652023, + "loss": 3.004427194595337, + "step": 5556, + "token_acc": 0.30014598156160816 + }, + { + "epoch": 3.257402521254764, + "grad_norm": 0.26252434158149857, + "learning_rate": 0.00028967250160262656, + "loss": 3.064389944076538, + "step": 5557, + "token_acc": 0.2903170495158793 + }, + { + "epoch": 3.257988859571973, + "grad_norm": 0.2348602538895089, + "learning_rate": 0.00028966719981669804, + "loss": 3.030764579772949, + "step": 5558, + "token_acc": 0.2958498852983374 + }, + { + "epoch": 3.258575197889182, + "grad_norm": 0.28693503524229624, + "learning_rate": 0.00028966189671878427, + "loss": 3.0904502868652344, + "step": 5559, + "token_acc": 0.286621665125168 + }, + { + "epoch": 3.259161536206391, + "grad_norm": 0.28046144791863115, + "learning_rate": 0.00028965659230893525, + "loss": 3.0459375381469727, + "step": 5560, + "token_acc": 0.2930299281408755 + }, + { + "epoch": 3.2597478745236, + "grad_norm": 0.2572314419433262, + "learning_rate": 0.00028965128658720073, + "loss": 3.0727224349975586, + "step": 5561, + "token_acc": 0.2887826743320374 + }, + { + "epoch": 3.2603342128408093, + "grad_norm": 0.30634122138218156, + "learning_rate": 0.00028964597955363053, + "loss": 3.035705089569092, + "step": 5562, + "token_acc": 0.2960697871720268 + }, + { + "epoch": 3.2609205511580184, + "grad_norm": 0.2624728149742603, + "learning_rate": 0.00028964067120827453, + "loss": 3.011590003967285, + "step": 5563, + "token_acc": 0.2971310870865394 + }, + { + "epoch": 3.261506889475227, + "grad_norm": 0.27718316656252945, + "learning_rate": 0.0002896353615511826, + "loss": 3.04506254196167, + "step": 5564, + "token_acc": 0.2934774866446197 + }, + { + "epoch": 3.262093227792436, + "grad_norm": 0.2637743918125357, + "learning_rate": 0.00028963005058240467, + "loss": 3.0343332290649414, + "step": 5565, + "token_acc": 0.297028400180455 + }, + { + "epoch": 3.2626795661096453, + "grad_norm": 0.3095602695595955, + "learning_rate": 0.0002896247383019905, + "loss": 3.054948091506958, + "step": 5566, + "token_acc": 0.2911274521344917 + }, + { + "epoch": 3.2632659044268544, + "grad_norm": 0.27837295114862903, + "learning_rate": 0.00028961942470999007, + "loss": 3.0525155067443848, + "step": 5567, + "token_acc": 0.29335435134634336 + }, + { + "epoch": 3.2638522427440635, + "grad_norm": 0.28051399073654887, + "learning_rate": 0.00028961410980645326, + "loss": 3.053393840789795, + "step": 5568, + "token_acc": 0.29285749710911746 + }, + { + "epoch": 3.264438581061272, + "grad_norm": 0.23268628511472414, + "learning_rate": 0.0002896087935914301, + "loss": 2.985903739929199, + "step": 5569, + "token_acc": 0.3038945289862849 + }, + { + "epoch": 3.2650249193784813, + "grad_norm": 0.2797353206589393, + "learning_rate": 0.00028960347606497036, + "loss": 3.046079397201538, + "step": 5570, + "token_acc": 0.29326629660844544 + }, + { + "epoch": 3.2656112576956904, + "grad_norm": 0.2646331614797058, + "learning_rate": 0.0002895981572271241, + "loss": 3.0570311546325684, + "step": 5571, + "token_acc": 0.29305243627272415 + }, + { + "epoch": 3.2661975960128995, + "grad_norm": 0.2640157528372407, + "learning_rate": 0.0002895928370779413, + "loss": 3.024941921234131, + "step": 5572, + "token_acc": 0.2965845523985059 + }, + { + "epoch": 3.2667839343301086, + "grad_norm": 0.2630308935397913, + "learning_rate": 0.0002895875156174719, + "loss": 3.022364616394043, + "step": 5573, + "token_acc": 0.2972293026101142 + }, + { + "epoch": 3.2673702726473177, + "grad_norm": 0.2723542224230714, + "learning_rate": 0.0002895821928457658, + "loss": 3.044461250305176, + "step": 5574, + "token_acc": 0.29613990886327934 + }, + { + "epoch": 3.2679566109645264, + "grad_norm": 0.2635002700485669, + "learning_rate": 0.0002895768687628732, + "loss": 3.0357108116149902, + "step": 5575, + "token_acc": 0.29512819366225895 + }, + { + "epoch": 3.2685429492817355, + "grad_norm": 0.2924223532968938, + "learning_rate": 0.000289571543368844, + "loss": 3.005058765411377, + "step": 5576, + "token_acc": 0.2988306364181874 + }, + { + "epoch": 3.2691292875989446, + "grad_norm": 0.25805775222937916, + "learning_rate": 0.00028956621666372814, + "loss": 3.038538694381714, + "step": 5577, + "token_acc": 0.294988928238396 + }, + { + "epoch": 3.2697156259161537, + "grad_norm": 0.27697048000392976, + "learning_rate": 0.0002895608886475758, + "loss": 3.1012697219848633, + "step": 5578, + "token_acc": 0.28552491837062766 + }, + { + "epoch": 3.270301964233363, + "grad_norm": 0.2689404664518939, + "learning_rate": 0.0002895555593204369, + "loss": 3.017056941986084, + "step": 5579, + "token_acc": 0.2976993091689755 + }, + { + "epoch": 3.2708883025505715, + "grad_norm": 0.31085951903930686, + "learning_rate": 0.00028955022868236164, + "loss": 3.0249881744384766, + "step": 5580, + "token_acc": 0.29720399001132597 + }, + { + "epoch": 3.2714746408677806, + "grad_norm": 0.2792005042723576, + "learning_rate": 0.0002895448967334, + "loss": 3.0462915897369385, + "step": 5581, + "token_acc": 0.29270653548912456 + }, + { + "epoch": 3.2720609791849897, + "grad_norm": 0.2501653093979064, + "learning_rate": 0.00028953956347360215, + "loss": 3.0420777797698975, + "step": 5582, + "token_acc": 0.29484552454036445 + }, + { + "epoch": 3.272647317502199, + "grad_norm": 0.27821523751966487, + "learning_rate": 0.0002895342289030181, + "loss": 3.036561965942383, + "step": 5583, + "token_acc": 0.29312528522861875 + }, + { + "epoch": 3.273233655819408, + "grad_norm": 0.2752180999282037, + "learning_rate": 0.000289528893021698, + "loss": 3.055999279022217, + "step": 5584, + "token_acc": 0.292750034039208 + }, + { + "epoch": 3.273819994136617, + "grad_norm": 0.2744202188018516, + "learning_rate": 0.0002895235558296919, + "loss": 3.059241771697998, + "step": 5585, + "token_acc": 0.2918350234453967 + }, + { + "epoch": 3.2744063324538257, + "grad_norm": 0.2970213841644865, + "learning_rate": 0.0002895182173270501, + "loss": 3.1046924591064453, + "step": 5586, + "token_acc": 0.2845886990801577 + }, + { + "epoch": 3.274992670771035, + "grad_norm": 0.24965215454831652, + "learning_rate": 0.00028951287751382264, + "loss": 3.068326711654663, + "step": 5587, + "token_acc": 0.2902764174699412 + }, + { + "epoch": 3.275579009088244, + "grad_norm": 0.2533446042709383, + "learning_rate": 0.00028950753639005964, + "loss": 3.054434299468994, + "step": 5588, + "token_acc": 0.29235562395600684 + }, + { + "epoch": 3.276165347405453, + "grad_norm": 0.2695192757253377, + "learning_rate": 0.00028950219395581134, + "loss": 3.0146186351776123, + "step": 5589, + "token_acc": 0.298695140252889 + }, + { + "epoch": 3.2767516857226617, + "grad_norm": 0.2691507038867357, + "learning_rate": 0.0002894968502111279, + "loss": 3.0028724670410156, + "step": 5590, + "token_acc": 0.2985304901607761 + }, + { + "epoch": 3.277338024039871, + "grad_norm": 0.22248586525319197, + "learning_rate": 0.0002894915051560595, + "loss": 3.0288336277008057, + "step": 5591, + "token_acc": 0.29625174906871216 + }, + { + "epoch": 3.27792436235708, + "grad_norm": 0.2708954300957032, + "learning_rate": 0.00028948615879065645, + "loss": 3.076141595840454, + "step": 5592, + "token_acc": 0.2899638169188918 + }, + { + "epoch": 3.278510700674289, + "grad_norm": 0.23599682238587283, + "learning_rate": 0.00028948081111496886, + "loss": 3.0233476161956787, + "step": 5593, + "token_acc": 0.2954615669797331 + }, + { + "epoch": 3.279097038991498, + "grad_norm": 0.23974637781131902, + "learning_rate": 0.000289475462129047, + "loss": 3.0459351539611816, + "step": 5594, + "token_acc": 0.29246231951781654 + }, + { + "epoch": 3.2796833773087073, + "grad_norm": 0.26834137358474613, + "learning_rate": 0.00028947011183294113, + "loss": 3.073460817337036, + "step": 5595, + "token_acc": 0.28783520050310935 + }, + { + "epoch": 3.280269715625916, + "grad_norm": 0.2737592203929691, + "learning_rate": 0.0002894647602267015, + "loss": 3.068303108215332, + "step": 5596, + "token_acc": 0.2910934723121481 + }, + { + "epoch": 3.280856053943125, + "grad_norm": 0.26018866374354915, + "learning_rate": 0.0002894594073103784, + "loss": 3.0626487731933594, + "step": 5597, + "token_acc": 0.29178409702650493 + }, + { + "epoch": 3.281442392260334, + "grad_norm": 0.27486993161148887, + "learning_rate": 0.00028945405308402207, + "loss": 3.0917370319366455, + "step": 5598, + "token_acc": 0.2868140591044543 + }, + { + "epoch": 3.2820287305775433, + "grad_norm": 0.2988141988285752, + "learning_rate": 0.0002894486975476828, + "loss": 3.022247314453125, + "step": 5599, + "token_acc": 0.297442706900565 + }, + { + "epoch": 3.2826150688947524, + "grad_norm": 0.27213499692012894, + "learning_rate": 0.000289443340701411, + "loss": 3.024195909500122, + "step": 5600, + "token_acc": 0.2967905771924984 + }, + { + "epoch": 3.283201407211961, + "grad_norm": 0.24738533843385893, + "learning_rate": 0.0002894379825452568, + "loss": 3.067525625228882, + "step": 5601, + "token_acc": 0.28963711902284417 + }, + { + "epoch": 3.28378774552917, + "grad_norm": 0.24452320180428466, + "learning_rate": 0.00028943262307927074, + "loss": 3.027327060699463, + "step": 5602, + "token_acc": 0.2965860136268969 + }, + { + "epoch": 3.2843740838463793, + "grad_norm": 0.24846095656656514, + "learning_rate": 0.00028942726230350306, + "loss": 3.02362322807312, + "step": 5603, + "token_acc": 0.29695964320678775 + }, + { + "epoch": 3.2849604221635884, + "grad_norm": 0.24155111210308483, + "learning_rate": 0.0002894219002180041, + "loss": 3.0494234561920166, + "step": 5604, + "token_acc": 0.2923557052393821 + }, + { + "epoch": 3.2855467604807975, + "grad_norm": 0.2623257375270167, + "learning_rate": 0.00028941653682282433, + "loss": 3.0666255950927734, + "step": 5605, + "token_acc": 0.29075131926821757 + }, + { + "epoch": 3.2861330987980066, + "grad_norm": 0.24793798874928888, + "learning_rate": 0.000289411172118014, + "loss": 3.025351047515869, + "step": 5606, + "token_acc": 0.296134512691555 + }, + { + "epoch": 3.2867194371152153, + "grad_norm": 0.2542816631691905, + "learning_rate": 0.0002894058061036236, + "loss": 3.0086538791656494, + "step": 5607, + "token_acc": 0.29916242701981954 + }, + { + "epoch": 3.2873057754324244, + "grad_norm": 0.2546294375795419, + "learning_rate": 0.0002894004387797034, + "loss": 3.0622305870056152, + "step": 5608, + "token_acc": 0.2917949474927249 + }, + { + "epoch": 3.2878921137496335, + "grad_norm": 0.2909815781212429, + "learning_rate": 0.00028939507014630404, + "loss": 3.0605907440185547, + "step": 5609, + "token_acc": 0.2931174659749648 + }, + { + "epoch": 3.2884784520668426, + "grad_norm": 0.26501348447786943, + "learning_rate": 0.0002893897002034758, + "loss": 3.0546793937683105, + "step": 5610, + "token_acc": 0.29260190666604186 + }, + { + "epoch": 3.2890647903840518, + "grad_norm": 0.28487987248399277, + "learning_rate": 0.00028938432895126917, + "loss": 3.069399118423462, + "step": 5611, + "token_acc": 0.2893468493370993 + }, + { + "epoch": 3.2896511287012604, + "grad_norm": 0.27624960784360575, + "learning_rate": 0.0002893789563897345, + "loss": 3.07187819480896, + "step": 5612, + "token_acc": 0.29043491698928625 + }, + { + "epoch": 3.2902374670184695, + "grad_norm": 0.2686755237792855, + "learning_rate": 0.00028937358251892247, + "loss": 3.06264066696167, + "step": 5613, + "token_acc": 0.2899755967158791 + }, + { + "epoch": 3.2908238053356786, + "grad_norm": 0.3241559374601863, + "learning_rate": 0.00028936820733888345, + "loss": 3.0519227981567383, + "step": 5614, + "token_acc": 0.2929717541317234 + }, + { + "epoch": 3.2914101436528878, + "grad_norm": 0.3024641843627616, + "learning_rate": 0.0002893628308496678, + "loss": 3.0414178371429443, + "step": 5615, + "token_acc": 0.29544063247228225 + }, + { + "epoch": 3.291996481970097, + "grad_norm": 0.2649525723763808, + "learning_rate": 0.00028935745305132623, + "loss": 3.053030014038086, + "step": 5616, + "token_acc": 0.29230753209967825 + }, + { + "epoch": 3.292582820287306, + "grad_norm": 0.29086022717146437, + "learning_rate": 0.0002893520739439092, + "loss": 3.0459671020507812, + "step": 5617, + "token_acc": 0.29495083976563696 + }, + { + "epoch": 3.2931691586045146, + "grad_norm": 0.3113350002552134, + "learning_rate": 0.0002893466935274672, + "loss": 3.0787854194641113, + "step": 5618, + "token_acc": 0.28710399381589224 + }, + { + "epoch": 3.2937554969217238, + "grad_norm": 0.276536547077667, + "learning_rate": 0.00028934131180205074, + "loss": 3.0457077026367188, + "step": 5619, + "token_acc": 0.2928958634882437 + }, + { + "epoch": 3.294341835238933, + "grad_norm": 0.2451697976073363, + "learning_rate": 0.00028933592876771047, + "loss": 3.0542550086975098, + "step": 5620, + "token_acc": 0.29330755518410573 + }, + { + "epoch": 3.294928173556142, + "grad_norm": 0.30348762755190195, + "learning_rate": 0.0002893305444244969, + "loss": 3.019636869430542, + "step": 5621, + "token_acc": 0.29669295226925513 + }, + { + "epoch": 3.295514511873351, + "grad_norm": 0.27094662936469793, + "learning_rate": 0.00028932515877246056, + "loss": 3.0415358543395996, + "step": 5622, + "token_acc": 0.2939202541945984 + }, + { + "epoch": 3.2961008501905598, + "grad_norm": 0.29545831273708617, + "learning_rate": 0.00028931977181165215, + "loss": 3.080528736114502, + "step": 5623, + "token_acc": 0.2893502395576099 + }, + { + "epoch": 3.296687188507769, + "grad_norm": 0.2616063765448077, + "learning_rate": 0.00028931438354212215, + "loss": 3.0078744888305664, + "step": 5624, + "token_acc": 0.29907510723023534 + }, + { + "epoch": 3.297273526824978, + "grad_norm": 0.26925258710688493, + "learning_rate": 0.0002893089939639213, + "loss": 3.0290021896362305, + "step": 5625, + "token_acc": 0.2978160830513409 + }, + { + "epoch": 3.297859865142187, + "grad_norm": 0.27235475531578, + "learning_rate": 0.0002893036030771002, + "loss": 3.0489304065704346, + "step": 5626, + "token_acc": 0.2934715989362586 + }, + { + "epoch": 3.298446203459396, + "grad_norm": 0.24303478753333185, + "learning_rate": 0.00028929821088170945, + "loss": 3.03454852104187, + "step": 5627, + "token_acc": 0.2958329386754228 + }, + { + "epoch": 3.2990325417766053, + "grad_norm": 0.25348481837335646, + "learning_rate": 0.0002892928173777997, + "loss": 3.0337109565734863, + "step": 5628, + "token_acc": 0.2961209218170694 + }, + { + "epoch": 3.299618880093814, + "grad_norm": 0.24501509421838993, + "learning_rate": 0.0002892874225654216, + "loss": 3.0312554836273193, + "step": 5629, + "token_acc": 0.29733483252898457 + }, + { + "epoch": 3.300205218411023, + "grad_norm": 0.2444204008317645, + "learning_rate": 0.0002892820264446259, + "loss": 3.012662410736084, + "step": 5630, + "token_acc": 0.29792457162852237 + }, + { + "epoch": 3.300791556728232, + "grad_norm": 0.2799154667017893, + "learning_rate": 0.00028927662901546324, + "loss": 3.071840286254883, + "step": 5631, + "token_acc": 0.2897522859175111 + }, + { + "epoch": 3.3013778950454413, + "grad_norm": 0.23898318591862108, + "learning_rate": 0.00028927123027798436, + "loss": 3.047630548477173, + "step": 5632, + "token_acc": 0.29232035562756736 + }, + { + "epoch": 3.3019642333626504, + "grad_norm": 0.25099394180945533, + "learning_rate": 0.00028926583023223987, + "loss": 3.055784225463867, + "step": 5633, + "token_acc": 0.2936365825820166 + }, + { + "epoch": 3.302550571679859, + "grad_norm": 0.23552959912147728, + "learning_rate": 0.0002892604288782806, + "loss": 3.0040926933288574, + "step": 5634, + "token_acc": 0.2983880943801128 + }, + { + "epoch": 3.303136909997068, + "grad_norm": 0.26549901656036967, + "learning_rate": 0.00028925502621615726, + "loss": 3.0339224338531494, + "step": 5635, + "token_acc": 0.29610382054089507 + }, + { + "epoch": 3.3037232483142773, + "grad_norm": 0.2628702517111344, + "learning_rate": 0.0002892496222459206, + "loss": 3.068821907043457, + "step": 5636, + "token_acc": 0.2902433463228588 + }, + { + "epoch": 3.3043095866314864, + "grad_norm": 0.24384994249774025, + "learning_rate": 0.0002892442169676214, + "loss": 3.072838306427002, + "step": 5637, + "token_acc": 0.29009729616322677 + }, + { + "epoch": 3.3048959249486956, + "grad_norm": 0.2460081987626046, + "learning_rate": 0.0002892388103813104, + "loss": 3.069751262664795, + "step": 5638, + "token_acc": 0.290347688027182 + }, + { + "epoch": 3.3054822632659047, + "grad_norm": 0.2391441256499383, + "learning_rate": 0.0002892334024870384, + "loss": 3.0618948936462402, + "step": 5639, + "token_acc": 0.29163372980127583 + }, + { + "epoch": 3.3060686015831133, + "grad_norm": 0.25227918271563365, + "learning_rate": 0.0002892279932848562, + "loss": 3.0628557205200195, + "step": 5640, + "token_acc": 0.29108158802864337 + }, + { + "epoch": 3.3066549399003224, + "grad_norm": 0.25959258926033146, + "learning_rate": 0.0002892225827748146, + "loss": 3.039865732192993, + "step": 5641, + "token_acc": 0.29323881746945885 + }, + { + "epoch": 3.3072412782175316, + "grad_norm": 0.24560392727195357, + "learning_rate": 0.00028921717095696444, + "loss": 3.0141172409057617, + "step": 5642, + "token_acc": 0.2986240414049926 + }, + { + "epoch": 3.3078276165347407, + "grad_norm": 0.2544576029115778, + "learning_rate": 0.0002892117578313566, + "loss": 3.070706605911255, + "step": 5643, + "token_acc": 0.2895937806787126 + }, + { + "epoch": 3.3084139548519493, + "grad_norm": 0.25629007695611444, + "learning_rate": 0.0002892063433980418, + "loss": 3.0716235637664795, + "step": 5644, + "token_acc": 0.2896479273436319 + }, + { + "epoch": 3.3090002931691584, + "grad_norm": 0.25628349181811394, + "learning_rate": 0.00028920092765707104, + "loss": 3.075153112411499, + "step": 5645, + "token_acc": 0.2899112237280868 + }, + { + "epoch": 3.3095866314863676, + "grad_norm": 0.26851441574071705, + "learning_rate": 0.00028919551060849517, + "loss": 3.03609037399292, + "step": 5646, + "token_acc": 0.2950730212215856 + }, + { + "epoch": 3.3101729698035767, + "grad_norm": 0.262111169435215, + "learning_rate": 0.000289190092252365, + "loss": 3.0301053524017334, + "step": 5647, + "token_acc": 0.29629315957463703 + }, + { + "epoch": 3.310759308120786, + "grad_norm": 0.23182079627180197, + "learning_rate": 0.0002891846725887315, + "loss": 3.0141682624816895, + "step": 5648, + "token_acc": 0.2998067816297355 + }, + { + "epoch": 3.311345646437995, + "grad_norm": 0.2713076981625062, + "learning_rate": 0.00028917925161764553, + "loss": 3.0790865421295166, + "step": 5649, + "token_acc": 0.2889047383260771 + }, + { + "epoch": 3.3119319847552036, + "grad_norm": 0.2599687456885167, + "learning_rate": 0.00028917382933915805, + "loss": 3.0578112602233887, + "step": 5650, + "token_acc": 0.29133181103763234 + }, + { + "epoch": 3.3125183230724127, + "grad_norm": 0.2717929792509535, + "learning_rate": 0.00028916840575332, + "loss": 3.0932698249816895, + "step": 5651, + "token_acc": 0.28786888295742 + }, + { + "epoch": 3.313104661389622, + "grad_norm": 0.2798270522480099, + "learning_rate": 0.00028916298086018234, + "loss": 3.0428555011749268, + "step": 5652, + "token_acc": 0.2941281385097004 + }, + { + "epoch": 3.313690999706831, + "grad_norm": 0.2514717966269451, + "learning_rate": 0.0002891575546597959, + "loss": 3.040719509124756, + "step": 5653, + "token_acc": 0.29470160851142957 + }, + { + "epoch": 3.31427733802404, + "grad_norm": 0.25076410118559816, + "learning_rate": 0.0002891521271522118, + "loss": 3.0356903076171875, + "step": 5654, + "token_acc": 0.2939879992257565 + }, + { + "epoch": 3.3148636763412487, + "grad_norm": 0.2658737319772316, + "learning_rate": 0.000289146698337481, + "loss": 3.0798327922821045, + "step": 5655, + "token_acc": 0.288920459608574 + }, + { + "epoch": 3.315450014658458, + "grad_norm": 0.25727412163460406, + "learning_rate": 0.00028914126821565447, + "loss": 3.0082955360412598, + "step": 5656, + "token_acc": 0.29924286753743684 + }, + { + "epoch": 3.316036352975667, + "grad_norm": 0.27566221820218606, + "learning_rate": 0.0002891358367867832, + "loss": 3.0909862518310547, + "step": 5657, + "token_acc": 0.2876140600218336 + }, + { + "epoch": 3.316622691292876, + "grad_norm": 0.27507499118158596, + "learning_rate": 0.00028913040405091823, + "loss": 3.0608484745025635, + "step": 5658, + "token_acc": 0.29135826766418177 + }, + { + "epoch": 3.317209029610085, + "grad_norm": 0.27047703744986173, + "learning_rate": 0.0002891249700081106, + "loss": 3.0465924739837646, + "step": 5659, + "token_acc": 0.2939666238767651 + }, + { + "epoch": 3.3177953679272942, + "grad_norm": 0.281899079549966, + "learning_rate": 0.00028911953465841136, + "loss": 3.070035457611084, + "step": 5660, + "token_acc": 0.2922704397058504 + }, + { + "epoch": 3.318381706244503, + "grad_norm": 0.2608369259985746, + "learning_rate": 0.0002891140980018716, + "loss": 3.029350996017456, + "step": 5661, + "token_acc": 0.2961996167497768 + }, + { + "epoch": 3.318968044561712, + "grad_norm": 0.2541751694335664, + "learning_rate": 0.00028910866003854227, + "loss": 3.0701494216918945, + "step": 5662, + "token_acc": 0.28874898271078364 + }, + { + "epoch": 3.319554382878921, + "grad_norm": 0.2408881196639801, + "learning_rate": 0.00028910322076847455, + "loss": 3.0196757316589355, + "step": 5663, + "token_acc": 0.2965763728057834 + }, + { + "epoch": 3.3201407211961302, + "grad_norm": 0.2834033881554853, + "learning_rate": 0.00028909778019171954, + "loss": 3.0354104042053223, + "step": 5664, + "token_acc": 0.29573684555402663 + }, + { + "epoch": 3.3207270595133394, + "grad_norm": 0.2473441893920157, + "learning_rate": 0.00028909233830832825, + "loss": 3.0485830307006836, + "step": 5665, + "token_acc": 0.2921580863234529 + }, + { + "epoch": 3.321313397830548, + "grad_norm": 0.24788250572949358, + "learning_rate": 0.0002890868951183519, + "loss": 3.041867971420288, + "step": 5666, + "token_acc": 0.2947608047766858 + }, + { + "epoch": 3.321899736147757, + "grad_norm": 0.2604208144989237, + "learning_rate": 0.0002890814506218416, + "loss": 3.0771420001983643, + "step": 5667, + "token_acc": 0.28914538551598123 + }, + { + "epoch": 3.3224860744649662, + "grad_norm": 0.24573400722510266, + "learning_rate": 0.00028907600481884854, + "loss": 3.045477867126465, + "step": 5668, + "token_acc": 0.2941581743923232 + }, + { + "epoch": 3.3230724127821754, + "grad_norm": 0.2592597887633075, + "learning_rate": 0.0002890705577094238, + "loss": 3.0070180892944336, + "step": 5669, + "token_acc": 0.2992412489027149 + }, + { + "epoch": 3.3236587510993845, + "grad_norm": 0.2689104687329127, + "learning_rate": 0.00028906510929361856, + "loss": 3.055882453918457, + "step": 5670, + "token_acc": 0.2924316934948277 + }, + { + "epoch": 3.3242450894165936, + "grad_norm": 0.2637215566167641, + "learning_rate": 0.000289059659571484, + "loss": 3.0357956886291504, + "step": 5671, + "token_acc": 0.29575579438507965 + }, + { + "epoch": 3.3248314277338022, + "grad_norm": 0.26116954068157694, + "learning_rate": 0.00028905420854307134, + "loss": 3.074798583984375, + "step": 5672, + "token_acc": 0.2903763175860006 + }, + { + "epoch": 3.3254177660510114, + "grad_norm": 0.2590549517089792, + "learning_rate": 0.00028904875620843173, + "loss": 3.053189277648926, + "step": 5673, + "token_acc": 0.29355535782382103 + }, + { + "epoch": 3.3260041043682205, + "grad_norm": 0.27491176400948103, + "learning_rate": 0.0002890433025676164, + "loss": 3.067682981491089, + "step": 5674, + "token_acc": 0.29062913572897786 + }, + { + "epoch": 3.3265904426854296, + "grad_norm": 0.29322490732737244, + "learning_rate": 0.00028903784762067674, + "loss": 3.0716772079467773, + "step": 5675, + "token_acc": 0.2900877269981257 + }, + { + "epoch": 3.3271767810026387, + "grad_norm": 0.24051997126987792, + "learning_rate": 0.00028903239136766375, + "loss": 3.0001792907714844, + "step": 5676, + "token_acc": 0.29868658373028717 + }, + { + "epoch": 3.3277631193198474, + "grad_norm": 0.25638381647041286, + "learning_rate": 0.0002890269338086288, + "loss": 3.0107851028442383, + "step": 5677, + "token_acc": 0.29869908494301234 + }, + { + "epoch": 3.3283494576370565, + "grad_norm": 0.23348084208780892, + "learning_rate": 0.00028902147494362315, + "loss": 3.042665958404541, + "step": 5678, + "token_acc": 0.2942836519794976 + }, + { + "epoch": 3.3289357959542656, + "grad_norm": 0.239186066657374, + "learning_rate": 0.0002890160147726981, + "loss": 3.0522398948669434, + "step": 5679, + "token_acc": 0.2925206044675303 + }, + { + "epoch": 3.3295221342714747, + "grad_norm": 0.2515515935525354, + "learning_rate": 0.00028901055329590494, + "loss": 3.0348756313323975, + "step": 5680, + "token_acc": 0.2948010618157455 + }, + { + "epoch": 3.330108472588684, + "grad_norm": 0.2459224676871044, + "learning_rate": 0.0002890050905132949, + "loss": 3.0534145832061768, + "step": 5681, + "token_acc": 0.2921477459485323 + }, + { + "epoch": 3.330694810905893, + "grad_norm": 0.23682384629040473, + "learning_rate": 0.0002889996264249194, + "loss": 3.0606369972229004, + "step": 5682, + "token_acc": 0.2915630585380491 + }, + { + "epoch": 3.3312811492231016, + "grad_norm": 0.270991647206072, + "learning_rate": 0.00028899416103082967, + "loss": 3.0742909908294678, + "step": 5683, + "token_acc": 0.29074362884549315 + }, + { + "epoch": 3.3318674875403107, + "grad_norm": 0.27407812012234445, + "learning_rate": 0.00028898869433107707, + "loss": 3.037078380584717, + "step": 5684, + "token_acc": 0.2941312804818785 + }, + { + "epoch": 3.33245382585752, + "grad_norm": 0.23529509773883864, + "learning_rate": 0.00028898322632571303, + "loss": 3.0711679458618164, + "step": 5685, + "token_acc": 0.2905800292346616 + }, + { + "epoch": 3.333040164174729, + "grad_norm": 0.2763313898446662, + "learning_rate": 0.00028897775701478885, + "loss": 3.028648853302002, + "step": 5686, + "token_acc": 0.29729147358063923 + }, + { + "epoch": 3.333626502491938, + "grad_norm": 0.25379415591560245, + "learning_rate": 0.0002889722863983559, + "loss": 3.0440781116485596, + "step": 5687, + "token_acc": 0.29392758517323797 + }, + { + "epoch": 3.3342128408091467, + "grad_norm": 0.2623555037503734, + "learning_rate": 0.0002889668144764656, + "loss": 3.0524182319641113, + "step": 5688, + "token_acc": 0.29334109831549704 + }, + { + "epoch": 3.334799179126356, + "grad_norm": 0.25130788950890903, + "learning_rate": 0.00028896134124916934, + "loss": 3.0114355087280273, + "step": 5689, + "token_acc": 0.2983103200617767 + }, + { + "epoch": 3.335385517443565, + "grad_norm": 0.2715045095980256, + "learning_rate": 0.0002889558667165185, + "loss": 3.0142765045166016, + "step": 5690, + "token_acc": 0.29873979373703036 + }, + { + "epoch": 3.335971855760774, + "grad_norm": 0.2638065604853873, + "learning_rate": 0.0002889503908785646, + "loss": 3.000058650970459, + "step": 5691, + "token_acc": 0.30046116554713337 + }, + { + "epoch": 3.336558194077983, + "grad_norm": 0.24247567550373464, + "learning_rate": 0.000288944913735359, + "loss": 3.0071098804473877, + "step": 5692, + "token_acc": 0.29852664271491647 + }, + { + "epoch": 3.3371445323951923, + "grad_norm": 0.25928555128734876, + "learning_rate": 0.0002889394352869531, + "loss": 3.0537776947021484, + "step": 5693, + "token_acc": 0.2923968015217618 + }, + { + "epoch": 3.337730870712401, + "grad_norm": 0.2554128030029239, + "learning_rate": 0.0002889339555333985, + "loss": 3.059546709060669, + "step": 5694, + "token_acc": 0.29318194048323737 + }, + { + "epoch": 3.33831720902961, + "grad_norm": 0.2575516442706426, + "learning_rate": 0.00028892847447474653, + "loss": 3.069221019744873, + "step": 5695, + "token_acc": 0.29034126195910515 + }, + { + "epoch": 3.338903547346819, + "grad_norm": 0.25672817949680926, + "learning_rate": 0.00028892299211104886, + "loss": 3.0990309715270996, + "step": 5696, + "token_acc": 0.28548023755887775 + }, + { + "epoch": 3.3394898856640283, + "grad_norm": 0.2610852745125035, + "learning_rate": 0.0002889175084423568, + "loss": 3.0478129386901855, + "step": 5697, + "token_acc": 0.29140556937578066 + }, + { + "epoch": 3.340076223981237, + "grad_norm": 0.2441749686262877, + "learning_rate": 0.000288912023468722, + "loss": 3.0370213985443115, + "step": 5698, + "token_acc": 0.294819236977632 + }, + { + "epoch": 3.340662562298446, + "grad_norm": 0.25211298722444914, + "learning_rate": 0.0002889065371901958, + "loss": 3.0255789756774902, + "step": 5699, + "token_acc": 0.29681823924180195 + }, + { + "epoch": 3.341248900615655, + "grad_norm": 0.24636868795176511, + "learning_rate": 0.00028890104960683, + "loss": 3.006986141204834, + "step": 5700, + "token_acc": 0.2995516924416636 + }, + { + "epoch": 3.3418352389328643, + "grad_norm": 0.2712470414314111, + "learning_rate": 0.000288895560718676, + "loss": 3.0725908279418945, + "step": 5701, + "token_acc": 0.29004314421569516 + }, + { + "epoch": 3.3424215772500734, + "grad_norm": 0.2653506625292416, + "learning_rate": 0.0002888900705257853, + "loss": 3.064542531967163, + "step": 5702, + "token_acc": 0.2921014777741263 + }, + { + "epoch": 3.3430079155672825, + "grad_norm": 0.26040624978626953, + "learning_rate": 0.00028888457902820954, + "loss": 3.052729606628418, + "step": 5703, + "token_acc": 0.29251485271865096 + }, + { + "epoch": 3.343594253884491, + "grad_norm": 0.30706659119730073, + "learning_rate": 0.0002888790862260003, + "loss": 3.0718531608581543, + "step": 5704, + "token_acc": 0.2898490568007978 + }, + { + "epoch": 3.3441805922017003, + "grad_norm": 0.2931635725672014, + "learning_rate": 0.0002888735921192093, + "loss": 3.039879560470581, + "step": 5705, + "token_acc": 0.2935851628602921 + }, + { + "epoch": 3.3447669305189094, + "grad_norm": 0.2769287748958181, + "learning_rate": 0.00028886809670788797, + "loss": 3.0811378955841064, + "step": 5706, + "token_acc": 0.28830422632234026 + }, + { + "epoch": 3.3453532688361185, + "grad_norm": 0.29139858484291564, + "learning_rate": 0.00028886259999208794, + "loss": 3.023240089416504, + "step": 5707, + "token_acc": 0.29806789264183325 + }, + { + "epoch": 3.3459396071533276, + "grad_norm": 0.24085872627378682, + "learning_rate": 0.000288857101971861, + "loss": 3.0622334480285645, + "step": 5708, + "token_acc": 0.2896504013024775 + }, + { + "epoch": 3.3465259454705363, + "grad_norm": 0.2905822487140089, + "learning_rate": 0.00028885160264725866, + "loss": 3.037686824798584, + "step": 5709, + "token_acc": 0.2949830679184588 + }, + { + "epoch": 3.3471122837877454, + "grad_norm": 0.26506598696880423, + "learning_rate": 0.00028884610201833263, + "loss": 3.0875120162963867, + "step": 5710, + "token_acc": 0.2876471402115839 + }, + { + "epoch": 3.3476986221049545, + "grad_norm": 0.3040369468891996, + "learning_rate": 0.00028884060008513453, + "loss": 3.019105911254883, + "step": 5711, + "token_acc": 0.29718421527761424 + }, + { + "epoch": 3.3482849604221636, + "grad_norm": 0.26828749695490156, + "learning_rate": 0.00028883509684771613, + "loss": 3.011282444000244, + "step": 5712, + "token_acc": 0.29847493980025525 + }, + { + "epoch": 3.3488712987393727, + "grad_norm": 0.29991367598088864, + "learning_rate": 0.00028882959230612905, + "loss": 3.1143527030944824, + "step": 5713, + "token_acc": 0.2864458105350714 + }, + { + "epoch": 3.349457637056582, + "grad_norm": 0.2825708864243587, + "learning_rate": 0.000288824086460425, + "loss": 3.019871234893799, + "step": 5714, + "token_acc": 0.29805985377101735 + }, + { + "epoch": 3.3500439753737905, + "grad_norm": 0.27661678682827556, + "learning_rate": 0.0002888185793106558, + "loss": 3.0867552757263184, + "step": 5715, + "token_acc": 0.28749610628991856 + }, + { + "epoch": 3.3506303136909996, + "grad_norm": 0.2458516586251347, + "learning_rate": 0.00028881307085687306, + "loss": 3.051924705505371, + "step": 5716, + "token_acc": 0.29273410276713563 + }, + { + "epoch": 3.3512166520082087, + "grad_norm": 0.26757666666863555, + "learning_rate": 0.00028880756109912856, + "loss": 3.0620033740997314, + "step": 5717, + "token_acc": 0.292191996932857 + }, + { + "epoch": 3.351802990325418, + "grad_norm": 0.25804743649465994, + "learning_rate": 0.00028880205003747406, + "loss": 3.0546751022338867, + "step": 5718, + "token_acc": 0.2909680318564486 + }, + { + "epoch": 3.352389328642627, + "grad_norm": 0.21231028787374762, + "learning_rate": 0.0002887965376719614, + "loss": 3.011420249938965, + "step": 5719, + "token_acc": 0.30028372774607925 + }, + { + "epoch": 3.3529756669598356, + "grad_norm": 0.2603236363899813, + "learning_rate": 0.0002887910240026422, + "loss": 3.0352206230163574, + "step": 5720, + "token_acc": 0.2959762013295847 + }, + { + "epoch": 3.3535620052770447, + "grad_norm": 0.24928058343906215, + "learning_rate": 0.00028878550902956845, + "loss": 3.0598530769348145, + "step": 5721, + "token_acc": 0.2924128075737954 + }, + { + "epoch": 3.354148343594254, + "grad_norm": 0.23846425006170233, + "learning_rate": 0.00028877999275279183, + "loss": 3.0860843658447266, + "step": 5722, + "token_acc": 0.2895748627632842 + }, + { + "epoch": 3.354734681911463, + "grad_norm": 0.2100157471983417, + "learning_rate": 0.0002887744751723642, + "loss": 3.0016491413116455, + "step": 5723, + "token_acc": 0.300344459748109 + }, + { + "epoch": 3.355321020228672, + "grad_norm": 0.24900957958125072, + "learning_rate": 0.0002887689562883373, + "loss": 3.077404260635376, + "step": 5724, + "token_acc": 0.2897764074928103 + }, + { + "epoch": 3.355907358545881, + "grad_norm": 0.23876661940971108, + "learning_rate": 0.0002887634361007631, + "loss": 3.0382790565490723, + "step": 5725, + "token_acc": 0.2933984553798448 + }, + { + "epoch": 3.35649369686309, + "grad_norm": 0.24471551857634855, + "learning_rate": 0.00028875791460969343, + "loss": 3.0530691146850586, + "step": 5726, + "token_acc": 0.29171793446982475 + }, + { + "epoch": 3.357080035180299, + "grad_norm": 0.2532109536154006, + "learning_rate": 0.0002887523918151801, + "loss": 3.0129566192626953, + "step": 5727, + "token_acc": 0.2981520080581624 + }, + { + "epoch": 3.357666373497508, + "grad_norm": 0.270450828809152, + "learning_rate": 0.0002887468677172751, + "loss": 3.0905673503875732, + "step": 5728, + "token_acc": 0.2877225700231717 + }, + { + "epoch": 3.358252711814717, + "grad_norm": 0.27102304697905344, + "learning_rate": 0.00028874134231603014, + "loss": 3.0597429275512695, + "step": 5729, + "token_acc": 0.2916221857522984 + }, + { + "epoch": 3.3588390501319263, + "grad_norm": 0.2587419296891728, + "learning_rate": 0.00028873581561149726, + "loss": 3.0456390380859375, + "step": 5730, + "token_acc": 0.29311443161317285 + }, + { + "epoch": 3.359425388449135, + "grad_norm": 0.24531870165269162, + "learning_rate": 0.00028873028760372833, + "loss": 3.0477406978607178, + "step": 5731, + "token_acc": 0.29254329900866255 + }, + { + "epoch": 3.360011726766344, + "grad_norm": 0.27333245817004337, + "learning_rate": 0.0002887247582927753, + "loss": 3.0554895401000977, + "step": 5732, + "token_acc": 0.29308647797936094 + }, + { + "epoch": 3.360598065083553, + "grad_norm": 0.27259197362058346, + "learning_rate": 0.00028871922767869014, + "loss": 3.075974464416504, + "step": 5733, + "token_acc": 0.2915246753246753 + }, + { + "epoch": 3.3611844034007623, + "grad_norm": 0.22623268570031646, + "learning_rate": 0.0002887136957615247, + "loss": 3.0111265182495117, + "step": 5734, + "token_acc": 0.2984246800969349 + }, + { + "epoch": 3.3617707417179714, + "grad_norm": 0.26712271876615806, + "learning_rate": 0.000288708162541331, + "loss": 3.0088930130004883, + "step": 5735, + "token_acc": 0.29995981784087866 + }, + { + "epoch": 3.3623570800351805, + "grad_norm": 0.2404744592843164, + "learning_rate": 0.000288702628018161, + "loss": 3.05560302734375, + "step": 5736, + "token_acc": 0.29184891484722414 + }, + { + "epoch": 3.362943418352389, + "grad_norm": 0.2577133893997576, + "learning_rate": 0.00028869709219206684, + "loss": 3.042890787124634, + "step": 5737, + "token_acc": 0.2938047493955981 + }, + { + "epoch": 3.3635297566695983, + "grad_norm": 0.2889791907655429, + "learning_rate": 0.0002886915550631003, + "loss": 3.0468735694885254, + "step": 5738, + "token_acc": 0.2925345878451241 + }, + { + "epoch": 3.3641160949868074, + "grad_norm": 0.2642595451123165, + "learning_rate": 0.00028868601663131353, + "loss": 3.048684597015381, + "step": 5739, + "token_acc": 0.2933533167565745 + }, + { + "epoch": 3.3647024333040165, + "grad_norm": 0.2859077329453965, + "learning_rate": 0.0002886804768967585, + "loss": 3.0520622730255127, + "step": 5740, + "token_acc": 0.2939728097066202 + }, + { + "epoch": 3.3652887716212256, + "grad_norm": 0.24416010302900212, + "learning_rate": 0.00028867493585948723, + "loss": 3.0231995582580566, + "step": 5741, + "token_acc": 0.29673291928781326 + }, + { + "epoch": 3.3658751099384343, + "grad_norm": 0.27897052200211175, + "learning_rate": 0.0002886693935195518, + "loss": 3.046410322189331, + "step": 5742, + "token_acc": 0.29265131404645267 + }, + { + "epoch": 3.3664614482556434, + "grad_norm": 0.26129614343303387, + "learning_rate": 0.00028866384987700437, + "loss": 3.0319457054138184, + "step": 5743, + "token_acc": 0.2967821000636837 + }, + { + "epoch": 3.3670477865728525, + "grad_norm": 0.2797400207957371, + "learning_rate": 0.00028865830493189686, + "loss": 3.0027589797973633, + "step": 5744, + "token_acc": 0.30050184541037145 + }, + { + "epoch": 3.3676341248900616, + "grad_norm": 0.25101595236652846, + "learning_rate": 0.00028865275868428144, + "loss": 3.0285730361938477, + "step": 5745, + "token_acc": 0.29406593406593406 + }, + { + "epoch": 3.3682204632072708, + "grad_norm": 0.28632711347437434, + "learning_rate": 0.00028864721113421016, + "loss": 3.045104503631592, + "step": 5746, + "token_acc": 0.2927444134991305 + }, + { + "epoch": 3.36880680152448, + "grad_norm": 0.2757253617868351, + "learning_rate": 0.00028864166228173517, + "loss": 3.0419559478759766, + "step": 5747, + "token_acc": 0.29466657421137415 + }, + { + "epoch": 3.3693931398416885, + "grad_norm": 0.24781005485638438, + "learning_rate": 0.00028863611212690855, + "loss": 3.0557100772857666, + "step": 5748, + "token_acc": 0.2933160716801667 + }, + { + "epoch": 3.3699794781588976, + "grad_norm": 0.2677493342617239, + "learning_rate": 0.0002886305606697826, + "loss": 3.023343563079834, + "step": 5749, + "token_acc": 0.29673538984596887 + }, + { + "epoch": 3.3705658164761068, + "grad_norm": 0.2442789107492446, + "learning_rate": 0.0002886250079104092, + "loss": 3.0109775066375732, + "step": 5750, + "token_acc": 0.29813723252720964 + }, + { + "epoch": 3.371152154793316, + "grad_norm": 0.2915176426549623, + "learning_rate": 0.0002886194538488407, + "loss": 3.0176544189453125, + "step": 5751, + "token_acc": 0.29819225906526564 + }, + { + "epoch": 3.3717384931105245, + "grad_norm": 0.2881594806516899, + "learning_rate": 0.0002886138984851292, + "loss": 3.042457342147827, + "step": 5752, + "token_acc": 0.2938888874495102 + }, + { + "epoch": 3.3723248314277336, + "grad_norm": 0.3055471866758318, + "learning_rate": 0.00028860834181932695, + "loss": 3.078770637512207, + "step": 5753, + "token_acc": 0.2892475832302683 + }, + { + "epoch": 3.3729111697449428, + "grad_norm": 0.28243846417569235, + "learning_rate": 0.0002886027838514861, + "loss": 3.024970531463623, + "step": 5754, + "token_acc": 0.2953969982334943 + }, + { + "epoch": 3.373497508062152, + "grad_norm": 0.2544545973054159, + "learning_rate": 0.0002885972245816588, + "loss": 3.0275511741638184, + "step": 5755, + "token_acc": 0.29666203251249157 + }, + { + "epoch": 3.374083846379361, + "grad_norm": 0.2779840945641556, + "learning_rate": 0.00028859166400989746, + "loss": 3.0503897666931152, + "step": 5756, + "token_acc": 0.2902596797248866 + }, + { + "epoch": 3.37467018469657, + "grad_norm": 0.2808407694469099, + "learning_rate": 0.00028858610213625406, + "loss": 3.035900592803955, + "step": 5757, + "token_acc": 0.29506116671721766 + }, + { + "epoch": 3.3752565230137788, + "grad_norm": 0.27280808235418263, + "learning_rate": 0.00028858053896078104, + "loss": 3.0691823959350586, + "step": 5758, + "token_acc": 0.2919291225766788 + }, + { + "epoch": 3.375842861330988, + "grad_norm": 0.26128314578785233, + "learning_rate": 0.0002885749744835306, + "loss": 3.0721797943115234, + "step": 5759, + "token_acc": 0.28831657646370146 + }, + { + "epoch": 3.376429199648197, + "grad_norm": 0.2658954375230314, + "learning_rate": 0.00028856940870455503, + "loss": 3.0480942726135254, + "step": 5760, + "token_acc": 0.2936971576666525 + }, + { + "epoch": 3.377015537965406, + "grad_norm": 0.2746358085127675, + "learning_rate": 0.00028856384162390656, + "loss": 3.059691905975342, + "step": 5761, + "token_acc": 0.2922623828647925 + }, + { + "epoch": 3.377601876282615, + "grad_norm": 0.2552751892859364, + "learning_rate": 0.0002885582732416375, + "loss": 3.0412750244140625, + "step": 5762, + "token_acc": 0.2935588271601625 + }, + { + "epoch": 3.378188214599824, + "grad_norm": 0.23261099126213788, + "learning_rate": 0.0002885527035578002, + "loss": 3.066521644592285, + "step": 5763, + "token_acc": 0.2908722120261701 + }, + { + "epoch": 3.378774552917033, + "grad_norm": 0.23794042019741388, + "learning_rate": 0.0002885471325724469, + "loss": 3.0494213104248047, + "step": 5764, + "token_acc": 0.2931613312538 + }, + { + "epoch": 3.379360891234242, + "grad_norm": 0.2531218723828242, + "learning_rate": 0.0002885415602856301, + "loss": 3.0331640243530273, + "step": 5765, + "token_acc": 0.29488001176767253 + }, + { + "epoch": 3.379947229551451, + "grad_norm": 0.25509266888777, + "learning_rate": 0.0002885359866974019, + "loss": 3.017084836959839, + "step": 5766, + "token_acc": 0.29875581027790876 + }, + { + "epoch": 3.3805335678686603, + "grad_norm": 0.26354033636075735, + "learning_rate": 0.0002885304118078148, + "loss": 3.048737049102783, + "step": 5767, + "token_acc": 0.2944639498766629 + }, + { + "epoch": 3.3811199061858694, + "grad_norm": 0.23052688969461935, + "learning_rate": 0.0002885248356169212, + "loss": 3.032196044921875, + "step": 5768, + "token_acc": 0.2961055408970976 + }, + { + "epoch": 3.381706244503078, + "grad_norm": 0.2706749118284473, + "learning_rate": 0.00028851925812477345, + "loss": 3.0332703590393066, + "step": 5769, + "token_acc": 0.295187743162889 + }, + { + "epoch": 3.382292582820287, + "grad_norm": 0.2569977525800442, + "learning_rate": 0.0002885136793314239, + "loss": 3.035550355911255, + "step": 5770, + "token_acc": 0.29464179032385174 + }, + { + "epoch": 3.3828789211374963, + "grad_norm": 0.2293144973750897, + "learning_rate": 0.000288508099236925, + "loss": 3.0439419746398926, + "step": 5771, + "token_acc": 0.2931473038119128 + }, + { + "epoch": 3.3834652594547054, + "grad_norm": 0.26031099769081184, + "learning_rate": 0.0002885025178413291, + "loss": 3.072713851928711, + "step": 5772, + "token_acc": 0.29015738460400176 + }, + { + "epoch": 3.3840515977719146, + "grad_norm": 0.20871474144763544, + "learning_rate": 0.00028849693514468875, + "loss": 3.019867420196533, + "step": 5773, + "token_acc": 0.29732814791338774 + }, + { + "epoch": 3.3846379360891232, + "grad_norm": 0.264087322815557, + "learning_rate": 0.0002884913511470563, + "loss": 3.019841432571411, + "step": 5774, + "token_acc": 0.29873463085055363 + }, + { + "epoch": 3.3852242744063323, + "grad_norm": 0.3077413436250943, + "learning_rate": 0.0002884857658484842, + "loss": 3.0371508598327637, + "step": 5775, + "token_acc": 0.2961688023592661 + }, + { + "epoch": 3.3858106127235414, + "grad_norm": 0.22072733136700523, + "learning_rate": 0.00028848017924902494, + "loss": 3.0144424438476562, + "step": 5776, + "token_acc": 0.2975036613644685 + }, + { + "epoch": 3.3863969510407506, + "grad_norm": 0.2794207359410877, + "learning_rate": 0.000288474591348731, + "loss": 3.0644044876098633, + "step": 5777, + "token_acc": 0.29108518086347723 + }, + { + "epoch": 3.3869832893579597, + "grad_norm": 0.27980478877498605, + "learning_rate": 0.0002884690021476549, + "loss": 3.0284857749938965, + "step": 5778, + "token_acc": 0.2966252220248668 + }, + { + "epoch": 3.387569627675169, + "grad_norm": 0.2719654973940896, + "learning_rate": 0.00028846341164584906, + "loss": 3.0000524520874023, + "step": 5779, + "token_acc": 0.30138855693391825 + }, + { + "epoch": 3.3881559659923774, + "grad_norm": 0.24644098573419446, + "learning_rate": 0.0002884578198433661, + "loss": 3.035294771194458, + "step": 5780, + "token_acc": 0.29602722470751064 + }, + { + "epoch": 3.3887423043095866, + "grad_norm": 0.2569235442861615, + "learning_rate": 0.0002884522267402585, + "loss": 3.0655293464660645, + "step": 5781, + "token_acc": 0.29038500739257544 + }, + { + "epoch": 3.3893286426267957, + "grad_norm": 0.2645668020700716, + "learning_rate": 0.0002884466323365788, + "loss": 3.0202529430389404, + "step": 5782, + "token_acc": 0.2963841213442824 + }, + { + "epoch": 3.389914980944005, + "grad_norm": 0.2596367354415436, + "learning_rate": 0.0002884410366323795, + "loss": 3.0303962230682373, + "step": 5783, + "token_acc": 0.2950342641295176 + }, + { + "epoch": 3.390501319261214, + "grad_norm": 0.22887668708480585, + "learning_rate": 0.0002884354396277133, + "loss": 3.0566201210021973, + "step": 5784, + "token_acc": 0.2918484970002909 + }, + { + "epoch": 3.3910876575784226, + "grad_norm": 0.2536883965852219, + "learning_rate": 0.00028842984132263253, + "loss": 3.064544916152954, + "step": 5785, + "token_acc": 0.29161752358867743 + }, + { + "epoch": 3.3916739958956317, + "grad_norm": 0.26638258046739627, + "learning_rate": 0.00028842424171719006, + "loss": 3.0846731662750244, + "step": 5786, + "token_acc": 0.28851901500649396 + }, + { + "epoch": 3.392260334212841, + "grad_norm": 0.27111280413427996, + "learning_rate": 0.00028841864081143834, + "loss": 3.056457996368408, + "step": 5787, + "token_acc": 0.29165368964408067 + }, + { + "epoch": 3.39284667253005, + "grad_norm": 0.2810631940203439, + "learning_rate": 0.00028841303860543, + "loss": 3.0368943214416504, + "step": 5788, + "token_acc": 0.29464607666379244 + }, + { + "epoch": 3.393433010847259, + "grad_norm": 0.2314594818423757, + "learning_rate": 0.00028840743509921774, + "loss": 3.055884838104248, + "step": 5789, + "token_acc": 0.29338258696344066 + }, + { + "epoch": 3.394019349164468, + "grad_norm": 0.27020331482132337, + "learning_rate": 0.0002884018302928541, + "loss": 3.0823497772216797, + "step": 5790, + "token_acc": 0.2886612739812863 + }, + { + "epoch": 3.394605687481677, + "grad_norm": 0.24976602056082572, + "learning_rate": 0.00028839622418639174, + "loss": 3.008296012878418, + "step": 5791, + "token_acc": 0.2977789663326202 + }, + { + "epoch": 3.395192025798886, + "grad_norm": 0.25841302598392085, + "learning_rate": 0.0002883906167798833, + "loss": 3.048793315887451, + "step": 5792, + "token_acc": 0.29384109198060226 + }, + { + "epoch": 3.395778364116095, + "grad_norm": 0.24988395905847802, + "learning_rate": 0.0002883850080733816, + "loss": 3.0127930641174316, + "step": 5793, + "token_acc": 0.3006722390566048 + }, + { + "epoch": 3.396364702433304, + "grad_norm": 0.2610463041770924, + "learning_rate": 0.0002883793980669392, + "loss": 3.037114381790161, + "step": 5794, + "token_acc": 0.29496723965361976 + }, + { + "epoch": 3.3969510407505132, + "grad_norm": 0.24260590449045194, + "learning_rate": 0.00028837378676060873, + "loss": 3.0361244678497314, + "step": 5795, + "token_acc": 0.29559348070832586 + }, + { + "epoch": 3.397537379067722, + "grad_norm": 0.25255175236827604, + "learning_rate": 0.0002883681741544431, + "loss": 3.054929256439209, + "step": 5796, + "token_acc": 0.29293808294762025 + }, + { + "epoch": 3.398123717384931, + "grad_norm": 0.26753736974825515, + "learning_rate": 0.00028836256024849486, + "loss": 3.046617031097412, + "step": 5797, + "token_acc": 0.2936092322522298 + }, + { + "epoch": 3.39871005570214, + "grad_norm": 0.25830671007931894, + "learning_rate": 0.00028835694504281687, + "loss": 3.080627202987671, + "step": 5798, + "token_acc": 0.2891387232875181 + }, + { + "epoch": 3.3992963940193492, + "grad_norm": 0.3094321489359569, + "learning_rate": 0.0002883513285374618, + "loss": 3.084319829940796, + "step": 5799, + "token_acc": 0.28655775859605737 + }, + { + "epoch": 3.3998827323365584, + "grad_norm": 0.2505391689398477, + "learning_rate": 0.00028834571073248243, + "loss": 3.019322156906128, + "step": 5800, + "token_acc": 0.2968487046547604 + }, + { + "epoch": 3.4004690706537675, + "grad_norm": 0.2633340607901542, + "learning_rate": 0.00028834009162793153, + "loss": 3.043074131011963, + "step": 5801, + "token_acc": 0.29328860318233785 + }, + { + "epoch": 3.401055408970976, + "grad_norm": 0.25705877239108366, + "learning_rate": 0.00028833447122386186, + "loss": 3.0755510330200195, + "step": 5802, + "token_acc": 0.2908960188352782 + }, + { + "epoch": 3.4016417472881852, + "grad_norm": 0.24310445465531763, + "learning_rate": 0.0002883288495203263, + "loss": 3.0069336891174316, + "step": 5803, + "token_acc": 0.30006954374614186 + }, + { + "epoch": 3.4022280856053944, + "grad_norm": 0.26036085263863024, + "learning_rate": 0.00028832322651737755, + "loss": 3.0563430786132812, + "step": 5804, + "token_acc": 0.2922666102902129 + }, + { + "epoch": 3.4028144239226035, + "grad_norm": 0.2539218647411148, + "learning_rate": 0.00028831760221506846, + "loss": 3.0674233436584473, + "step": 5805, + "token_acc": 0.2903041924976358 + }, + { + "epoch": 3.403400762239812, + "grad_norm": 0.2559192173872823, + "learning_rate": 0.0002883119766134519, + "loss": 3.011321544647217, + "step": 5806, + "token_acc": 0.29809091819210953 + }, + { + "epoch": 3.4039871005570213, + "grad_norm": 0.2485167035017231, + "learning_rate": 0.0002883063497125807, + "loss": 3.068293571472168, + "step": 5807, + "token_acc": 0.2907193594214129 + }, + { + "epoch": 3.4045734388742304, + "grad_norm": 0.273940327310567, + "learning_rate": 0.00028830072151250774, + "loss": 3.067835807800293, + "step": 5808, + "token_acc": 0.29086284659598055 + }, + { + "epoch": 3.4051597771914395, + "grad_norm": 0.2849778936192566, + "learning_rate": 0.00028829509201328587, + "loss": 3.0295190811157227, + "step": 5809, + "token_acc": 0.29622024035283534 + }, + { + "epoch": 3.4057461155086486, + "grad_norm": 0.23843773834340443, + "learning_rate": 0.00028828946121496797, + "loss": 3.016824722290039, + "step": 5810, + "token_acc": 0.29740385639991374 + }, + { + "epoch": 3.4063324538258577, + "grad_norm": 0.2613213078783441, + "learning_rate": 0.00028828382911760684, + "loss": 3.027890205383301, + "step": 5811, + "token_acc": 0.2944947905431921 + }, + { + "epoch": 3.4069187921430664, + "grad_norm": 0.242892891166697, + "learning_rate": 0.00028827819572125555, + "loss": 3.0233802795410156, + "step": 5812, + "token_acc": 0.29790433378707776 + }, + { + "epoch": 3.4075051304602755, + "grad_norm": 0.25230134000758875, + "learning_rate": 0.00028827256102596696, + "loss": 3.045851230621338, + "step": 5813, + "token_acc": 0.2941698609104528 + }, + { + "epoch": 3.4080914687774846, + "grad_norm": 0.22914362264866167, + "learning_rate": 0.0002882669250317939, + "loss": 3.0619325637817383, + "step": 5814, + "token_acc": 0.2926485768470581 + }, + { + "epoch": 3.4086778070946937, + "grad_norm": 0.22218817650477204, + "learning_rate": 0.00028826128773878944, + "loss": 3.0591695308685303, + "step": 5815, + "token_acc": 0.2914596677602458 + }, + { + "epoch": 3.409264145411903, + "grad_norm": 0.26955409295406296, + "learning_rate": 0.00028825564914700656, + "loss": 3.0590620040893555, + "step": 5816, + "token_acc": 0.2925702816590225 + }, + { + "epoch": 3.4098504837291115, + "grad_norm": 0.2487046297552632, + "learning_rate": 0.0002882500092564981, + "loss": 3.032207489013672, + "step": 5817, + "token_acc": 0.2959240394020214 + }, + { + "epoch": 3.4104368220463206, + "grad_norm": 0.23897004823638518, + "learning_rate": 0.0002882443680673171, + "loss": 3.050837755203247, + "step": 5818, + "token_acc": 0.29104751470541307 + }, + { + "epoch": 3.4110231603635297, + "grad_norm": 0.23104599012530924, + "learning_rate": 0.0002882387255795165, + "loss": 3.066831111907959, + "step": 5819, + "token_acc": 0.292207307382579 + }, + { + "epoch": 3.411609498680739, + "grad_norm": 0.24669436839663125, + "learning_rate": 0.0002882330817931494, + "loss": 3.068368434906006, + "step": 5820, + "token_acc": 0.2907287275140918 + }, + { + "epoch": 3.412195836997948, + "grad_norm": 0.26568606076028456, + "learning_rate": 0.0002882274367082688, + "loss": 3.0582644939422607, + "step": 5821, + "token_acc": 0.2910883895851363 + }, + { + "epoch": 3.412782175315157, + "grad_norm": 0.23184923362725787, + "learning_rate": 0.00028822179032492764, + "loss": 3.048902988433838, + "step": 5822, + "token_acc": 0.2922377836706955 + }, + { + "epoch": 3.4133685136323657, + "grad_norm": 0.28367765940184414, + "learning_rate": 0.00028821614264317905, + "loss": 3.071821689605713, + "step": 5823, + "token_acc": 0.2891183720670731 + }, + { + "epoch": 3.413954851949575, + "grad_norm": 0.25711330196030424, + "learning_rate": 0.00028821049366307603, + "loss": 3.051515579223633, + "step": 5824, + "token_acc": 0.29348250773503415 + }, + { + "epoch": 3.414541190266784, + "grad_norm": 0.26874177662288773, + "learning_rate": 0.0002882048433846717, + "loss": 2.99855375289917, + "step": 5825, + "token_acc": 0.29907275709033815 + }, + { + "epoch": 3.415127528583993, + "grad_norm": 0.2642349257327431, + "learning_rate": 0.00028819919180801906, + "loss": 3.0628440380096436, + "step": 5826, + "token_acc": 0.2898326705586295 + }, + { + "epoch": 3.415713866901202, + "grad_norm": 0.24727393470939749, + "learning_rate": 0.00028819353893317127, + "loss": 3.074979305267334, + "step": 5827, + "token_acc": 0.2889181754794615 + }, + { + "epoch": 3.416300205218411, + "grad_norm": 0.24692703652006645, + "learning_rate": 0.0002881878847601814, + "loss": 3.1171715259552, + "step": 5828, + "token_acc": 0.2854893244994049 + }, + { + "epoch": 3.41688654353562, + "grad_norm": 0.23445281015238778, + "learning_rate": 0.0002881822292891025, + "loss": 3.0061609745025635, + "step": 5829, + "token_acc": 0.29941642774615945 + }, + { + "epoch": 3.417472881852829, + "grad_norm": 0.27225845059984427, + "learning_rate": 0.0002881765725199878, + "loss": 3.057243824005127, + "step": 5830, + "token_acc": 0.2914988079696506 + }, + { + "epoch": 3.418059220170038, + "grad_norm": 0.2790428769658944, + "learning_rate": 0.0002881709144528904, + "loss": 3.092606782913208, + "step": 5831, + "token_acc": 0.28773975893436954 + }, + { + "epoch": 3.4186455584872473, + "grad_norm": 0.23227715263586404, + "learning_rate": 0.00028816525508786343, + "loss": 3.0575666427612305, + "step": 5832, + "token_acc": 0.2922832845506784 + }, + { + "epoch": 3.4192318968044564, + "grad_norm": 0.23472979976893169, + "learning_rate": 0.00028815959442496006, + "loss": 3.0768470764160156, + "step": 5833, + "token_acc": 0.29000573013595127 + }, + { + "epoch": 3.419818235121665, + "grad_norm": 0.23170259503761328, + "learning_rate": 0.0002881539324642335, + "loss": 3.1084694862365723, + "step": 5834, + "token_acc": 0.287280346869193 + }, + { + "epoch": 3.420404573438874, + "grad_norm": 0.26051720746459, + "learning_rate": 0.0002881482692057369, + "loss": 3.0714192390441895, + "step": 5835, + "token_acc": 0.29107842412595164 + }, + { + "epoch": 3.4209909117560833, + "grad_norm": 0.2896442498795606, + "learning_rate": 0.0002881426046495235, + "loss": 3.0418190956115723, + "step": 5836, + "token_acc": 0.2952998303942752 + }, + { + "epoch": 3.4215772500732924, + "grad_norm": 0.24052865986578006, + "learning_rate": 0.00028813693879564645, + "loss": 3.0656003952026367, + "step": 5837, + "token_acc": 0.29282590300645334 + }, + { + "epoch": 3.4221635883905015, + "grad_norm": 0.2573545566609396, + "learning_rate": 0.000288131271644159, + "loss": 3.0443947315216064, + "step": 5838, + "token_acc": 0.2955301688788596 + }, + { + "epoch": 3.42274992670771, + "grad_norm": 0.2575837960632782, + "learning_rate": 0.0002881256031951144, + "loss": 3.0550546646118164, + "step": 5839, + "token_acc": 0.29212435957511257 + }, + { + "epoch": 3.4233362650249193, + "grad_norm": 0.23076685553126816, + "learning_rate": 0.0002881199334485659, + "loss": 3.037919521331787, + "step": 5840, + "token_acc": 0.29452560999121696 + }, + { + "epoch": 3.4239226033421284, + "grad_norm": 0.22874785367810943, + "learning_rate": 0.0002881142624045667, + "loss": 3.039004325866699, + "step": 5841, + "token_acc": 0.294684937966596 + }, + { + "epoch": 3.4245089416593375, + "grad_norm": 0.2433164491185596, + "learning_rate": 0.0002881085900631701, + "loss": 3.052949905395508, + "step": 5842, + "token_acc": 0.293342659413427 + }, + { + "epoch": 3.4250952799765466, + "grad_norm": 0.24340437545935076, + "learning_rate": 0.00028810291642442944, + "loss": 3.073596954345703, + "step": 5843, + "token_acc": 0.28954056903681236 + }, + { + "epoch": 3.4256816182937557, + "grad_norm": 0.26978918489968845, + "learning_rate": 0.000288097241488398, + "loss": 3.0020980834960938, + "step": 5844, + "token_acc": 0.29992178816943155 + }, + { + "epoch": 3.4262679566109644, + "grad_norm": 0.2399314730757761, + "learning_rate": 0.000288091565255129, + "loss": 3.044473886489868, + "step": 5845, + "token_acc": 0.29330799676857183 + }, + { + "epoch": 3.4268542949281735, + "grad_norm": 0.2454815945638472, + "learning_rate": 0.0002880858877246759, + "loss": 3.0374042987823486, + "step": 5846, + "token_acc": 0.29471217379653153 + }, + { + "epoch": 3.4274406332453826, + "grad_norm": 0.26722477519949106, + "learning_rate": 0.0002880802088970919, + "loss": 3.073298692703247, + "step": 5847, + "token_acc": 0.29004139184366723 + }, + { + "epoch": 3.4280269715625917, + "grad_norm": 0.2691588911379231, + "learning_rate": 0.00028807452877243044, + "loss": 3.027499198913574, + "step": 5848, + "token_acc": 0.29598676676421937 + }, + { + "epoch": 3.4286133098798004, + "grad_norm": 0.28206833537797543, + "learning_rate": 0.0002880688473507448, + "loss": 3.055067777633667, + "step": 5849, + "token_acc": 0.2918902113139286 + }, + { + "epoch": 3.4291996481970095, + "grad_norm": 0.2694115146515316, + "learning_rate": 0.0002880631646320884, + "loss": 3.0189852714538574, + "step": 5850, + "token_acc": 0.298081181012304 + }, + { + "epoch": 3.4297859865142186, + "grad_norm": 0.23840540556512652, + "learning_rate": 0.0002880574806165146, + "loss": 3.0310802459716797, + "step": 5851, + "token_acc": 0.2966639088016614 + }, + { + "epoch": 3.4303723248314277, + "grad_norm": 0.2704564189955994, + "learning_rate": 0.0002880517953040768, + "loss": 3.0325770378112793, + "step": 5852, + "token_acc": 0.2945689368217828 + }, + { + "epoch": 3.430958663148637, + "grad_norm": 0.2749233441313519, + "learning_rate": 0.00028804610869482845, + "loss": 3.0563511848449707, + "step": 5853, + "token_acc": 0.2927565179153267 + }, + { + "epoch": 3.431545001465846, + "grad_norm": 0.2763503785829167, + "learning_rate": 0.00028804042078882293, + "loss": 2.9992480278015137, + "step": 5854, + "token_acc": 0.30080083941549524 + }, + { + "epoch": 3.432131339783055, + "grad_norm": 0.26868046563336107, + "learning_rate": 0.0002880347315861137, + "loss": 3.0626468658447266, + "step": 5855, + "token_acc": 0.28909923971602636 + }, + { + "epoch": 3.4327176781002637, + "grad_norm": 0.2791451853659684, + "learning_rate": 0.0002880290410867541, + "loss": 3.02744722366333, + "step": 5856, + "token_acc": 0.29619501784106445 + }, + { + "epoch": 3.433304016417473, + "grad_norm": 0.2500946776805958, + "learning_rate": 0.00028802334929079766, + "loss": 3.028989791870117, + "step": 5857, + "token_acc": 0.29604928295193966 + }, + { + "epoch": 3.433890354734682, + "grad_norm": 0.2559625729561729, + "learning_rate": 0.00028801765619829785, + "loss": 3.040222644805908, + "step": 5858, + "token_acc": 0.2959968505351712 + }, + { + "epoch": 3.434476693051891, + "grad_norm": 0.2512195670071527, + "learning_rate": 0.00028801196180930816, + "loss": 3.0709176063537598, + "step": 5859, + "token_acc": 0.2913766602080469 + }, + { + "epoch": 3.4350630313690997, + "grad_norm": 0.2748623426259876, + "learning_rate": 0.0002880062661238821, + "loss": 3.0267386436462402, + "step": 5860, + "token_acc": 0.29692791800823465 + }, + { + "epoch": 3.435649369686309, + "grad_norm": 0.2499096517692109, + "learning_rate": 0.00028800056914207305, + "loss": 3.0328783988952637, + "step": 5861, + "token_acc": 0.2947235063897148 + }, + { + "epoch": 3.436235708003518, + "grad_norm": 0.2592893364459033, + "learning_rate": 0.00028799487086393464, + "loss": 3.0702638626098633, + "step": 5862, + "token_acc": 0.29017857142857145 + }, + { + "epoch": 3.436822046320727, + "grad_norm": 0.24648233462216657, + "learning_rate": 0.0002879891712895204, + "loss": 3.018019676208496, + "step": 5863, + "token_acc": 0.2975871521071483 + }, + { + "epoch": 3.437408384637936, + "grad_norm": 0.2594739915318762, + "learning_rate": 0.0002879834704188838, + "loss": 3.0093398094177246, + "step": 5864, + "token_acc": 0.2979458510391926 + }, + { + "epoch": 3.4379947229551453, + "grad_norm": 0.2596504109653254, + "learning_rate": 0.00028797776825207846, + "loss": 3.0316262245178223, + "step": 5865, + "token_acc": 0.29510336757248734 + }, + { + "epoch": 3.438581061272354, + "grad_norm": 0.24154017143289508, + "learning_rate": 0.0002879720647891579, + "loss": 3.0468475818634033, + "step": 5866, + "token_acc": 0.29433976263093325 + }, + { + "epoch": 3.439167399589563, + "grad_norm": 0.2669644616070634, + "learning_rate": 0.0002879663600301757, + "loss": 3.0520689487457275, + "step": 5867, + "token_acc": 0.29263616234336537 + }, + { + "epoch": 3.439753737906772, + "grad_norm": 0.2691172509210264, + "learning_rate": 0.0002879606539751855, + "loss": 3.044283151626587, + "step": 5868, + "token_acc": 0.295510801225006 + }, + { + "epoch": 3.4403400762239813, + "grad_norm": 0.27784847936437657, + "learning_rate": 0.00028795494662424084, + "loss": 3.035041570663452, + "step": 5869, + "token_acc": 0.2961496653736306 + }, + { + "epoch": 3.4409264145411904, + "grad_norm": 0.30885190304983895, + "learning_rate": 0.00028794923797739535, + "loss": 3.063432216644287, + "step": 5870, + "token_acc": 0.2912897316041478 + }, + { + "epoch": 3.441512752858399, + "grad_norm": 0.32053917559752154, + "learning_rate": 0.00028794352803470264, + "loss": 3.0718631744384766, + "step": 5871, + "token_acc": 0.2894765630697742 + }, + { + "epoch": 3.442099091175608, + "grad_norm": 0.23420626035982176, + "learning_rate": 0.0002879378167962164, + "loss": 3.0366263389587402, + "step": 5872, + "token_acc": 0.2949868498181799 + }, + { + "epoch": 3.4426854294928173, + "grad_norm": 0.2615196442401533, + "learning_rate": 0.00028793210426199023, + "loss": 3.0659379959106445, + "step": 5873, + "token_acc": 0.2898499918900394 + }, + { + "epoch": 3.4432717678100264, + "grad_norm": 0.27826540370892555, + "learning_rate": 0.00028792639043207776, + "loss": 3.0123090744018555, + "step": 5874, + "token_acc": 0.29779938587512794 + }, + { + "epoch": 3.4438581061272355, + "grad_norm": 0.24310037189950007, + "learning_rate": 0.00028792067530653275, + "loss": 3.045793294906616, + "step": 5875, + "token_acc": 0.2932456546340315 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 0.25718413845866805, + "learning_rate": 0.00028791495888540885, + "loss": 3.011349678039551, + "step": 5876, + "token_acc": 0.29804031205900366 + }, + { + "epoch": 3.4450307827616533, + "grad_norm": 0.25408892969096114, + "learning_rate": 0.00028790924116875975, + "loss": 3.032029628753662, + "step": 5877, + "token_acc": 0.29645307178928915 + }, + { + "epoch": 3.4456171210788624, + "grad_norm": 0.27978118498543253, + "learning_rate": 0.0002879035221566392, + "loss": 3.0299038887023926, + "step": 5878, + "token_acc": 0.2972638233922371 + }, + { + "epoch": 3.4462034593960715, + "grad_norm": 0.2557515299841084, + "learning_rate": 0.00028789780184910086, + "loss": 3.052673578262329, + "step": 5879, + "token_acc": 0.29290658617997367 + }, + { + "epoch": 3.4467897977132806, + "grad_norm": 0.26183557995586265, + "learning_rate": 0.00028789208024619845, + "loss": 3.0412583351135254, + "step": 5880, + "token_acc": 0.2939765989285149 + }, + { + "epoch": 3.4473761360304898, + "grad_norm": 0.26491239250678933, + "learning_rate": 0.00028788635734798584, + "loss": 3.0487637519836426, + "step": 5881, + "token_acc": 0.2930065519327558 + }, + { + "epoch": 3.4479624743476984, + "grad_norm": 0.2678529327624325, + "learning_rate": 0.00028788063315451657, + "loss": 2.993980646133423, + "step": 5882, + "token_acc": 0.29996914135829456 + }, + { + "epoch": 3.4485488126649075, + "grad_norm": 0.2692288495555472, + "learning_rate": 0.0002878749076658447, + "loss": 3.031723737716675, + "step": 5883, + "token_acc": 0.295772705868886 + }, + { + "epoch": 3.4491351509821166, + "grad_norm": 0.2662074579299492, + "learning_rate": 0.0002878691808820238, + "loss": 3.0756757259368896, + "step": 5884, + "token_acc": 0.28954685137310376 + }, + { + "epoch": 3.4497214892993258, + "grad_norm": 0.2426766661685805, + "learning_rate": 0.0002878634528031077, + "loss": 3.0857162475585938, + "step": 5885, + "token_acc": 0.2879945220970616 + }, + { + "epoch": 3.450307827616535, + "grad_norm": 0.2884759063861678, + "learning_rate": 0.0002878577234291503, + "loss": 3.028648853302002, + "step": 5886, + "token_acc": 0.29562079986574824 + }, + { + "epoch": 3.450894165933744, + "grad_norm": 0.24191478936049615, + "learning_rate": 0.0002878519927602053, + "loss": 3.031815528869629, + "step": 5887, + "token_acc": 0.29497135212168807 + }, + { + "epoch": 3.4514805042509527, + "grad_norm": 0.25276509775927886, + "learning_rate": 0.00028784626079632656, + "loss": 3.034696102142334, + "step": 5888, + "token_acc": 0.2964767586704255 + }, + { + "epoch": 3.4520668425681618, + "grad_norm": 0.23114674582758835, + "learning_rate": 0.000287840527537568, + "loss": 3.085292339324951, + "step": 5889, + "token_acc": 0.28957636342728166 + }, + { + "epoch": 3.452653180885371, + "grad_norm": 0.2580610485049468, + "learning_rate": 0.00028783479298398343, + "loss": 3.0597164630889893, + "step": 5890, + "token_acc": 0.29291268337230847 + }, + { + "epoch": 3.45323951920258, + "grad_norm": 0.25223113378033046, + "learning_rate": 0.00028782905713562666, + "loss": 3.0435709953308105, + "step": 5891, + "token_acc": 0.2927430767236124 + }, + { + "epoch": 3.453825857519789, + "grad_norm": 0.24817996634538592, + "learning_rate": 0.0002878233199925517, + "loss": 3.077256679534912, + "step": 5892, + "token_acc": 0.2898922789488427 + }, + { + "epoch": 3.4544121958369978, + "grad_norm": 0.23887029876657756, + "learning_rate": 0.00028781758155481234, + "loss": 3.069082260131836, + "step": 5893, + "token_acc": 0.29109115481551445 + }, + { + "epoch": 3.454998534154207, + "grad_norm": 0.23187766522503486, + "learning_rate": 0.00028781184182246245, + "loss": 3.0224597454071045, + "step": 5894, + "token_acc": 0.296214113800002 + }, + { + "epoch": 3.455584872471416, + "grad_norm": 0.23618136789972474, + "learning_rate": 0.00028780610079555613, + "loss": 3.049370765686035, + "step": 5895, + "token_acc": 0.2944413184149588 + }, + { + "epoch": 3.456171210788625, + "grad_norm": 0.2623386269801366, + "learning_rate": 0.00028780035847414707, + "loss": 3.0724129676818848, + "step": 5896, + "token_acc": 0.29136701014181776 + }, + { + "epoch": 3.456757549105834, + "grad_norm": 0.268668723929693, + "learning_rate": 0.0002877946148582894, + "loss": 3.0217061042785645, + "step": 5897, + "token_acc": 0.29571284072930987 + }, + { + "epoch": 3.4573438874230433, + "grad_norm": 0.25448000215348926, + "learning_rate": 0.000287788869948037, + "loss": 3.0349197387695312, + "step": 5898, + "token_acc": 0.29477731755690173 + }, + { + "epoch": 3.457930225740252, + "grad_norm": 0.2477400223229597, + "learning_rate": 0.00028778312374344383, + "loss": 3.068495750427246, + "step": 5899, + "token_acc": 0.28919265603337696 + }, + { + "epoch": 3.458516564057461, + "grad_norm": 0.2507772145370304, + "learning_rate": 0.0002877773762445639, + "loss": 3.043917655944824, + "step": 5900, + "token_acc": 0.292791815546593 + }, + { + "epoch": 3.45910290237467, + "grad_norm": 0.2707494778436932, + "learning_rate": 0.00028777162745145113, + "loss": 3.101447820663452, + "step": 5901, + "token_acc": 0.28565676254472244 + }, + { + "epoch": 3.4596892406918793, + "grad_norm": 0.24496554003226576, + "learning_rate": 0.00028776587736415956, + "loss": 3.0539588928222656, + "step": 5902, + "token_acc": 0.29305885472185095 + }, + { + "epoch": 3.460275579009088, + "grad_norm": 0.27185883693454654, + "learning_rate": 0.0002877601259827433, + "loss": 3.05627179145813, + "step": 5903, + "token_acc": 0.29261279406201757 + }, + { + "epoch": 3.460861917326297, + "grad_norm": 0.26695093122312963, + "learning_rate": 0.0002877543733072562, + "loss": 3.0789635181427, + "step": 5904, + "token_acc": 0.288932958324222 + }, + { + "epoch": 3.4614482556435062, + "grad_norm": 0.2316119571777939, + "learning_rate": 0.00028774861933775247, + "loss": 3.05426025390625, + "step": 5905, + "token_acc": 0.29273637090160726 + }, + { + "epoch": 3.4620345939607153, + "grad_norm": 0.2595742366141812, + "learning_rate": 0.000287742864074286, + "loss": 3.063587188720703, + "step": 5906, + "token_acc": 0.292629691899447 + }, + { + "epoch": 3.4626209322779244, + "grad_norm": 0.25404864645851666, + "learning_rate": 0.000287737107516911, + "loss": 3.121598482131958, + "step": 5907, + "token_acc": 0.2835095330815572 + }, + { + "epoch": 3.4632072705951336, + "grad_norm": 0.2458007753746523, + "learning_rate": 0.0002877313496656814, + "loss": 3.0921287536621094, + "step": 5908, + "token_acc": 0.28712553335421565 + }, + { + "epoch": 3.4637936089123427, + "grad_norm": 0.2609605527895886, + "learning_rate": 0.00028772559052065143, + "loss": 3.0306708812713623, + "step": 5909, + "token_acc": 0.2935485128445016 + }, + { + "epoch": 3.4643799472295513, + "grad_norm": 0.23186701590313555, + "learning_rate": 0.0002877198300818751, + "loss": 3.020260810852051, + "step": 5910, + "token_acc": 0.29718920467120596 + }, + { + "epoch": 3.4649662855467604, + "grad_norm": 0.24107468527183554, + "learning_rate": 0.00028771406834940654, + "loss": 3.0536727905273438, + "step": 5911, + "token_acc": 0.2923175422936326 + }, + { + "epoch": 3.4655526238639696, + "grad_norm": 0.2414627106913437, + "learning_rate": 0.0002877083053232999, + "loss": 3.036865711212158, + "step": 5912, + "token_acc": 0.29618454114363907 + }, + { + "epoch": 3.4661389621811787, + "grad_norm": 0.2647117429720385, + "learning_rate": 0.00028770254100360935, + "loss": 3.0682530403137207, + "step": 5913, + "token_acc": 0.289561285434483 + }, + { + "epoch": 3.4667253004983873, + "grad_norm": 0.2646567119144457, + "learning_rate": 0.0002876967753903889, + "loss": 3.0637874603271484, + "step": 5914, + "token_acc": 0.29164232362318804 + }, + { + "epoch": 3.4673116388155965, + "grad_norm": 0.27277655166265846, + "learning_rate": 0.00028769100848369283, + "loss": 3.031582832336426, + "step": 5915, + "token_acc": 0.29534007282727714 + }, + { + "epoch": 3.4678979771328056, + "grad_norm": 0.26319274897690587, + "learning_rate": 0.00028768524028357524, + "loss": 3.096741199493408, + "step": 5916, + "token_acc": 0.2869297964354701 + }, + { + "epoch": 3.4684843154500147, + "grad_norm": 0.2648562447024931, + "learning_rate": 0.0002876794707900904, + "loss": 3.0464441776275635, + "step": 5917, + "token_acc": 0.2949394284530959 + }, + { + "epoch": 3.469070653767224, + "grad_norm": 0.27139623773464505, + "learning_rate": 0.0002876737000032924, + "loss": 3.0862877368927, + "step": 5918, + "token_acc": 0.28685045524423763 + }, + { + "epoch": 3.469656992084433, + "grad_norm": 0.25503525257966664, + "learning_rate": 0.0002876679279232355, + "loss": 3.0170810222625732, + "step": 5919, + "token_acc": 0.29789266507039075 + }, + { + "epoch": 3.4702433304016416, + "grad_norm": 0.2535248699632488, + "learning_rate": 0.000287662154549974, + "loss": 3.0863428115844727, + "step": 5920, + "token_acc": 0.2878763462519808 + }, + { + "epoch": 3.4708296687188507, + "grad_norm": 0.28622616738001055, + "learning_rate": 0.000287656379883562, + "loss": 3.088747978210449, + "step": 5921, + "token_acc": 0.28750070019979035 + }, + { + "epoch": 3.47141600703606, + "grad_norm": 0.29018433652843806, + "learning_rate": 0.00028765060392405386, + "loss": 3.0412755012512207, + "step": 5922, + "token_acc": 0.2930465934408518 + }, + { + "epoch": 3.472002345353269, + "grad_norm": 0.32979870973476266, + "learning_rate": 0.00028764482667150375, + "loss": 3.0308361053466797, + "step": 5923, + "token_acc": 0.29535160131364846 + }, + { + "epoch": 3.472588683670478, + "grad_norm": 0.2832147559924382, + "learning_rate": 0.00028763904812596596, + "loss": 3.0753393173217773, + "step": 5924, + "token_acc": 0.28956547650347936 + }, + { + "epoch": 3.4731750219876867, + "grad_norm": 0.2401684571670994, + "learning_rate": 0.0002876332682874948, + "loss": 3.0678439140319824, + "step": 5925, + "token_acc": 0.29052488107044555 + }, + { + "epoch": 3.473761360304896, + "grad_norm": 0.2997968624275349, + "learning_rate": 0.00028762748715614457, + "loss": 3.070957899093628, + "step": 5926, + "token_acc": 0.2897340367671054 + }, + { + "epoch": 3.474347698622105, + "grad_norm": 0.27538808915267676, + "learning_rate": 0.0002876217047319695, + "loss": 3.069878339767456, + "step": 5927, + "token_acc": 0.2901674851833927 + }, + { + "epoch": 3.474934036939314, + "grad_norm": 0.2471993620422374, + "learning_rate": 0.000287615921015024, + "loss": 3.0317611694335938, + "step": 5928, + "token_acc": 0.29653311464134074 + }, + { + "epoch": 3.475520375256523, + "grad_norm": 0.28196410551914386, + "learning_rate": 0.00028761013600536235, + "loss": 3.052868366241455, + "step": 5929, + "token_acc": 0.2924947789431502 + }, + { + "epoch": 3.4761067135737322, + "grad_norm": 0.2703426575137317, + "learning_rate": 0.0002876043497030389, + "loss": 3.0276036262512207, + "step": 5930, + "token_acc": 0.2970279007851341 + }, + { + "epoch": 3.476693051890941, + "grad_norm": 0.22786254412127804, + "learning_rate": 0.000287598562108108, + "loss": 3.0489706993103027, + "step": 5931, + "token_acc": 0.29315776546097244 + }, + { + "epoch": 3.47727939020815, + "grad_norm": 0.27985972476827925, + "learning_rate": 0.00028759277322062406, + "loss": 3.054405689239502, + "step": 5932, + "token_acc": 0.2930033433695034 + }, + { + "epoch": 3.477865728525359, + "grad_norm": 0.2956338028463759, + "learning_rate": 0.0002875869830406414, + "loss": 3.0708632469177246, + "step": 5933, + "token_acc": 0.2894749579044169 + }, + { + "epoch": 3.4784520668425682, + "grad_norm": 0.23804576792599316, + "learning_rate": 0.00028758119156821444, + "loss": 3.0508337020874023, + "step": 5934, + "token_acc": 0.29379493148152486 + }, + { + "epoch": 3.4790384051597774, + "grad_norm": 0.2628015264478636, + "learning_rate": 0.00028757539880339757, + "loss": 3.0871341228485107, + "step": 5935, + "token_acc": 0.2888905918716632 + }, + { + "epoch": 3.479624743476986, + "grad_norm": 0.25222154213478803, + "learning_rate": 0.0002875696047462452, + "loss": 3.0031325817108154, + "step": 5936, + "token_acc": 0.2994548953098176 + }, + { + "epoch": 3.480211081794195, + "grad_norm": 0.26734110740823697, + "learning_rate": 0.00028756380939681185, + "loss": 3.0571131706237793, + "step": 5937, + "token_acc": 0.291545955936047 + }, + { + "epoch": 3.4807974201114043, + "grad_norm": 0.2609232876097213, + "learning_rate": 0.0002875580127551518, + "loss": 3.032316207885742, + "step": 5938, + "token_acc": 0.29567498386977353 + }, + { + "epoch": 3.4813837584286134, + "grad_norm": 0.25265373833681276, + "learning_rate": 0.00028755221482131964, + "loss": 3.020779609680176, + "step": 5939, + "token_acc": 0.2980172204063931 + }, + { + "epoch": 3.4819700967458225, + "grad_norm": 0.2565409135030726, + "learning_rate": 0.0002875464155953697, + "loss": 3.0442113876342773, + "step": 5940, + "token_acc": 0.29485798148449593 + }, + { + "epoch": 3.4825564350630316, + "grad_norm": 0.22982556274379223, + "learning_rate": 0.0002875406150773566, + "loss": 3.0631606578826904, + "step": 5941, + "token_acc": 0.29148491565958734 + }, + { + "epoch": 3.4831427733802403, + "grad_norm": 0.2652567222373783, + "learning_rate": 0.0002875348132673347, + "loss": 3.082474708557129, + "step": 5942, + "token_acc": 0.28916430688179123 + }, + { + "epoch": 3.4837291116974494, + "grad_norm": 0.2392306436762464, + "learning_rate": 0.0002875290101653586, + "loss": 3.044640064239502, + "step": 5943, + "token_acc": 0.29358879205466243 + }, + { + "epoch": 3.4843154500146585, + "grad_norm": 0.2324049238880952, + "learning_rate": 0.00028752320577148273, + "loss": 3.027885913848877, + "step": 5944, + "token_acc": 0.2960120228307005 + }, + { + "epoch": 3.4849017883318676, + "grad_norm": 0.24977578022755825, + "learning_rate": 0.0002875174000857617, + "loss": 3.0555901527404785, + "step": 5945, + "token_acc": 0.2918689835036682 + }, + { + "epoch": 3.4854881266490767, + "grad_norm": 0.2500482244334264, + "learning_rate": 0.00028751159310825, + "loss": 3.0905888080596924, + "step": 5946, + "token_acc": 0.2882377170377255 + }, + { + "epoch": 3.4860744649662854, + "grad_norm": 0.24127351407901693, + "learning_rate": 0.0002875057848390022, + "loss": 3.0553536415100098, + "step": 5947, + "token_acc": 0.29299905901330825 + }, + { + "epoch": 3.4866608032834945, + "grad_norm": 0.23176934339203645, + "learning_rate": 0.0002874999752780728, + "loss": 3.0293819904327393, + "step": 5948, + "token_acc": 0.29485588481169744 + }, + { + "epoch": 3.4872471416007036, + "grad_norm": 0.23665604447584632, + "learning_rate": 0.00028749416442551645, + "loss": 3.0499768257141113, + "step": 5949, + "token_acc": 0.29260113022226675 + }, + { + "epoch": 3.4878334799179127, + "grad_norm": 0.24881035131384097, + "learning_rate": 0.0002874883522813877, + "loss": 2.9956798553466797, + "step": 5950, + "token_acc": 0.3005015218685032 + }, + { + "epoch": 3.488419818235122, + "grad_norm": 0.23880786715724822, + "learning_rate": 0.0002874825388457411, + "loss": 3.072726249694824, + "step": 5951, + "token_acc": 0.28940583477377385 + }, + { + "epoch": 3.489006156552331, + "grad_norm": 0.25496339314421185, + "learning_rate": 0.00028747672411863137, + "loss": 3.072484254837036, + "step": 5952, + "token_acc": 0.29069322098812717 + }, + { + "epoch": 3.4895924948695396, + "grad_norm": 0.2534952133766439, + "learning_rate": 0.000287470908100113, + "loss": 3.0263898372650146, + "step": 5953, + "token_acc": 0.2963672616596185 + }, + { + "epoch": 3.4901788331867487, + "grad_norm": 0.2523026006795684, + "learning_rate": 0.00028746509079024076, + "loss": 3.0400609970092773, + "step": 5954, + "token_acc": 0.2964876820488828 + }, + { + "epoch": 3.490765171503958, + "grad_norm": 0.24711188754709004, + "learning_rate": 0.0002874592721890692, + "loss": 3.0278077125549316, + "step": 5955, + "token_acc": 0.29633276974746064 + }, + { + "epoch": 3.491351509821167, + "grad_norm": 0.264814195201621, + "learning_rate": 0.000287453452296653, + "loss": 3.058867931365967, + "step": 5956, + "token_acc": 0.29068854356091917 + }, + { + "epoch": 3.4919378481383756, + "grad_norm": 0.2566668418488315, + "learning_rate": 0.00028744763111304683, + "loss": 3.0230700969696045, + "step": 5957, + "token_acc": 0.2970705458826037 + }, + { + "epoch": 3.4925241864555847, + "grad_norm": 0.2546422231036557, + "learning_rate": 0.0002874418086383054, + "loss": 3.0706512928009033, + "step": 5958, + "token_acc": 0.28941153188154084 + }, + { + "epoch": 3.493110524772794, + "grad_norm": 0.2514863669605506, + "learning_rate": 0.0002874359848724834, + "loss": 3.0505502223968506, + "step": 5959, + "token_acc": 0.29373918287809564 + }, + { + "epoch": 3.493696863090003, + "grad_norm": 0.21497206556694667, + "learning_rate": 0.0002874301598156355, + "loss": 3.043281316757202, + "step": 5960, + "token_acc": 0.29333075195128616 + }, + { + "epoch": 3.494283201407212, + "grad_norm": 0.23330689966134266, + "learning_rate": 0.0002874243334678164, + "loss": 3.052555799484253, + "step": 5961, + "token_acc": 0.2937873657139637 + }, + { + "epoch": 3.494869539724421, + "grad_norm": 0.2454403579297247, + "learning_rate": 0.0002874185058290809, + "loss": 3.0593371391296387, + "step": 5962, + "token_acc": 0.2910483934016105 + }, + { + "epoch": 3.49545587804163, + "grad_norm": 0.2588920655683688, + "learning_rate": 0.0002874126768994837, + "loss": 3.0430989265441895, + "step": 5963, + "token_acc": 0.29320786696844736 + }, + { + "epoch": 3.496042216358839, + "grad_norm": 0.24877860530105753, + "learning_rate": 0.00028740684667907964, + "loss": 3.0307462215423584, + "step": 5964, + "token_acc": 0.29447868984247116 + }, + { + "epoch": 3.496628554676048, + "grad_norm": 0.28213616361096755, + "learning_rate": 0.0002874010151679233, + "loss": 3.0079731941223145, + "step": 5965, + "token_acc": 0.2994772574343936 + }, + { + "epoch": 3.497214892993257, + "grad_norm": 0.2633146501155154, + "learning_rate": 0.00028739518236606964, + "loss": 3.0615336894989014, + "step": 5966, + "token_acc": 0.291834908055976 + }, + { + "epoch": 3.4978012313104663, + "grad_norm": 0.23207466409338978, + "learning_rate": 0.0002873893482735734, + "loss": 3.026992082595825, + "step": 5967, + "token_acc": 0.29580799776439853 + }, + { + "epoch": 3.498387569627675, + "grad_norm": 0.28519796952570675, + "learning_rate": 0.00028738351289048935, + "loss": 3.0473947525024414, + "step": 5968, + "token_acc": 0.29364685829394904 + }, + { + "epoch": 3.498973907944884, + "grad_norm": 0.2416825946200029, + "learning_rate": 0.0002873776762168723, + "loss": 3.033376693725586, + "step": 5969, + "token_acc": 0.29598809156077244 + }, + { + "epoch": 3.499560246262093, + "grad_norm": 0.23988346662051854, + "learning_rate": 0.0002873718382527771, + "loss": 3.0284740924835205, + "step": 5970, + "token_acc": 0.2956214465193161 + }, + { + "epoch": 3.5001465845793023, + "grad_norm": 0.3009415079266259, + "learning_rate": 0.00028736599899825856, + "loss": 3.0454025268554688, + "step": 5971, + "token_acc": 0.29310615517263566 + }, + { + "epoch": 3.5007329228965114, + "grad_norm": 0.25916301309346185, + "learning_rate": 0.00028736015845337164, + "loss": 3.0528554916381836, + "step": 5972, + "token_acc": 0.29270067795805677 + }, + { + "epoch": 3.5013192612137205, + "grad_norm": 0.2762354568617323, + "learning_rate": 0.00028735431661817105, + "loss": 3.0423150062561035, + "step": 5973, + "token_acc": 0.294897293306191 + }, + { + "epoch": 3.5019055995309296, + "grad_norm": 0.2533931639602894, + "learning_rate": 0.0002873484734927118, + "loss": 3.037900447845459, + "step": 5974, + "token_acc": 0.29495447948278136 + }, + { + "epoch": 3.5024919378481383, + "grad_norm": 0.23825511691320314, + "learning_rate": 0.0002873426290770487, + "loss": 3.0461227893829346, + "step": 5975, + "token_acc": 0.2925557949551168 + }, + { + "epoch": 3.5030782761653474, + "grad_norm": 0.25672770206650636, + "learning_rate": 0.0002873367833712367, + "loss": 3.057312488555908, + "step": 5976, + "token_acc": 0.2931695721869054 + }, + { + "epoch": 3.5036646144825565, + "grad_norm": 0.24854848213757877, + "learning_rate": 0.00028733093637533066, + "loss": 3.0448484420776367, + "step": 5977, + "token_acc": 0.2936710310721098 + }, + { + "epoch": 3.5042509527997656, + "grad_norm": 0.2697503639510263, + "learning_rate": 0.0002873250880893855, + "loss": 3.0893564224243164, + "step": 5978, + "token_acc": 0.286661320224134 + }, + { + "epoch": 3.5048372911169743, + "grad_norm": 0.2634690388470524, + "learning_rate": 0.00028731923851345624, + "loss": 3.083617925643921, + "step": 5979, + "token_acc": 0.28845488658756446 + }, + { + "epoch": 3.5054236294341834, + "grad_norm": 0.2367165095960873, + "learning_rate": 0.00028731338764759776, + "loss": 3.0532584190368652, + "step": 5980, + "token_acc": 0.294379652036442 + }, + { + "epoch": 3.5060099677513925, + "grad_norm": 0.2692838045846597, + "learning_rate": 0.000287307535491865, + "loss": 3.0589513778686523, + "step": 5981, + "token_acc": 0.2915144889392774 + }, + { + "epoch": 3.5065963060686016, + "grad_norm": 0.22541127090290633, + "learning_rate": 0.000287301682046313, + "loss": 3.093899726867676, + "step": 5982, + "token_acc": 0.28776043355652814 + }, + { + "epoch": 3.5071826443858107, + "grad_norm": 0.25787588483034934, + "learning_rate": 0.0002872958273109967, + "loss": 3.0605220794677734, + "step": 5983, + "token_acc": 0.29029443780414926 + }, + { + "epoch": 3.50776898270302, + "grad_norm": 0.23969022336345502, + "learning_rate": 0.0002872899712859711, + "loss": 3.021665096282959, + "step": 5984, + "token_acc": 0.296673400952828 + }, + { + "epoch": 3.5083553210202285, + "grad_norm": 0.23958011755211261, + "learning_rate": 0.0002872841139712913, + "loss": 3.026945114135742, + "step": 5985, + "token_acc": 0.29573985872695585 + }, + { + "epoch": 3.5089416593374376, + "grad_norm": 0.2466205026952344, + "learning_rate": 0.0002872782553670121, + "loss": 3.064274787902832, + "step": 5986, + "token_acc": 0.2920905968920475 + }, + { + "epoch": 3.5095279976546467, + "grad_norm": 0.24783259459695123, + "learning_rate": 0.0002872723954731888, + "loss": 3.061755418777466, + "step": 5987, + "token_acc": 0.2905616307463637 + }, + { + "epoch": 3.510114335971856, + "grad_norm": 0.25587437082208775, + "learning_rate": 0.0002872665342898763, + "loss": 3.084259510040283, + "step": 5988, + "token_acc": 0.2866721283761817 + }, + { + "epoch": 3.5107006742890645, + "grad_norm": 0.22463752917409496, + "learning_rate": 0.0002872606718171296, + "loss": 3.063847541809082, + "step": 5989, + "token_acc": 0.2905544093389837 + }, + { + "epoch": 3.5112870126062736, + "grad_norm": 0.25661038000596653, + "learning_rate": 0.0002872548080550039, + "loss": 3.0100908279418945, + "step": 5990, + "token_acc": 0.2986947933016001 + }, + { + "epoch": 3.5118733509234827, + "grad_norm": 0.2601587981092489, + "learning_rate": 0.00028724894300355424, + "loss": 3.0452399253845215, + "step": 5991, + "token_acc": 0.2945054888538594 + }, + { + "epoch": 3.512459689240692, + "grad_norm": 0.2711938790603412, + "learning_rate": 0.0002872430766628357, + "loss": 3.0633351802825928, + "step": 5992, + "token_acc": 0.29376806691678453 + }, + { + "epoch": 3.513046027557901, + "grad_norm": 0.27554327738938206, + "learning_rate": 0.00028723720903290343, + "loss": 3.0365192890167236, + "step": 5993, + "token_acc": 0.29459064707441696 + }, + { + "epoch": 3.51363236587511, + "grad_norm": 0.25100098365483037, + "learning_rate": 0.00028723134011381243, + "loss": 3.0345816612243652, + "step": 5994, + "token_acc": 0.2963847018250862 + }, + { + "epoch": 3.514218704192319, + "grad_norm": 0.3186556187753884, + "learning_rate": 0.00028722546990561795, + "loss": 3.0510783195495605, + "step": 5995, + "token_acc": 0.29308762807768773 + }, + { + "epoch": 3.514805042509528, + "grad_norm": 0.27078895616893256, + "learning_rate": 0.0002872195984083751, + "loss": 3.044239044189453, + "step": 5996, + "token_acc": 0.29327975371682186 + }, + { + "epoch": 3.515391380826737, + "grad_norm": 0.2532935944964148, + "learning_rate": 0.00028721372562213907, + "loss": 3.0705838203430176, + "step": 5997, + "token_acc": 0.28930548855938426 + }, + { + "epoch": 3.515977719143946, + "grad_norm": 0.24672301301806082, + "learning_rate": 0.00028720785154696493, + "loss": 3.0779964923858643, + "step": 5998, + "token_acc": 0.28990625601840525 + }, + { + "epoch": 3.516564057461155, + "grad_norm": 0.23989597511071573, + "learning_rate": 0.0002872019761829079, + "loss": 3.0370397567749023, + "step": 5999, + "token_acc": 0.2961519227563568 + }, + { + "epoch": 3.517150395778364, + "grad_norm": 0.24303240594690975, + "learning_rate": 0.0002871960995300232, + "loss": 3.011654853820801, + "step": 6000, + "token_acc": 0.2984375996428799 + }, + { + "epoch": 3.517736734095573, + "grad_norm": 0.24291930409664628, + "learning_rate": 0.00028719022158836603, + "loss": 3.0636425018310547, + "step": 6001, + "token_acc": 0.29041886045809717 + }, + { + "epoch": 3.518323072412782, + "grad_norm": 0.24100816837384106, + "learning_rate": 0.0002871843423579916, + "loss": 3.093930721282959, + "step": 6002, + "token_acc": 0.28690452826029095 + }, + { + "epoch": 3.518909410729991, + "grad_norm": 0.23597134266408792, + "learning_rate": 0.0002871784618389552, + "loss": 3.057577610015869, + "step": 6003, + "token_acc": 0.29090918932670784 + }, + { + "epoch": 3.5194957490472003, + "grad_norm": 0.23588166175593347, + "learning_rate": 0.00028717258003131186, + "loss": 3.039311408996582, + "step": 6004, + "token_acc": 0.29509297867613243 + }, + { + "epoch": 3.5200820873644094, + "grad_norm": 0.24187707503618427, + "learning_rate": 0.00028716669693511703, + "loss": 3.1045784950256348, + "step": 6005, + "token_acc": 0.2864376804967586 + }, + { + "epoch": 3.5206684256816185, + "grad_norm": 0.24842631827574996, + "learning_rate": 0.00028716081255042593, + "loss": 3.036945104598999, + "step": 6006, + "token_acc": 0.2949622433231046 + }, + { + "epoch": 3.521254763998827, + "grad_norm": 0.2548128476514135, + "learning_rate": 0.00028715492687729385, + "loss": 2.998538017272949, + "step": 6007, + "token_acc": 0.3003548660128435 + }, + { + "epoch": 3.5218411023160363, + "grad_norm": 0.2654747126519872, + "learning_rate": 0.000287149039915776, + "loss": 3.0890626907348633, + "step": 6008, + "token_acc": 0.28603268750213046 + }, + { + "epoch": 3.5224274406332454, + "grad_norm": 0.28659579790963585, + "learning_rate": 0.00028714315166592777, + "loss": 3.0761075019836426, + "step": 6009, + "token_acc": 0.288469387227931 + }, + { + "epoch": 3.5230137789504545, + "grad_norm": 0.23289984821241425, + "learning_rate": 0.00028713726212780446, + "loss": 2.9916152954101562, + "step": 6010, + "token_acc": 0.3015373910131817 + }, + { + "epoch": 3.523600117267663, + "grad_norm": 0.2790121683273793, + "learning_rate": 0.0002871313713014613, + "loss": 3.04083251953125, + "step": 6011, + "token_acc": 0.29379644832353957 + }, + { + "epoch": 3.5241864555848723, + "grad_norm": 0.2654451312559019, + "learning_rate": 0.0002871254791869537, + "loss": 3.047785758972168, + "step": 6012, + "token_acc": 0.29437451770994627 + }, + { + "epoch": 3.5247727939020814, + "grad_norm": 0.2824283926725798, + "learning_rate": 0.000287119585784337, + "loss": 3.0639586448669434, + "step": 6013, + "token_acc": 0.29246646898050926 + }, + { + "epoch": 3.5253591322192905, + "grad_norm": 0.2381524811316031, + "learning_rate": 0.0002871136910936666, + "loss": 3.0242481231689453, + "step": 6014, + "token_acc": 0.29778548416988027 + }, + { + "epoch": 3.5259454705364996, + "grad_norm": 0.2688116046157544, + "learning_rate": 0.0002871077951149978, + "loss": 3.1000301837921143, + "step": 6015, + "token_acc": 0.28830322835883176 + }, + { + "epoch": 3.5265318088537088, + "grad_norm": 0.23981603226348366, + "learning_rate": 0.000287101897848386, + "loss": 3.041025400161743, + "step": 6016, + "token_acc": 0.29560134858367204 + }, + { + "epoch": 3.527118147170918, + "grad_norm": 0.22777269529245514, + "learning_rate": 0.0002870959992938867, + "loss": 3.055591344833374, + "step": 6017, + "token_acc": 0.2923565161151799 + }, + { + "epoch": 3.5277044854881265, + "grad_norm": 0.2551678778333477, + "learning_rate": 0.00028709009945155516, + "loss": 3.0481300354003906, + "step": 6018, + "token_acc": 0.29506038610211277 + }, + { + "epoch": 3.5282908238053357, + "grad_norm": 0.22822836879274422, + "learning_rate": 0.0002870841983214469, + "loss": 3.0820505619049072, + "step": 6019, + "token_acc": 0.2909546463165997 + }, + { + "epoch": 3.5288771621225448, + "grad_norm": 0.24655492224619516, + "learning_rate": 0.00028707829590361733, + "loss": 3.0105717182159424, + "step": 6020, + "token_acc": 0.2991955068140739 + }, + { + "epoch": 3.529463500439754, + "grad_norm": 0.2351870446391531, + "learning_rate": 0.00028707239219812183, + "loss": 3.049309730529785, + "step": 6021, + "token_acc": 0.2944839953183219 + }, + { + "epoch": 3.5300498387569625, + "grad_norm": 0.22983536743438027, + "learning_rate": 0.00028706648720501596, + "loss": 3.02833890914917, + "step": 6022, + "token_acc": 0.29668536383577865 + }, + { + "epoch": 3.5306361770741717, + "grad_norm": 0.25556293714242784, + "learning_rate": 0.00028706058092435507, + "loss": 3.0777699947357178, + "step": 6023, + "token_acc": 0.2898716707842979 + }, + { + "epoch": 3.5312225153913808, + "grad_norm": 0.23467796094888332, + "learning_rate": 0.0002870546733561948, + "loss": 3.0052690505981445, + "step": 6024, + "token_acc": 0.30090249022389504 + }, + { + "epoch": 3.53180885370859, + "grad_norm": 0.27017651121211356, + "learning_rate": 0.0002870487645005906, + "loss": 3.047358751296997, + "step": 6025, + "token_acc": 0.29392738466662677 + }, + { + "epoch": 3.532395192025799, + "grad_norm": 0.2547905977159385, + "learning_rate": 0.0002870428543575978, + "loss": 3.027613401412964, + "step": 6026, + "token_acc": 0.2969415387741027 + }, + { + "epoch": 3.532981530343008, + "grad_norm": 0.2732187865672332, + "learning_rate": 0.00028703694292727213, + "loss": 3.0565667152404785, + "step": 6027, + "token_acc": 0.29239311555709635 + }, + { + "epoch": 3.533567868660217, + "grad_norm": 0.27659542728205433, + "learning_rate": 0.00028703103020966895, + "loss": 3.061187982559204, + "step": 6028, + "token_acc": 0.2907359302852205 + }, + { + "epoch": 3.534154206977426, + "grad_norm": 0.24159084928846825, + "learning_rate": 0.00028702511620484403, + "loss": 3.0141677856445312, + "step": 6029, + "token_acc": 0.2993864310248657 + }, + { + "epoch": 3.534740545294635, + "grad_norm": 0.2501779760292787, + "learning_rate": 0.00028701920091285266, + "loss": 3.0402355194091797, + "step": 6030, + "token_acc": 0.2939756248071583 + }, + { + "epoch": 3.535326883611844, + "grad_norm": 0.2846169352221804, + "learning_rate": 0.00028701328433375063, + "loss": 3.0625858306884766, + "step": 6031, + "token_acc": 0.2926795808916406 + }, + { + "epoch": 3.535913221929053, + "grad_norm": 0.27326361739346255, + "learning_rate": 0.0002870073664675934, + "loss": 3.0844950675964355, + "step": 6032, + "token_acc": 0.28777593176575167 + }, + { + "epoch": 3.536499560246262, + "grad_norm": 0.23564983800361874, + "learning_rate": 0.0002870014473144366, + "loss": 3.061190605163574, + "step": 6033, + "token_acc": 0.2933484177082898 + }, + { + "epoch": 3.537085898563471, + "grad_norm": 0.2847364731955953, + "learning_rate": 0.00028699552687433573, + "loss": 3.021754026412964, + "step": 6034, + "token_acc": 0.2983580840828279 + }, + { + "epoch": 3.53767223688068, + "grad_norm": 0.2478977196271876, + "learning_rate": 0.0002869896051473466, + "loss": 3.0920448303222656, + "step": 6035, + "token_acc": 0.2859795631189258 + }, + { + "epoch": 3.538258575197889, + "grad_norm": 0.278030690125152, + "learning_rate": 0.0002869836821335246, + "loss": 3.0065436363220215, + "step": 6036, + "token_acc": 0.2988968307484828 + }, + { + "epoch": 3.5388449135150983, + "grad_norm": 0.24327824051212757, + "learning_rate": 0.0002869777578329256, + "loss": 3.029778003692627, + "step": 6037, + "token_acc": 0.29564744111354807 + }, + { + "epoch": 3.5394312518323074, + "grad_norm": 0.25362697753369945, + "learning_rate": 0.0002869718322456051, + "loss": 3.0354065895080566, + "step": 6038, + "token_acc": 0.2948021245893586 + }, + { + "epoch": 3.540017590149516, + "grad_norm": 0.284496003103699, + "learning_rate": 0.0002869659053716188, + "loss": 3.073519468307495, + "step": 6039, + "token_acc": 0.288876724384041 + }, + { + "epoch": 3.5406039284667252, + "grad_norm": 0.2693785458083241, + "learning_rate": 0.0002869599772110224, + "loss": 3.084733486175537, + "step": 6040, + "token_acc": 0.2879630731791189 + }, + { + "epoch": 3.5411902667839343, + "grad_norm": 0.24779997373044924, + "learning_rate": 0.00028695404776387154, + "loss": 3.0398566722869873, + "step": 6041, + "token_acc": 0.2964581771251068 + }, + { + "epoch": 3.5417766051011434, + "grad_norm": 0.2623558548958517, + "learning_rate": 0.00028694811703022197, + "loss": 3.0617852210998535, + "step": 6042, + "token_acc": 0.28977031184361784 + }, + { + "epoch": 3.542362943418352, + "grad_norm": 0.2389656006869627, + "learning_rate": 0.0002869421850101294, + "loss": 3.04427170753479, + "step": 6043, + "token_acc": 0.2934448413706571 + }, + { + "epoch": 3.5429492817355612, + "grad_norm": 0.25001300866029547, + "learning_rate": 0.0002869362517036495, + "loss": 3.058241367340088, + "step": 6044, + "token_acc": 0.29194315563718987 + }, + { + "epoch": 3.5435356200527703, + "grad_norm": 0.24123573694740974, + "learning_rate": 0.0002869303171108381, + "loss": 3.064008951187134, + "step": 6045, + "token_acc": 0.29138885521477353 + }, + { + "epoch": 3.5441219583699795, + "grad_norm": 0.23725005013032696, + "learning_rate": 0.0002869243812317508, + "loss": 3.076772928237915, + "step": 6046, + "token_acc": 0.29008356545961 + }, + { + "epoch": 3.5447082966871886, + "grad_norm": 0.24737911708835764, + "learning_rate": 0.0002869184440664435, + "loss": 3.050940752029419, + "step": 6047, + "token_acc": 0.29333133189609434 + }, + { + "epoch": 3.5452946350043977, + "grad_norm": 0.2509768615861396, + "learning_rate": 0.00028691250561497194, + "loss": 3.07908296585083, + "step": 6048, + "token_acc": 0.2881753307581387 + }, + { + "epoch": 3.545880973321607, + "grad_norm": 0.23551892199905822, + "learning_rate": 0.0002869065658773918, + "loss": 3.0442733764648438, + "step": 6049, + "token_acc": 0.292755325895967 + }, + { + "epoch": 3.5464673116388155, + "grad_norm": 0.27235314538592653, + "learning_rate": 0.000286900624853759, + "loss": 2.9942917823791504, + "step": 6050, + "token_acc": 0.299810618428786 + }, + { + "epoch": 3.5470536499560246, + "grad_norm": 0.24035301987152533, + "learning_rate": 0.00028689468254412934, + "loss": 3.0822620391845703, + "step": 6051, + "token_acc": 0.2889418302448919 + }, + { + "epoch": 3.5476399882732337, + "grad_norm": 0.2562068620424487, + "learning_rate": 0.0002868887389485586, + "loss": 3.0526232719421387, + "step": 6052, + "token_acc": 0.29242596016546574 + }, + { + "epoch": 3.548226326590443, + "grad_norm": 0.2471107313329575, + "learning_rate": 0.00028688279406710257, + "loss": 3.0399651527404785, + "step": 6053, + "token_acc": 0.2952737015752531 + }, + { + "epoch": 3.5488126649076515, + "grad_norm": 0.2641187759393293, + "learning_rate": 0.0002868768478998172, + "loss": 3.083714008331299, + "step": 6054, + "token_acc": 0.28920240271343567 + }, + { + "epoch": 3.5493990032248606, + "grad_norm": 0.27193159278858786, + "learning_rate": 0.0002868709004467582, + "loss": 3.076934337615967, + "step": 6055, + "token_acc": 0.28843141752812534 + }, + { + "epoch": 3.5499853415420697, + "grad_norm": 0.2794570739372252, + "learning_rate": 0.0002868649517079816, + "loss": 3.0379161834716797, + "step": 6056, + "token_acc": 0.29329187840468196 + }, + { + "epoch": 3.550571679859279, + "grad_norm": 0.2761733784309781, + "learning_rate": 0.0002868590016835432, + "loss": 3.037870407104492, + "step": 6057, + "token_acc": 0.2956857371484757 + }, + { + "epoch": 3.551158018176488, + "grad_norm": 0.23998662260583714, + "learning_rate": 0.0002868530503734989, + "loss": 3.030988931655884, + "step": 6058, + "token_acc": 0.2947896823080556 + }, + { + "epoch": 3.551744356493697, + "grad_norm": 0.24578866462039128, + "learning_rate": 0.0002868470977779046, + "loss": 3.050600528717041, + "step": 6059, + "token_acc": 0.2931530310629262 + }, + { + "epoch": 3.552330694810906, + "grad_norm": 0.24425527088450125, + "learning_rate": 0.0002868411438968162, + "loss": 3.0394887924194336, + "step": 6060, + "token_acc": 0.2954149571349122 + }, + { + "epoch": 3.552917033128115, + "grad_norm": 0.24016920510971262, + "learning_rate": 0.00028683518873028963, + "loss": 3.053927421569824, + "step": 6061, + "token_acc": 0.29436359380423116 + }, + { + "epoch": 3.553503371445324, + "grad_norm": 0.25789641790447304, + "learning_rate": 0.0002868292322783809, + "loss": 3.0603485107421875, + "step": 6062, + "token_acc": 0.2920824878339019 + }, + { + "epoch": 3.554089709762533, + "grad_norm": 0.22453368015172573, + "learning_rate": 0.0002868232745411459, + "loss": 3.0688514709472656, + "step": 6063, + "token_acc": 0.29177623191690466 + }, + { + "epoch": 3.554676048079742, + "grad_norm": 0.24038225929216378, + "learning_rate": 0.0002868173155186406, + "loss": 3.030489444732666, + "step": 6064, + "token_acc": 0.29739601491056344 + }, + { + "epoch": 3.555262386396951, + "grad_norm": 0.22277537484992926, + "learning_rate": 0.000286811355210921, + "loss": 3.0206336975097656, + "step": 6065, + "token_acc": 0.29686855046245575 + }, + { + "epoch": 3.55584872471416, + "grad_norm": 0.2616435465807672, + "learning_rate": 0.000286805393618043, + "loss": 3.035215377807617, + "step": 6066, + "token_acc": 0.29298452484060483 + }, + { + "epoch": 3.556435063031369, + "grad_norm": 0.268883416098163, + "learning_rate": 0.0002867994307400628, + "loss": 3.1055145263671875, + "step": 6067, + "token_acc": 0.28469374111457657 + }, + { + "epoch": 3.557021401348578, + "grad_norm": 0.2525141378296953, + "learning_rate": 0.0002867934665770362, + "loss": 3.079225540161133, + "step": 6068, + "token_acc": 0.28841556801953766 + }, + { + "epoch": 3.5576077396657872, + "grad_norm": 0.25501280086287026, + "learning_rate": 0.0002867875011290193, + "loss": 3.0819454193115234, + "step": 6069, + "token_acc": 0.28886650083349347 + }, + { + "epoch": 3.5581940779829964, + "grad_norm": 0.2500841946708259, + "learning_rate": 0.00028678153439606815, + "loss": 3.036304473876953, + "step": 6070, + "token_acc": 0.2964214742837472 + }, + { + "epoch": 3.5587804163002055, + "grad_norm": 0.24953824216749074, + "learning_rate": 0.00028677556637823887, + "loss": 3.024610757827759, + "step": 6071, + "token_acc": 0.29557669547705845 + }, + { + "epoch": 3.559366754617414, + "grad_norm": 0.26064576866086037, + "learning_rate": 0.0002867695970755874, + "loss": 3.0633652210235596, + "step": 6072, + "token_acc": 0.2897616953155338 + }, + { + "epoch": 3.5599530929346233, + "grad_norm": 0.21267759734040712, + "learning_rate": 0.00028676362648816987, + "loss": 3.0441946983337402, + "step": 6073, + "token_acc": 0.29358587272640135 + }, + { + "epoch": 3.5605394312518324, + "grad_norm": 0.2368463417229675, + "learning_rate": 0.00028675765461604237, + "loss": 3.050931453704834, + "step": 6074, + "token_acc": 0.2955909827548147 + }, + { + "epoch": 3.5611257695690415, + "grad_norm": 0.22531807206989257, + "learning_rate": 0.000286751681459261, + "loss": 3.0889341831207275, + "step": 6075, + "token_acc": 0.28889776793011385 + }, + { + "epoch": 3.56171210788625, + "grad_norm": 0.24203215325624858, + "learning_rate": 0.00028674570701788183, + "loss": 3.07252836227417, + "step": 6076, + "token_acc": 0.29175244265635614 + }, + { + "epoch": 3.5622984462034593, + "grad_norm": 0.22156446649180378, + "learning_rate": 0.00028673973129196106, + "loss": 3.0235276222229004, + "step": 6077, + "token_acc": 0.2978394061413448 + }, + { + "epoch": 3.5628847845206684, + "grad_norm": 0.25741139089566956, + "learning_rate": 0.00028673375428155476, + "loss": 3.0375866889953613, + "step": 6078, + "token_acc": 0.29657648107548124 + }, + { + "epoch": 3.5634711228378775, + "grad_norm": 0.24396014771401753, + "learning_rate": 0.000286727775986719, + "loss": 3.029989719390869, + "step": 6079, + "token_acc": 0.29733488011185977 + }, + { + "epoch": 3.5640574611550866, + "grad_norm": 0.23238009686141486, + "learning_rate": 0.0002867217964075101, + "loss": 3.030158042907715, + "step": 6080, + "token_acc": 0.296097036854846 + }, + { + "epoch": 3.5646437994722957, + "grad_norm": 0.25394760881360384, + "learning_rate": 0.0002867158155439842, + "loss": 3.055544376373291, + "step": 6081, + "token_acc": 0.2924656025530132 + }, + { + "epoch": 3.565230137789505, + "grad_norm": 0.22446081904579027, + "learning_rate": 0.00028670983339619733, + "loss": 3.011784553527832, + "step": 6082, + "token_acc": 0.29798803125755935 + }, + { + "epoch": 3.5658164761067135, + "grad_norm": 0.2591285523065901, + "learning_rate": 0.0002867038499642059, + "loss": 3.059640645980835, + "step": 6083, + "token_acc": 0.2918689477248874 + }, + { + "epoch": 3.5664028144239226, + "grad_norm": 0.2636359209238713, + "learning_rate": 0.00028669786524806594, + "loss": 3.0491106510162354, + "step": 6084, + "token_acc": 0.29392017903767254 + }, + { + "epoch": 3.5669891527411317, + "grad_norm": 0.26069772537809915, + "learning_rate": 0.00028669187924783377, + "loss": 3.0741841793060303, + "step": 6085, + "token_acc": 0.28994687101300504 + }, + { + "epoch": 3.567575491058341, + "grad_norm": 0.2645561320942874, + "learning_rate": 0.0002866858919635656, + "loss": 3.075071334838867, + "step": 6086, + "token_acc": 0.290495011014643 + }, + { + "epoch": 3.5681618293755495, + "grad_norm": 0.27668740723339474, + "learning_rate": 0.00028667990339531767, + "loss": 3.080467700958252, + "step": 6087, + "token_acc": 0.2909800239163085 + }, + { + "epoch": 3.5687481676927586, + "grad_norm": 0.26476697429168394, + "learning_rate": 0.00028667391354314617, + "loss": 3.020512104034424, + "step": 6088, + "token_acc": 0.29754867414879405 + }, + { + "epoch": 3.5693345060099677, + "grad_norm": 0.2606152162559852, + "learning_rate": 0.00028666792240710745, + "loss": 3.06704044342041, + "step": 6089, + "token_acc": 0.2900319519629266 + }, + { + "epoch": 3.569920844327177, + "grad_norm": 0.2936229181469169, + "learning_rate": 0.0002866619299872578, + "loss": 3.0341434478759766, + "step": 6090, + "token_acc": 0.29625872818274496 + }, + { + "epoch": 3.570507182644386, + "grad_norm": 0.25546281797997994, + "learning_rate": 0.00028665593628365344, + "loss": 3.059028387069702, + "step": 6091, + "token_acc": 0.2918790504867958 + }, + { + "epoch": 3.571093520961595, + "grad_norm": 0.24321044711431264, + "learning_rate": 0.0002866499412963507, + "loss": 3.0412228107452393, + "step": 6092, + "token_acc": 0.2940167280026765 + }, + { + "epoch": 3.5716798592788037, + "grad_norm": 0.25321942082991794, + "learning_rate": 0.0002866439450254059, + "loss": 3.0155866146087646, + "step": 6093, + "token_acc": 0.2977933636329109 + }, + { + "epoch": 3.572266197596013, + "grad_norm": 0.26310823569314196, + "learning_rate": 0.00028663794747087537, + "loss": 3.032715320587158, + "step": 6094, + "token_acc": 0.29454439854433573 + }, + { + "epoch": 3.572852535913222, + "grad_norm": 0.24142818841379549, + "learning_rate": 0.00028663194863281545, + "loss": 3.0557267665863037, + "step": 6095, + "token_acc": 0.291492885741487 + }, + { + "epoch": 3.573438874230431, + "grad_norm": 0.2693607352939552, + "learning_rate": 0.00028662594851128247, + "loss": 3.009385108947754, + "step": 6096, + "token_acc": 0.2995410363644438 + }, + { + "epoch": 3.5740252125476397, + "grad_norm": 0.2694324834046713, + "learning_rate": 0.0002866199471063328, + "loss": 3.0033278465270996, + "step": 6097, + "token_acc": 0.29924112657925606 + }, + { + "epoch": 3.574611550864849, + "grad_norm": 0.22317934227313005, + "learning_rate": 0.00028661394441802286, + "loss": 3.000153064727783, + "step": 6098, + "token_acc": 0.30054169363778593 + }, + { + "epoch": 3.575197889182058, + "grad_norm": 0.2239676859098204, + "learning_rate": 0.000286607940446409, + "loss": 3.0329737663269043, + "step": 6099, + "token_acc": 0.2954887061625917 + }, + { + "epoch": 3.575784227499267, + "grad_norm": 0.27554666446966947, + "learning_rate": 0.0002866019351915476, + "loss": 3.062764883041382, + "step": 6100, + "token_acc": 0.29049137083755583 + }, + { + "epoch": 3.576370565816476, + "grad_norm": 0.23410070482984183, + "learning_rate": 0.00028659592865349514, + "loss": 3.0197744369506836, + "step": 6101, + "token_acc": 0.2965598138058602 + }, + { + "epoch": 3.5769569041336853, + "grad_norm": 0.2608647107877979, + "learning_rate": 0.00028658992083230794, + "loss": 3.0443286895751953, + "step": 6102, + "token_acc": 0.29283692463302174 + }, + { + "epoch": 3.5775432424508944, + "grad_norm": 0.26284075172269095, + "learning_rate": 0.0002865839117280425, + "loss": 3.0181045532226562, + "step": 6103, + "token_acc": 0.29859938260581165 + }, + { + "epoch": 3.578129580768103, + "grad_norm": 0.2523698722126072, + "learning_rate": 0.0002865779013407553, + "loss": 3.0449748039245605, + "step": 6104, + "token_acc": 0.29365795572263104 + }, + { + "epoch": 3.578715919085312, + "grad_norm": 0.25188272983563526, + "learning_rate": 0.00028657188967050264, + "loss": 3.0365653038024902, + "step": 6105, + "token_acc": 0.2952062601318563 + }, + { + "epoch": 3.5793022574025213, + "grad_norm": 0.23478233125600992, + "learning_rate": 0.0002865658767173412, + "loss": 3.015404224395752, + "step": 6106, + "token_acc": 0.2978809273241854 + }, + { + "epoch": 3.5798885957197304, + "grad_norm": 0.2444031837063552, + "learning_rate": 0.0002865598624813274, + "loss": 3.0871124267578125, + "step": 6107, + "token_acc": 0.28788604414713664 + }, + { + "epoch": 3.580474934036939, + "grad_norm": 0.2593354618685756, + "learning_rate": 0.0002865538469625177, + "loss": 3.1010775566101074, + "step": 6108, + "token_acc": 0.28649030779581197 + }, + { + "epoch": 3.581061272354148, + "grad_norm": 0.23591893185375268, + "learning_rate": 0.00028654783016096857, + "loss": 3.031008005142212, + "step": 6109, + "token_acc": 0.29463602341527273 + }, + { + "epoch": 3.5816476106713573, + "grad_norm": 0.27312903279446565, + "learning_rate": 0.0002865418120767366, + "loss": 3.0325865745544434, + "step": 6110, + "token_acc": 0.29547954924016534 + }, + { + "epoch": 3.5822339489885664, + "grad_norm": 0.2551680244753278, + "learning_rate": 0.0002865357927098783, + "loss": 3.01479434967041, + "step": 6111, + "token_acc": 0.29834968536611123 + }, + { + "epoch": 3.5828202873057755, + "grad_norm": 0.2605300215864193, + "learning_rate": 0.0002865297720604502, + "loss": 3.0945820808410645, + "step": 6112, + "token_acc": 0.2878335495771521 + }, + { + "epoch": 3.5834066256229846, + "grad_norm": 0.23536949717069902, + "learning_rate": 0.00028652375012850885, + "loss": 3.079036235809326, + "step": 6113, + "token_acc": 0.2891864264944997 + }, + { + "epoch": 3.5839929639401937, + "grad_norm": 0.23095378462270155, + "learning_rate": 0.0002865177269141109, + "loss": 3.0468597412109375, + "step": 6114, + "token_acc": 0.2940650752108824 + }, + { + "epoch": 3.5845793022574024, + "grad_norm": 0.24711036252549537, + "learning_rate": 0.00028651170241731277, + "loss": 3.0463035106658936, + "step": 6115, + "token_acc": 0.2942721105116887 + }, + { + "epoch": 3.5851656405746115, + "grad_norm": 0.25666749978510217, + "learning_rate": 0.00028650567663817127, + "loss": 3.055063486099243, + "step": 6116, + "token_acc": 0.29176996831792723 + }, + { + "epoch": 3.5857519788918206, + "grad_norm": 0.26404471635979787, + "learning_rate": 0.0002864996495767428, + "loss": 3.0587167739868164, + "step": 6117, + "token_acc": 0.29249874361890654 + }, + { + "epoch": 3.5863383172090297, + "grad_norm": 0.2573388827350888, + "learning_rate": 0.00028649362123308404, + "loss": 3.0235342979431152, + "step": 6118, + "token_acc": 0.29753811629595406 + }, + { + "epoch": 3.5869246555262384, + "grad_norm": 0.24789261571909826, + "learning_rate": 0.0002864875916072517, + "loss": 3.0351009368896484, + "step": 6119, + "token_acc": 0.2956268646234599 + }, + { + "epoch": 3.5875109938434475, + "grad_norm": 0.24865731019031992, + "learning_rate": 0.0002864815606993023, + "loss": 3.0768585205078125, + "step": 6120, + "token_acc": 0.2896048058446079 + }, + { + "epoch": 3.5880973321606566, + "grad_norm": 0.26027644726396876, + "learning_rate": 0.00028647552850929255, + "loss": 3.017876625061035, + "step": 6121, + "token_acc": 0.2976597107691469 + }, + { + "epoch": 3.5886836704778657, + "grad_norm": 0.2438327212325603, + "learning_rate": 0.00028646949503727915, + "loss": 2.997507333755493, + "step": 6122, + "token_acc": 0.30004384720538546 + }, + { + "epoch": 3.589270008795075, + "grad_norm": 0.27601691227613323, + "learning_rate": 0.00028646346028331877, + "loss": 3.0222673416137695, + "step": 6123, + "token_acc": 0.2954030533381328 + }, + { + "epoch": 3.589856347112284, + "grad_norm": 0.22713981941577088, + "learning_rate": 0.000286457424247468, + "loss": 3.0285282135009766, + "step": 6124, + "token_acc": 0.2966337824707872 + }, + { + "epoch": 3.590442685429493, + "grad_norm": 0.2762942344038067, + "learning_rate": 0.00028645138692978364, + "loss": 3.0475244522094727, + "step": 6125, + "token_acc": 0.2931074354971802 + }, + { + "epoch": 3.5910290237467017, + "grad_norm": 0.2655786520376476, + "learning_rate": 0.00028644534833032235, + "loss": 3.0623743534088135, + "step": 6126, + "token_acc": 0.29072293600230387 + }, + { + "epoch": 3.591615362063911, + "grad_norm": 0.261517011834492, + "learning_rate": 0.0002864393084491409, + "loss": 3.0372674465179443, + "step": 6127, + "token_acc": 0.2958513931888545 + }, + { + "epoch": 3.59220170038112, + "grad_norm": 0.26388563124712244, + "learning_rate": 0.00028643326728629596, + "loss": 3.0659213066101074, + "step": 6128, + "token_acc": 0.28964239022571786 + }, + { + "epoch": 3.592788038698329, + "grad_norm": 0.27849552554377105, + "learning_rate": 0.0002864272248418444, + "loss": 3.020321846008301, + "step": 6129, + "token_acc": 0.2976649634922209 + }, + { + "epoch": 3.5933743770155377, + "grad_norm": 0.2524102745096216, + "learning_rate": 0.0002864211811158428, + "loss": 3.0469841957092285, + "step": 6130, + "token_acc": 0.2921905634781975 + }, + { + "epoch": 3.593960715332747, + "grad_norm": 0.25052004241709375, + "learning_rate": 0.0002864151361083481, + "loss": 3.0399510860443115, + "step": 6131, + "token_acc": 0.2930737139264087 + }, + { + "epoch": 3.594547053649956, + "grad_norm": 0.24639107346179698, + "learning_rate": 0.000286409089819417, + "loss": 3.0320656299591064, + "step": 6132, + "token_acc": 0.2960135419746271 + }, + { + "epoch": 3.595133391967165, + "grad_norm": 0.27523393260497503, + "learning_rate": 0.0002864030422491063, + "loss": 3.050067663192749, + "step": 6133, + "token_acc": 0.29298050544778237 + }, + { + "epoch": 3.595719730284374, + "grad_norm": 0.24652021697663548, + "learning_rate": 0.0002863969933974728, + "loss": 3.0656063556671143, + "step": 6134, + "token_acc": 0.29237881532347454 + }, + { + "epoch": 3.5963060686015833, + "grad_norm": 0.24991398233729448, + "learning_rate": 0.0002863909432645734, + "loss": 3.041727066040039, + "step": 6135, + "token_acc": 0.29450936598122934 + }, + { + "epoch": 3.5968924069187924, + "grad_norm": 0.29375214489966295, + "learning_rate": 0.0002863848918504649, + "loss": 3.0826520919799805, + "step": 6136, + "token_acc": 0.28887339376431914 + }, + { + "epoch": 3.597478745236001, + "grad_norm": 0.24540861090554375, + "learning_rate": 0.00028637883915520404, + "loss": 3.054462194442749, + "step": 6137, + "token_acc": 0.2930816191584354 + }, + { + "epoch": 3.59806508355321, + "grad_norm": 0.24773845119885868, + "learning_rate": 0.00028637278517884784, + "loss": 3.0125513076782227, + "step": 6138, + "token_acc": 0.29829153963617405 + }, + { + "epoch": 3.5986514218704193, + "grad_norm": 0.23083250538585498, + "learning_rate": 0.000286366729921453, + "loss": 3.0563793182373047, + "step": 6139, + "token_acc": 0.2930490298529291 + }, + { + "epoch": 3.5992377601876284, + "grad_norm": 0.2241803537893363, + "learning_rate": 0.00028636067338307656, + "loss": 3.068307638168335, + "step": 6140, + "token_acc": 0.28960567985821484 + }, + { + "epoch": 3.599824098504837, + "grad_norm": 0.2542748654545773, + "learning_rate": 0.0002863546155637753, + "loss": 3.0647077560424805, + "step": 6141, + "token_acc": 0.2919481435752712 + }, + { + "epoch": 3.600410436822046, + "grad_norm": 0.2548939663083021, + "learning_rate": 0.00028634855646360617, + "loss": 3.0493435859680176, + "step": 6142, + "token_acc": 0.29346120852879126 + }, + { + "epoch": 3.6009967751392553, + "grad_norm": 0.2558406809939796, + "learning_rate": 0.0002863424960826261, + "loss": 3.0604329109191895, + "step": 6143, + "token_acc": 0.29214809293079885 + }, + { + "epoch": 3.6015831134564644, + "grad_norm": 0.25751371870667433, + "learning_rate": 0.00028633643442089205, + "loss": 3.0120139122009277, + "step": 6144, + "token_acc": 0.2995056060739825 + }, + { + "epoch": 3.6021694517736735, + "grad_norm": 0.22879888964459524, + "learning_rate": 0.0002863303714784609, + "loss": 3.0437865257263184, + "step": 6145, + "token_acc": 0.29189373740848756 + }, + { + "epoch": 3.6027557900908826, + "grad_norm": 0.300761005894166, + "learning_rate": 0.00028632430725538955, + "loss": 3.063821315765381, + "step": 6146, + "token_acc": 0.29028607110798893 + }, + { + "epoch": 3.6033421284080913, + "grad_norm": 0.25000948370816856, + "learning_rate": 0.00028631824175173504, + "loss": 3.009230136871338, + "step": 6147, + "token_acc": 0.298487106773066 + }, + { + "epoch": 3.6039284667253004, + "grad_norm": 0.2568777396495953, + "learning_rate": 0.0002863121749675544, + "loss": 3.047050952911377, + "step": 6148, + "token_acc": 0.29299937485268046 + }, + { + "epoch": 3.6045148050425095, + "grad_norm": 0.25302034187186356, + "learning_rate": 0.0002863061069029045, + "loss": 3.066626787185669, + "step": 6149, + "token_acc": 0.2910793809980091 + }, + { + "epoch": 3.6051011433597187, + "grad_norm": 0.2673466253115117, + "learning_rate": 0.00028630003755784244, + "loss": 3.100741147994995, + "step": 6150, + "token_acc": 0.285629177488725 + }, + { + "epoch": 3.6056874816769273, + "grad_norm": 0.26602340384763923, + "learning_rate": 0.00028629396693242515, + "loss": 3.067643642425537, + "step": 6151, + "token_acc": 0.29079610665483024 + }, + { + "epoch": 3.6062738199941364, + "grad_norm": 0.27691833789135867, + "learning_rate": 0.00028628789502670973, + "loss": 3.0707292556762695, + "step": 6152, + "token_acc": 0.29105057754674163 + }, + { + "epoch": 3.6068601583113455, + "grad_norm": 0.2618875803035751, + "learning_rate": 0.0002862818218407532, + "loss": 3.058117151260376, + "step": 6153, + "token_acc": 0.29186998635709116 + }, + { + "epoch": 3.6074464966285547, + "grad_norm": 0.26052487365354143, + "learning_rate": 0.00028627574737461256, + "loss": 3.1006875038146973, + "step": 6154, + "token_acc": 0.28395048755884333 + }, + { + "epoch": 3.6080328349457638, + "grad_norm": 0.24535185019424482, + "learning_rate": 0.000286269671628345, + "loss": 3.036726951599121, + "step": 6155, + "token_acc": 0.2941332154768818 + }, + { + "epoch": 3.608619173262973, + "grad_norm": 0.2500746567933048, + "learning_rate": 0.00028626359460200737, + "loss": 3.077418804168701, + "step": 6156, + "token_acc": 0.2877984598119268 + }, + { + "epoch": 3.609205511580182, + "grad_norm": 0.25028097189032594, + "learning_rate": 0.00028625751629565694, + "loss": 3.0473079681396484, + "step": 6157, + "token_acc": 0.2934210004101126 + }, + { + "epoch": 3.6097918498973907, + "grad_norm": 0.237385920363267, + "learning_rate": 0.00028625143670935076, + "loss": 3.0271854400634766, + "step": 6158, + "token_acc": 0.2986032689450223 + }, + { + "epoch": 3.6103781882145998, + "grad_norm": 0.223297051926556, + "learning_rate": 0.00028624535584314593, + "loss": 3.0162267684936523, + "step": 6159, + "token_acc": 0.2980500001314548 + }, + { + "epoch": 3.610964526531809, + "grad_norm": 0.22288475084897696, + "learning_rate": 0.0002862392736970996, + "loss": 3.0254554748535156, + "step": 6160, + "token_acc": 0.2964708199848631 + }, + { + "epoch": 3.611550864849018, + "grad_norm": 0.23650007737977352, + "learning_rate": 0.00028623319027126884, + "loss": 2.998199462890625, + "step": 6161, + "token_acc": 0.30141188479612424 + }, + { + "epoch": 3.6121372031662267, + "grad_norm": 0.23374753838193305, + "learning_rate": 0.00028622710556571086, + "loss": 3.0646297931671143, + "step": 6162, + "token_acc": 0.2923029592578251 + }, + { + "epoch": 3.6127235414834358, + "grad_norm": 0.2769463797624376, + "learning_rate": 0.0002862210195804828, + "loss": 3.0488460063934326, + "step": 6163, + "token_acc": 0.29308924317005847 + }, + { + "epoch": 3.613309879800645, + "grad_norm": 0.27044672170712525, + "learning_rate": 0.0002862149323156418, + "loss": 3.015994071960449, + "step": 6164, + "token_acc": 0.2983499403446214 + }, + { + "epoch": 3.613896218117854, + "grad_norm": 0.23618806645838863, + "learning_rate": 0.0002862088437712451, + "loss": 3.086371660232544, + "step": 6165, + "token_acc": 0.28756024575649164 + }, + { + "epoch": 3.614482556435063, + "grad_norm": 0.25547032799685593, + "learning_rate": 0.00028620275394734975, + "loss": 3.124640703201294, + "step": 6166, + "token_acc": 0.2813133485648934 + }, + { + "epoch": 3.615068894752272, + "grad_norm": 0.26448708533092424, + "learning_rate": 0.00028619666284401314, + "loss": 3.0205764770507812, + "step": 6167, + "token_acc": 0.2960181026394426 + }, + { + "epoch": 3.6156552330694813, + "grad_norm": 0.24467311555360677, + "learning_rate": 0.00028619057046129243, + "loss": 3.033078193664551, + "step": 6168, + "token_acc": 0.2952841709478267 + }, + { + "epoch": 3.61624157138669, + "grad_norm": 0.25565826836216243, + "learning_rate": 0.0002861844767992448, + "loss": 3.0700271129608154, + "step": 6169, + "token_acc": 0.2894167349672028 + }, + { + "epoch": 3.616827909703899, + "grad_norm": 0.2588942798494076, + "learning_rate": 0.0002861783818579275, + "loss": 3.0236010551452637, + "step": 6170, + "token_acc": 0.29824452460149464 + }, + { + "epoch": 3.6174142480211082, + "grad_norm": 0.24866091364896437, + "learning_rate": 0.00028617228563739786, + "loss": 3.0428285598754883, + "step": 6171, + "token_acc": 0.29582905402309073 + }, + { + "epoch": 3.6180005863383173, + "grad_norm": 0.2520687335949198, + "learning_rate": 0.000286166188137713, + "loss": 3.0607612133026123, + "step": 6172, + "token_acc": 0.291403233042765 + }, + { + "epoch": 3.618586924655526, + "grad_norm": 0.2524602510040712, + "learning_rate": 0.0002861600893589304, + "loss": 3.024083137512207, + "step": 6173, + "token_acc": 0.29615397829229556 + }, + { + "epoch": 3.619173262972735, + "grad_norm": 0.2861131738090535, + "learning_rate": 0.00028615398930110716, + "loss": 3.0351860523223877, + "step": 6174, + "token_acc": 0.2958076283393287 + }, + { + "epoch": 3.6197596012899442, + "grad_norm": 0.2836115668563803, + "learning_rate": 0.0002861478879643007, + "loss": 3.125187873840332, + "step": 6175, + "token_acc": 0.2831471538973646 + }, + { + "epoch": 3.6203459396071533, + "grad_norm": 0.2561066939859495, + "learning_rate": 0.0002861417853485683, + "loss": 3.0650320053100586, + "step": 6176, + "token_acc": 0.28968109268211367 + }, + { + "epoch": 3.6209322779243625, + "grad_norm": 0.2744763830322329, + "learning_rate": 0.0002861356814539673, + "loss": 3.0637879371643066, + "step": 6177, + "token_acc": 0.2922969814768325 + }, + { + "epoch": 3.6215186162415716, + "grad_norm": 0.2894212138777676, + "learning_rate": 0.00028612957628055494, + "loss": 3.0428876876831055, + "step": 6178, + "token_acc": 0.29388227699590647 + }, + { + "epoch": 3.6221049545587807, + "grad_norm": 0.245649964513638, + "learning_rate": 0.0002861234698283887, + "loss": 3.0352120399475098, + "step": 6179, + "token_acc": 0.2947702958159276 + }, + { + "epoch": 3.6226912928759893, + "grad_norm": 0.2507215256292912, + "learning_rate": 0.00028611736209752586, + "loss": 3.030254364013672, + "step": 6180, + "token_acc": 0.2958348439819728 + }, + { + "epoch": 3.6232776311931985, + "grad_norm": 0.23100764330826834, + "learning_rate": 0.00028611125308802387, + "loss": 3.0672810077667236, + "step": 6181, + "token_acc": 0.290608982577188 + }, + { + "epoch": 3.6238639695104076, + "grad_norm": 0.27873995391702655, + "learning_rate": 0.00028610514279994, + "loss": 3.0810956954956055, + "step": 6182, + "token_acc": 0.28780670507701994 + }, + { + "epoch": 3.6244503078276167, + "grad_norm": 0.25015337156171186, + "learning_rate": 0.00028609903123333173, + "loss": 3.0174880027770996, + "step": 6183, + "token_acc": 0.2970970241546885 + }, + { + "epoch": 3.6250366461448253, + "grad_norm": 0.23399044010413797, + "learning_rate": 0.00028609291838825655, + "loss": 3.0309877395629883, + "step": 6184, + "token_acc": 0.29638418501355995 + }, + { + "epoch": 3.6256229844620345, + "grad_norm": 0.23900558143245232, + "learning_rate": 0.0002860868042647717, + "loss": 2.9930672645568848, + "step": 6185, + "token_acc": 0.30055438993325123 + }, + { + "epoch": 3.6262093227792436, + "grad_norm": 0.22857876688954804, + "learning_rate": 0.0002860806888629347, + "loss": 3.0573232173919678, + "step": 6186, + "token_acc": 0.2914137193860202 + }, + { + "epoch": 3.6267956610964527, + "grad_norm": 0.2449892276346442, + "learning_rate": 0.000286074572182803, + "loss": 3.026909828186035, + "step": 6187, + "token_acc": 0.2976281236764083 + }, + { + "epoch": 3.627381999413662, + "grad_norm": 0.2238042058608255, + "learning_rate": 0.0002860684542244341, + "loss": 3.037747621536255, + "step": 6188, + "token_acc": 0.29374566874566876 + }, + { + "epoch": 3.627968337730871, + "grad_norm": 0.2274493480014728, + "learning_rate": 0.0002860623349878854, + "loss": 3.045605182647705, + "step": 6189, + "token_acc": 0.2931885568619403 + }, + { + "epoch": 3.62855467604808, + "grad_norm": 0.22909641035388437, + "learning_rate": 0.0002860562144732145, + "loss": 3.058323860168457, + "step": 6190, + "token_acc": 0.2923006865256688 + }, + { + "epoch": 3.6291410143652887, + "grad_norm": 0.2337598865604347, + "learning_rate": 0.0002860500926804787, + "loss": 3.056485652923584, + "step": 6191, + "token_acc": 0.2907898409869107 + }, + { + "epoch": 3.629727352682498, + "grad_norm": 0.252443491103507, + "learning_rate": 0.00028604396960973564, + "loss": 3.0622525215148926, + "step": 6192, + "token_acc": 0.29132857001401896 + }, + { + "epoch": 3.630313690999707, + "grad_norm": 0.24914410942169754, + "learning_rate": 0.0002860378452610428, + "loss": 3.0539793968200684, + "step": 6193, + "token_acc": 0.2925750362136371 + }, + { + "epoch": 3.630900029316916, + "grad_norm": 0.2186735183118364, + "learning_rate": 0.00028603171963445767, + "loss": 3.0599679946899414, + "step": 6194, + "token_acc": 0.29024346396836 + }, + { + "epoch": 3.6314863676341247, + "grad_norm": 0.2340515365580505, + "learning_rate": 0.00028602559273003793, + "loss": 3.0474188327789307, + "step": 6195, + "token_acc": 0.2932076845799642 + }, + { + "epoch": 3.632072705951334, + "grad_norm": 0.21326655044349402, + "learning_rate": 0.000286019464547841, + "loss": 3.047955274581909, + "step": 6196, + "token_acc": 0.29310788383835834 + }, + { + "epoch": 3.632659044268543, + "grad_norm": 0.23455710203427743, + "learning_rate": 0.0002860133350879245, + "loss": 3.0831246376037598, + "step": 6197, + "token_acc": 0.2889303894857968 + }, + { + "epoch": 3.633245382585752, + "grad_norm": 0.26456820958452226, + "learning_rate": 0.00028600720435034596, + "loss": 3.057317018508911, + "step": 6198, + "token_acc": 0.2919823521258261 + }, + { + "epoch": 3.633831720902961, + "grad_norm": 0.25347601348141063, + "learning_rate": 0.000286001072335163, + "loss": 3.0943918228149414, + "step": 6199, + "token_acc": 0.28618876638354385 + }, + { + "epoch": 3.6344180592201702, + "grad_norm": 0.2667852238027669, + "learning_rate": 0.0002859949390424333, + "loss": 3.040538787841797, + "step": 6200, + "token_acc": 0.2953380706849001 + }, + { + "epoch": 3.635004397537379, + "grad_norm": 0.2676599133263569, + "learning_rate": 0.00028598880447221436, + "loss": 3.0291266441345215, + "step": 6201, + "token_acc": 0.2975971636933873 + }, + { + "epoch": 3.635590735854588, + "grad_norm": 0.2363339305363119, + "learning_rate": 0.00028598266862456386, + "loss": 3.0860610008239746, + "step": 6202, + "token_acc": 0.2860995459243455 + }, + { + "epoch": 3.636177074171797, + "grad_norm": 0.2609171603385685, + "learning_rate": 0.0002859765314995394, + "loss": 3.0501694679260254, + "step": 6203, + "token_acc": 0.2935008888166416 + }, + { + "epoch": 3.6367634124890063, + "grad_norm": 0.24586799200170772, + "learning_rate": 0.0002859703930971987, + "loss": 3.070349931716919, + "step": 6204, + "token_acc": 0.29098656178302196 + }, + { + "epoch": 3.637349750806215, + "grad_norm": 0.2855009590773437, + "learning_rate": 0.00028596425341759934, + "loss": 3.0581700801849365, + "step": 6205, + "token_acc": 0.2931053366921965 + }, + { + "epoch": 3.637936089123424, + "grad_norm": 0.2562927867775573, + "learning_rate": 0.00028595811246079903, + "loss": 3.0518805980682373, + "step": 6206, + "token_acc": 0.29186901555219125 + }, + { + "epoch": 3.638522427440633, + "grad_norm": 0.25392730215652387, + "learning_rate": 0.0002859519702268555, + "loss": 3.072758674621582, + "step": 6207, + "token_acc": 0.2905806954910639 + }, + { + "epoch": 3.6391087657578423, + "grad_norm": 0.2455116659310737, + "learning_rate": 0.0002859458267158264, + "loss": 3.0208051204681396, + "step": 6208, + "token_acc": 0.2975772200519298 + }, + { + "epoch": 3.6396951040750514, + "grad_norm": 0.2592505806253727, + "learning_rate": 0.0002859396819277694, + "loss": 3.0785317420959473, + "step": 6209, + "token_acc": 0.2895008534949524 + }, + { + "epoch": 3.6402814423922605, + "grad_norm": 0.22332606609230557, + "learning_rate": 0.00028593353586274235, + "loss": 3.0526304244995117, + "step": 6210, + "token_acc": 0.292312018243543 + }, + { + "epoch": 3.6408677807094696, + "grad_norm": 0.27188332554210637, + "learning_rate": 0.0002859273885208028, + "loss": 3.056244373321533, + "step": 6211, + "token_acc": 0.2927382297551789 + }, + { + "epoch": 3.6414541190266783, + "grad_norm": 0.24207319500751798, + "learning_rate": 0.00028592123990200865, + "loss": 3.0491786003112793, + "step": 6212, + "token_acc": 0.2913420431259557 + }, + { + "epoch": 3.6420404573438874, + "grad_norm": 0.24832210699905521, + "learning_rate": 0.00028591509000641766, + "loss": 3.0091922283172607, + "step": 6213, + "token_acc": 0.299232985370599 + }, + { + "epoch": 3.6426267956610965, + "grad_norm": 0.2533741810577404, + "learning_rate": 0.0002859089388340875, + "loss": 3.053891658782959, + "step": 6214, + "token_acc": 0.29287441040057327 + }, + { + "epoch": 3.6432131339783056, + "grad_norm": 0.24452458245895556, + "learning_rate": 0.000285902786385076, + "loss": 3.0840253829956055, + "step": 6215, + "token_acc": 0.28888361853400496 + }, + { + "epoch": 3.6437994722955143, + "grad_norm": 0.2260826227990935, + "learning_rate": 0.00028589663265944095, + "loss": 3.0763978958129883, + "step": 6216, + "token_acc": 0.2893275067408716 + }, + { + "epoch": 3.6443858106127234, + "grad_norm": 0.2624485734753881, + "learning_rate": 0.00028589047765724017, + "loss": 3.0182533264160156, + "step": 6217, + "token_acc": 0.2966159834940898 + }, + { + "epoch": 3.6449721489299325, + "grad_norm": 0.2575537641801003, + "learning_rate": 0.00028588432137853146, + "loss": 3.0456085205078125, + "step": 6218, + "token_acc": 0.2950331841977033 + }, + { + "epoch": 3.6455584872471416, + "grad_norm": 0.23851621988862967, + "learning_rate": 0.00028587816382337266, + "loss": 3.0809545516967773, + "step": 6219, + "token_acc": 0.2894723668402083 + }, + { + "epoch": 3.6461448255643507, + "grad_norm": 0.27164660591724127, + "learning_rate": 0.0002858720049918216, + "loss": 3.0093164443969727, + "step": 6220, + "token_acc": 0.298966851510845 + }, + { + "epoch": 3.64673116388156, + "grad_norm": 0.31191312216162975, + "learning_rate": 0.0002858658448839361, + "loss": 3.0800726413726807, + "step": 6221, + "token_acc": 0.28877864656345076 + }, + { + "epoch": 3.647317502198769, + "grad_norm": 0.2893622206488456, + "learning_rate": 0.00028585968349977416, + "loss": 3.0336108207702637, + "step": 6222, + "token_acc": 0.2948400791624106 + }, + { + "epoch": 3.6479038405159776, + "grad_norm": 0.2723611348941157, + "learning_rate": 0.0002858535208393935, + "loss": 3.052973747253418, + "step": 6223, + "token_acc": 0.2920969446937878 + }, + { + "epoch": 3.6484901788331867, + "grad_norm": 0.24734548562079076, + "learning_rate": 0.0002858473569028521, + "loss": 3.0512983798980713, + "step": 6224, + "token_acc": 0.29333491180300697 + }, + { + "epoch": 3.649076517150396, + "grad_norm": 0.31121718894831507, + "learning_rate": 0.00028584119169020787, + "loss": 3.026674509048462, + "step": 6225, + "token_acc": 0.29627413221217164 + }, + { + "epoch": 3.649662855467605, + "grad_norm": 0.2438336269661788, + "learning_rate": 0.0002858350252015186, + "loss": 3.0373945236206055, + "step": 6226, + "token_acc": 0.2940394776926875 + }, + { + "epoch": 3.6502491937848136, + "grad_norm": 0.2968199929974087, + "learning_rate": 0.0002858288574368424, + "loss": 3.076096296310425, + "step": 6227, + "token_acc": 0.28928181915200984 + }, + { + "epoch": 3.6508355321020227, + "grad_norm": 0.25633311263911157, + "learning_rate": 0.0002858226883962371, + "loss": 3.0544490814208984, + "step": 6228, + "token_acc": 0.2936423559430151 + }, + { + "epoch": 3.651421870419232, + "grad_norm": 0.2495506295652773, + "learning_rate": 0.00028581651807976067, + "loss": 2.994093894958496, + "step": 6229, + "token_acc": 0.3019037762764099 + }, + { + "epoch": 3.652008208736441, + "grad_norm": 0.26531197448727034, + "learning_rate": 0.00028581034648747104, + "loss": 3.0873053073883057, + "step": 6230, + "token_acc": 0.2889724041662193 + }, + { + "epoch": 3.65259454705365, + "grad_norm": 0.3039457308766114, + "learning_rate": 0.0002858041736194262, + "loss": 3.062359094619751, + "step": 6231, + "token_acc": 0.29122916131474696 + }, + { + "epoch": 3.653180885370859, + "grad_norm": 0.3294123808697914, + "learning_rate": 0.00028579799947568414, + "loss": 3.0800154209136963, + "step": 6232, + "token_acc": 0.28829030991288457 + }, + { + "epoch": 3.6537672236880683, + "grad_norm": 0.26855439668391645, + "learning_rate": 0.00028579182405630293, + "loss": 3.0684237480163574, + "step": 6233, + "token_acc": 0.2926446032282892 + }, + { + "epoch": 3.654353562005277, + "grad_norm": 0.2744053118200358, + "learning_rate": 0.00028578564736134047, + "loss": 3.0578064918518066, + "step": 6234, + "token_acc": 0.2916688122431745 + }, + { + "epoch": 3.654939900322486, + "grad_norm": 0.2781075118299357, + "learning_rate": 0.00028577946939085485, + "loss": 3.070779800415039, + "step": 6235, + "token_acc": 0.29041626080957833 + }, + { + "epoch": 3.655526238639695, + "grad_norm": 0.28853290625949846, + "learning_rate": 0.00028577329014490405, + "loss": 3.114424467086792, + "step": 6236, + "token_acc": 0.28584262277121963 + }, + { + "epoch": 3.6561125769569043, + "grad_norm": 0.25789046307751595, + "learning_rate": 0.00028576710962354617, + "loss": 3.016322135925293, + "step": 6237, + "token_acc": 0.2966898807558528 + }, + { + "epoch": 3.656698915274113, + "grad_norm": 0.2686188031717648, + "learning_rate": 0.00028576092782683926, + "loss": 3.0384693145751953, + "step": 6238, + "token_acc": 0.2936802973977695 + }, + { + "epoch": 3.657285253591322, + "grad_norm": 0.30136181015688024, + "learning_rate": 0.0002857547447548413, + "loss": 3.026700973510742, + "step": 6239, + "token_acc": 0.2973675552062911 + }, + { + "epoch": 3.657871591908531, + "grad_norm": 0.24176732680901522, + "learning_rate": 0.0002857485604076105, + "loss": 3.014446258544922, + "step": 6240, + "token_acc": 0.298042604047223 + }, + { + "epoch": 3.6584579302257403, + "grad_norm": 0.2988757082956466, + "learning_rate": 0.0002857423747852049, + "loss": 3.0650525093078613, + "step": 6241, + "token_acc": 0.2927221157373204 + }, + { + "epoch": 3.6590442685429494, + "grad_norm": 0.2588918181012523, + "learning_rate": 0.0002857361878876826, + "loss": 3.05169677734375, + "step": 6242, + "token_acc": 0.2926155286564569 + }, + { + "epoch": 3.6596306068601585, + "grad_norm": 0.25534315713592126, + "learning_rate": 0.0002857299997151017, + "loss": 3.074030876159668, + "step": 6243, + "token_acc": 0.28976198122970415 + }, + { + "epoch": 3.660216945177367, + "grad_norm": 0.25186096541950054, + "learning_rate": 0.0002857238102675204, + "loss": 3.0542242527008057, + "step": 6244, + "token_acc": 0.2932612849748801 + }, + { + "epoch": 3.6608032834945763, + "grad_norm": 0.24200283496795183, + "learning_rate": 0.00028571761954499674, + "loss": 3.040381908416748, + "step": 6245, + "token_acc": 0.2949283973546764 + }, + { + "epoch": 3.6613896218117854, + "grad_norm": 0.27428321794677535, + "learning_rate": 0.000285711427547589, + "loss": 3.018749237060547, + "step": 6246, + "token_acc": 0.29770466090183556 + }, + { + "epoch": 3.6619759601289945, + "grad_norm": 0.2554086582709873, + "learning_rate": 0.00028570523427535517, + "loss": 3.0863633155822754, + "step": 6247, + "token_acc": 0.28892004494953777 + }, + { + "epoch": 3.6625622984462036, + "grad_norm": 0.24378981407741745, + "learning_rate": 0.00028569903972835357, + "loss": 3.0609540939331055, + "step": 6248, + "token_acc": 0.2918095956532598 + }, + { + "epoch": 3.6631486367634123, + "grad_norm": 0.25192608311835574, + "learning_rate": 0.0002856928439066423, + "loss": 3.0808660984039307, + "step": 6249, + "token_acc": 0.2868351530650158 + }, + { + "epoch": 3.6637349750806214, + "grad_norm": 0.22854578902432138, + "learning_rate": 0.0002856866468102797, + "loss": 3.0131139755249023, + "step": 6250, + "token_acc": 0.2966140073723012 + }, + { + "epoch": 3.6643213133978305, + "grad_norm": 0.23483310703814317, + "learning_rate": 0.0002856804484393239, + "loss": 3.0671486854553223, + "step": 6251, + "token_acc": 0.29119262142133606 + }, + { + "epoch": 3.6649076517150396, + "grad_norm": 0.2414880184589769, + "learning_rate": 0.00028567424879383306, + "loss": 3.033574104309082, + "step": 6252, + "token_acc": 0.2951343473223962 + }, + { + "epoch": 3.6654939900322487, + "grad_norm": 0.22063673089474994, + "learning_rate": 0.0002856680478738655, + "loss": 3.0770397186279297, + "step": 6253, + "token_acc": 0.29021020280091614 + }, + { + "epoch": 3.666080328349458, + "grad_norm": 0.24556187169892654, + "learning_rate": 0.0002856618456794794, + "loss": 3.039295196533203, + "step": 6254, + "token_acc": 0.2946949748651187 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.2231355415183732, + "learning_rate": 0.0002856556422107331, + "loss": 3.0602447986602783, + "step": 6255, + "token_acc": 0.2919747796925746 + }, + { + "epoch": 3.6672530049838756, + "grad_norm": 0.23473626656898686, + "learning_rate": 0.00028564943746768487, + "loss": 3.0048346519470215, + "step": 6256, + "token_acc": 0.2999019876483359 + }, + { + "epoch": 3.6678393433010847, + "grad_norm": 0.2298781700690723, + "learning_rate": 0.00028564323145039295, + "loss": 3.049794912338257, + "step": 6257, + "token_acc": 0.2926610376506824 + }, + { + "epoch": 3.668425681618294, + "grad_norm": 0.22078621206417584, + "learning_rate": 0.0002856370241589156, + "loss": 3.069427013397217, + "step": 6258, + "token_acc": 0.291476882514071 + }, + { + "epoch": 3.6690120199355025, + "grad_norm": 0.2496029972358602, + "learning_rate": 0.00028563081559331125, + "loss": 3.0456881523132324, + "step": 6259, + "token_acc": 0.29351389228212516 + }, + { + "epoch": 3.6695983582527116, + "grad_norm": 0.21231488870326154, + "learning_rate": 0.0002856246057536381, + "loss": 3.003896951675415, + "step": 6260, + "token_acc": 0.29941746970351196 + }, + { + "epoch": 3.6701846965699207, + "grad_norm": 0.24962407751473809, + "learning_rate": 0.0002856183946399546, + "loss": 3.067966938018799, + "step": 6261, + "token_acc": 0.29019375484517623 + }, + { + "epoch": 3.67077103488713, + "grad_norm": 0.23036958057995113, + "learning_rate": 0.00028561218225231895, + "loss": 3.085524797439575, + "step": 6262, + "token_acc": 0.2899528036506845 + }, + { + "epoch": 3.671357373204339, + "grad_norm": 0.24390954942617307, + "learning_rate": 0.00028560596859078963, + "loss": 3.068650722503662, + "step": 6263, + "token_acc": 0.2891924101322655 + }, + { + "epoch": 3.671943711521548, + "grad_norm": 0.22876964180172474, + "learning_rate": 0.000285599753655425, + "loss": 3.001950263977051, + "step": 6264, + "token_acc": 0.3002924565288801 + }, + { + "epoch": 3.672530049838757, + "grad_norm": 0.2226076018504195, + "learning_rate": 0.00028559353744628333, + "loss": 3.01513671875, + "step": 6265, + "token_acc": 0.29728039332457706 + }, + { + "epoch": 3.673116388155966, + "grad_norm": 0.23936517036321858, + "learning_rate": 0.0002855873199634232, + "loss": 3.0642800331115723, + "step": 6266, + "token_acc": 0.29035332025492727 + }, + { + "epoch": 3.673702726473175, + "grad_norm": 0.2295026491582116, + "learning_rate": 0.0002855811012069028, + "loss": 3.065189838409424, + "step": 6267, + "token_acc": 0.2913887888737821 + }, + { + "epoch": 3.674289064790384, + "grad_norm": 0.23481535624969016, + "learning_rate": 0.00028557488117678066, + "loss": 3.0329036712646484, + "step": 6268, + "token_acc": 0.29511549172029516 + }, + { + "epoch": 3.674875403107593, + "grad_norm": 0.24376107587618337, + "learning_rate": 0.00028556865987311523, + "loss": 3.007115602493286, + "step": 6269, + "token_acc": 0.30091812568833143 + }, + { + "epoch": 3.675461741424802, + "grad_norm": 0.2079329555934218, + "learning_rate": 0.00028556243729596496, + "loss": 3.0368456840515137, + "step": 6270, + "token_acc": 0.2947684725025873 + }, + { + "epoch": 3.676048079742011, + "grad_norm": 0.24340327717037627, + "learning_rate": 0.0002855562134453882, + "loss": 3.0828568935394287, + "step": 6271, + "token_acc": 0.28807240245680477 + }, + { + "epoch": 3.67663441805922, + "grad_norm": 0.2431885110915972, + "learning_rate": 0.0002855499883214435, + "loss": 3.066739082336426, + "step": 6272, + "token_acc": 0.2911608736742623 + }, + { + "epoch": 3.677220756376429, + "grad_norm": 0.22183506098538788, + "learning_rate": 0.00028554376192418935, + "loss": 3.066556453704834, + "step": 6273, + "token_acc": 0.29004406415057077 + }, + { + "epoch": 3.6778070946936383, + "grad_norm": 0.2349421391947139, + "learning_rate": 0.0002855375342536842, + "loss": 3.0259203910827637, + "step": 6274, + "token_acc": 0.29550064539922555 + }, + { + "epoch": 3.6783934330108474, + "grad_norm": 0.2391504109763641, + "learning_rate": 0.0002855313053099865, + "loss": 3.035552501678467, + "step": 6275, + "token_acc": 0.29476501234459157 + }, + { + "epoch": 3.6789797713280565, + "grad_norm": 0.2528250370642422, + "learning_rate": 0.00028552507509315485, + "loss": 3.041868209838867, + "step": 6276, + "token_acc": 0.2941566195648758 + }, + { + "epoch": 3.679566109645265, + "grad_norm": 0.2723296401651812, + "learning_rate": 0.00028551884360324776, + "loss": 3.0419082641601562, + "step": 6277, + "token_acc": 0.29392214313459675 + }, + { + "epoch": 3.6801524479624743, + "grad_norm": 0.2449935336146774, + "learning_rate": 0.0002855126108403237, + "loss": 3.024402379989624, + "step": 6278, + "token_acc": 0.2961206507843044 + }, + { + "epoch": 3.6807387862796834, + "grad_norm": 0.24396717339275592, + "learning_rate": 0.0002855063768044413, + "loss": 3.0179951190948486, + "step": 6279, + "token_acc": 0.29862773439790347 + }, + { + "epoch": 3.6813251245968925, + "grad_norm": 0.2718716812987276, + "learning_rate": 0.00028550014149565905, + "loss": 3.0523593425750732, + "step": 6280, + "token_acc": 0.2929203814570085 + }, + { + "epoch": 3.681911462914101, + "grad_norm": 0.23445317676148258, + "learning_rate": 0.00028549390491403563, + "loss": 3.0370612144470215, + "step": 6281, + "token_acc": 0.29510116621626176 + }, + { + "epoch": 3.6824978012313103, + "grad_norm": 0.26097352342735486, + "learning_rate": 0.0002854876670596295, + "loss": 3.021806240081787, + "step": 6282, + "token_acc": 0.2967319774053429 + }, + { + "epoch": 3.6830841395485194, + "grad_norm": 0.28311740173105376, + "learning_rate": 0.0002854814279324993, + "loss": 3.0518534183502197, + "step": 6283, + "token_acc": 0.2936101347242122 + }, + { + "epoch": 3.6836704778657285, + "grad_norm": 0.2570165347182792, + "learning_rate": 0.0002854751875327037, + "loss": 3.116191864013672, + "step": 6284, + "token_acc": 0.28238022245804345 + }, + { + "epoch": 3.6842568161829377, + "grad_norm": 0.2667994279990382, + "learning_rate": 0.00028546894586030125, + "loss": 3.0413732528686523, + "step": 6285, + "token_acc": 0.2934960319037588 + }, + { + "epoch": 3.6848431545001468, + "grad_norm": 0.24408015027958915, + "learning_rate": 0.0002854627029153506, + "loss": 3.0263116359710693, + "step": 6286, + "token_acc": 0.29689051109529013 + }, + { + "epoch": 3.685429492817356, + "grad_norm": 0.2382946833418734, + "learning_rate": 0.00028545645869791034, + "loss": 3.0616979598999023, + "step": 6287, + "token_acc": 0.2904464800004078 + }, + { + "epoch": 3.6860158311345645, + "grad_norm": 0.24487542150914396, + "learning_rate": 0.00028545021320803925, + "loss": 3.0452194213867188, + "step": 6288, + "token_acc": 0.2941673955885298 + }, + { + "epoch": 3.6866021694517737, + "grad_norm": 0.25132215773364, + "learning_rate": 0.00028544396644579587, + "loss": 3.0534563064575195, + "step": 6289, + "token_acc": 0.29234032512571956 + }, + { + "epoch": 3.6871885077689828, + "grad_norm": 0.26664778014370516, + "learning_rate": 0.000285437718411239, + "loss": 3.0950026512145996, + "step": 6290, + "token_acc": 0.2868106797754413 + }, + { + "epoch": 3.687774846086192, + "grad_norm": 0.26974034346403797, + "learning_rate": 0.00028543146910442723, + "loss": 3.020292043685913, + "step": 6291, + "token_acc": 0.2972572333696326 + }, + { + "epoch": 3.6883611844034006, + "grad_norm": 0.2645694897598869, + "learning_rate": 0.0002854252185254193, + "loss": 3.016523599624634, + "step": 6292, + "token_acc": 0.29942350792190797 + }, + { + "epoch": 3.6889475227206097, + "grad_norm": 0.26934204278972035, + "learning_rate": 0.00028541896667427395, + "loss": 3.0356132984161377, + "step": 6293, + "token_acc": 0.2952798444108295 + }, + { + "epoch": 3.6895338610378188, + "grad_norm": 0.26816659151293093, + "learning_rate": 0.0002854127135510499, + "loss": 3.0199217796325684, + "step": 6294, + "token_acc": 0.29616029333311666 + }, + { + "epoch": 3.690120199355028, + "grad_norm": 0.27938123406242554, + "learning_rate": 0.00028540645915580586, + "loss": 3.0653321743011475, + "step": 6295, + "token_acc": 0.2909547325525825 + }, + { + "epoch": 3.690706537672237, + "grad_norm": 0.2652623182298458, + "learning_rate": 0.00028540020348860064, + "loss": 3.029810905456543, + "step": 6296, + "token_acc": 0.29663038519194423 + }, + { + "epoch": 3.691292875989446, + "grad_norm": 0.2417985982007034, + "learning_rate": 0.00028539394654949286, + "loss": 3.0540506839752197, + "step": 6297, + "token_acc": 0.29160489535513817 + }, + { + "epoch": 3.6918792143066548, + "grad_norm": 0.24552760084472797, + "learning_rate": 0.00028538768833854145, + "loss": 3.002300262451172, + "step": 6298, + "token_acc": 0.29771533912611203 + }, + { + "epoch": 3.692465552623864, + "grad_norm": 0.27408660619844993, + "learning_rate": 0.00028538142885580517, + "loss": 3.0698652267456055, + "step": 6299, + "token_acc": 0.2936253150036925 + }, + { + "epoch": 3.693051890941073, + "grad_norm": 0.21819721342608012, + "learning_rate": 0.0002853751681013428, + "loss": 3.0275278091430664, + "step": 6300, + "token_acc": 0.2962553554820315 + }, + { + "epoch": 3.693638229258282, + "grad_norm": 0.2520690924503071, + "learning_rate": 0.0002853689060752131, + "loss": 3.0237326622009277, + "step": 6301, + "token_acc": 0.2977183692683303 + }, + { + "epoch": 3.6942245675754912, + "grad_norm": 0.2607935739738976, + "learning_rate": 0.00028536264277747496, + "loss": 3.064016342163086, + "step": 6302, + "token_acc": 0.2926070665758963 + }, + { + "epoch": 3.6948109058927, + "grad_norm": 0.2319666157670964, + "learning_rate": 0.0002853563782081872, + "loss": 3.045077323913574, + "step": 6303, + "token_acc": 0.2939700391122739 + }, + { + "epoch": 3.695397244209909, + "grad_norm": 0.252151786142626, + "learning_rate": 0.00028535011236740864, + "loss": 3.103978157043457, + "step": 6304, + "token_acc": 0.28707327524639337 + }, + { + "epoch": 3.695983582527118, + "grad_norm": 0.2446633525792926, + "learning_rate": 0.0002853438452551982, + "loss": 3.0283732414245605, + "step": 6305, + "token_acc": 0.2949966336846352 + }, + { + "epoch": 3.6965699208443272, + "grad_norm": 0.2569562495855419, + "learning_rate": 0.0002853375768716147, + "loss": 3.0257489681243896, + "step": 6306, + "token_acc": 0.29564465254074884 + }, + { + "epoch": 3.6971562591615363, + "grad_norm": 0.23603421873556632, + "learning_rate": 0.000285331307216717, + "loss": 3.0712578296661377, + "step": 6307, + "token_acc": 0.2899419215837297 + }, + { + "epoch": 3.6977425974787455, + "grad_norm": 0.24353115834599856, + "learning_rate": 0.0002853250362905641, + "loss": 3.09810209274292, + "step": 6308, + "token_acc": 0.2863721321493315 + }, + { + "epoch": 3.698328935795954, + "grad_norm": 0.23899595536552137, + "learning_rate": 0.0002853187640932148, + "loss": 3.030752658843994, + "step": 6309, + "token_acc": 0.29486384881121724 + }, + { + "epoch": 3.6989152741131632, + "grad_norm": 0.22489393718951928, + "learning_rate": 0.00028531249062472804, + "loss": 3.03317928314209, + "step": 6310, + "token_acc": 0.29577992574540296 + }, + { + "epoch": 3.6995016124303723, + "grad_norm": 0.23873722432503616, + "learning_rate": 0.00028530621588516276, + "loss": 3.0431041717529297, + "step": 6311, + "token_acc": 0.2951366577406165 + }, + { + "epoch": 3.7000879507475815, + "grad_norm": 0.2255437162308836, + "learning_rate": 0.00028529993987457794, + "loss": 2.994525671005249, + "step": 6312, + "token_acc": 0.2988028972863408 + }, + { + "epoch": 3.70067428906479, + "grad_norm": 0.22394343794989172, + "learning_rate": 0.00028529366259303246, + "loss": 3.036836862564087, + "step": 6313, + "token_acc": 0.29527252337190607 + }, + { + "epoch": 3.7012606273819992, + "grad_norm": 0.2560270260913191, + "learning_rate": 0.00028528738404058537, + "loss": 3.0621659755706787, + "step": 6314, + "token_acc": 0.2905362726210403 + }, + { + "epoch": 3.7018469656992083, + "grad_norm": 0.23908309431002314, + "learning_rate": 0.0002852811042172956, + "loss": 3.013627290725708, + "step": 6315, + "token_acc": 0.2973761163152451 + }, + { + "epoch": 3.7024333040164175, + "grad_norm": 0.2226503869397667, + "learning_rate": 0.00028527482312322216, + "loss": 3.0785021781921387, + "step": 6316, + "token_acc": 0.289618438792825 + }, + { + "epoch": 3.7030196423336266, + "grad_norm": 0.24512917350220673, + "learning_rate": 0.000285268540758424, + "loss": 3.0493693351745605, + "step": 6317, + "token_acc": 0.2942794709231362 + }, + { + "epoch": 3.7036059806508357, + "grad_norm": 0.23357419535857016, + "learning_rate": 0.0002852622571229603, + "loss": 3.0529932975769043, + "step": 6318, + "token_acc": 0.2942680721012911 + }, + { + "epoch": 3.704192318968045, + "grad_norm": 0.2504386521606627, + "learning_rate": 0.00028525597221688983, + "loss": 3.040954351425171, + "step": 6319, + "token_acc": 0.29651677488194106 + }, + { + "epoch": 3.7047786572852535, + "grad_norm": 0.2563645836142691, + "learning_rate": 0.0002852496860402718, + "loss": 3.036862373352051, + "step": 6320, + "token_acc": 0.29533590558329553 + }, + { + "epoch": 3.7053649956024626, + "grad_norm": 0.23995465300416452, + "learning_rate": 0.0002852433985931652, + "loss": 3.0738558769226074, + "step": 6321, + "token_acc": 0.2909190982956109 + }, + { + "epoch": 3.7059513339196717, + "grad_norm": 0.22603509487138204, + "learning_rate": 0.00028523710987562914, + "loss": 3.0628132820129395, + "step": 6322, + "token_acc": 0.29029722797920215 + }, + { + "epoch": 3.706537672236881, + "grad_norm": 0.25268831251955265, + "learning_rate": 0.00028523081988772263, + "loss": 3.0035760402679443, + "step": 6323, + "token_acc": 0.3014510891438464 + }, + { + "epoch": 3.7071240105540895, + "grad_norm": 0.23890538853445126, + "learning_rate": 0.00028522452862950485, + "loss": 3.036486864089966, + "step": 6324, + "token_acc": 0.29546627542933496 + }, + { + "epoch": 3.7077103488712986, + "grad_norm": 0.323565978030968, + "learning_rate": 0.00028521823610103483, + "loss": 3.0416018962860107, + "step": 6325, + "token_acc": 0.2940617951004564 + }, + { + "epoch": 3.7082966871885077, + "grad_norm": 0.24874471140667018, + "learning_rate": 0.00028521194230237166, + "loss": 3.0493903160095215, + "step": 6326, + "token_acc": 0.2933832742043971 + }, + { + "epoch": 3.708883025505717, + "grad_norm": 0.23721255375813155, + "learning_rate": 0.00028520564723357453, + "loss": 3.0542659759521484, + "step": 6327, + "token_acc": 0.2919549364863407 + }, + { + "epoch": 3.709469363822926, + "grad_norm": 0.26445135370276834, + "learning_rate": 0.0002851993508947025, + "loss": 3.080838918685913, + "step": 6328, + "token_acc": 0.2882543075760924 + }, + { + "epoch": 3.710055702140135, + "grad_norm": 0.24042224923126732, + "learning_rate": 0.00028519305328581477, + "loss": 3.061725616455078, + "step": 6329, + "token_acc": 0.29148789702868855 + }, + { + "epoch": 3.710642040457344, + "grad_norm": 0.2445990834729015, + "learning_rate": 0.0002851867544069705, + "loss": 3.0411431789398193, + "step": 6330, + "token_acc": 0.29518577759131553 + }, + { + "epoch": 3.711228378774553, + "grad_norm": 0.22385064252800183, + "learning_rate": 0.0002851804542582288, + "loss": 3.027780532836914, + "step": 6331, + "token_acc": 0.29649943104294413 + }, + { + "epoch": 3.711814717091762, + "grad_norm": 0.27817159024959176, + "learning_rate": 0.0002851741528396489, + "loss": 3.0399398803710938, + "step": 6332, + "token_acc": 0.29482829213368134 + }, + { + "epoch": 3.712401055408971, + "grad_norm": 0.25271228598506756, + "learning_rate": 0.00028516785015128993, + "loss": 3.0684733390808105, + "step": 6333, + "token_acc": 0.2900664740563268 + }, + { + "epoch": 3.71298739372618, + "grad_norm": 0.25852219919806274, + "learning_rate": 0.0002851615461932113, + "loss": 3.0366153717041016, + "step": 6334, + "token_acc": 0.2954201907958929 + }, + { + "epoch": 3.713573732043389, + "grad_norm": 0.2386639261627425, + "learning_rate": 0.00028515524096547196, + "loss": 3.0091781616210938, + "step": 6335, + "token_acc": 0.2968561755499233 + }, + { + "epoch": 3.714160070360598, + "grad_norm": 0.22826335240758994, + "learning_rate": 0.0002851489344681313, + "loss": 3.0571823120117188, + "step": 6336, + "token_acc": 0.2925041475242471 + }, + { + "epoch": 3.714746408677807, + "grad_norm": 0.2466755645272033, + "learning_rate": 0.00028514262670124846, + "loss": 3.0207712650299072, + "step": 6337, + "token_acc": 0.29824639266497666 + }, + { + "epoch": 3.715332746995016, + "grad_norm": 0.22324258184515208, + "learning_rate": 0.0002851363176648828, + "loss": 3.0651774406433105, + "step": 6338, + "token_acc": 0.28958385075673976 + }, + { + "epoch": 3.7159190853122253, + "grad_norm": 0.24490794121404746, + "learning_rate": 0.0002851300073590935, + "loss": 3.1067333221435547, + "step": 6339, + "token_acc": 0.28673464641400803 + }, + { + "epoch": 3.7165054236294344, + "grad_norm": 0.24905518737434362, + "learning_rate": 0.0002851236957839399, + "loss": 3.0595171451568604, + "step": 6340, + "token_acc": 0.291733398063888 + }, + { + "epoch": 3.7170917619466435, + "grad_norm": 0.2386830044564574, + "learning_rate": 0.0002851173829394812, + "loss": 3.034095287322998, + "step": 6341, + "token_acc": 0.2957442964798183 + }, + { + "epoch": 3.717678100263852, + "grad_norm": 0.21168080318368557, + "learning_rate": 0.00028511106882577684, + "loss": 3.0476434230804443, + "step": 6342, + "token_acc": 0.2944569451247783 + }, + { + "epoch": 3.7182644385810613, + "grad_norm": 0.2436826928759395, + "learning_rate": 0.000285104753442886, + "loss": 3.0797276496887207, + "step": 6343, + "token_acc": 0.2902818101530389 + }, + { + "epoch": 3.7188507768982704, + "grad_norm": 0.21375735911281327, + "learning_rate": 0.0002850984367908681, + "loss": 3.0940794944763184, + "step": 6344, + "token_acc": 0.28747019583679373 + }, + { + "epoch": 3.7194371152154795, + "grad_norm": 0.23570682805209767, + "learning_rate": 0.00028509211886978237, + "loss": 3.060107707977295, + "step": 6345, + "token_acc": 0.29070612474246116 + }, + { + "epoch": 3.720023453532688, + "grad_norm": 0.2386088563016908, + "learning_rate": 0.00028508579967968827, + "loss": 3.0342483520507812, + "step": 6346, + "token_acc": 0.2949751625497862 + }, + { + "epoch": 3.7206097918498973, + "grad_norm": 0.23327718435811648, + "learning_rate": 0.0002850794792206451, + "loss": 3.041989803314209, + "step": 6347, + "token_acc": 0.2940312556854287 + }, + { + "epoch": 3.7211961301671064, + "grad_norm": 0.24661049823514744, + "learning_rate": 0.0002850731574927123, + "loss": 3.012538433074951, + "step": 6348, + "token_acc": 0.29728749498921486 + }, + { + "epoch": 3.7217824684843155, + "grad_norm": 0.25265829111705396, + "learning_rate": 0.0002850668344959491, + "loss": 3.053062915802002, + "step": 6349, + "token_acc": 0.2930271388021937 + }, + { + "epoch": 3.7223688068015246, + "grad_norm": 0.2706767943312733, + "learning_rate": 0.00028506051023041506, + "loss": 3.068315029144287, + "step": 6350, + "token_acc": 0.29156016149355823 + }, + { + "epoch": 3.7229551451187337, + "grad_norm": 0.28424886803556937, + "learning_rate": 0.0002850541846961695, + "loss": 3.0167627334594727, + "step": 6351, + "token_acc": 0.29693101087383794 + }, + { + "epoch": 3.7235414834359424, + "grad_norm": 0.2873740574103147, + "learning_rate": 0.0002850478578932719, + "loss": 3.108415126800537, + "step": 6352, + "token_acc": 0.28665363154159923 + }, + { + "epoch": 3.7241278217531515, + "grad_norm": 0.2680451038955801, + "learning_rate": 0.0002850415298217816, + "loss": 3.094160556793213, + "step": 6353, + "token_acc": 0.28661077520126393 + }, + { + "epoch": 3.7247141600703606, + "grad_norm": 0.24137178719796246, + "learning_rate": 0.00028503520048175815, + "loss": 3.0768256187438965, + "step": 6354, + "token_acc": 0.2896692314449744 + }, + { + "epoch": 3.7253004983875697, + "grad_norm": 0.23462863299580317, + "learning_rate": 0.0002850288698732609, + "loss": 3.0688531398773193, + "step": 6355, + "token_acc": 0.2903470343682921 + }, + { + "epoch": 3.7258868367047784, + "grad_norm": 0.2454133515991619, + "learning_rate": 0.0002850225379963494, + "loss": 3.0415146350860596, + "step": 6356, + "token_acc": 0.29254136208151743 + }, + { + "epoch": 3.7264731750219875, + "grad_norm": 0.25109491492211666, + "learning_rate": 0.0002850162048510831, + "loss": 3.0239834785461426, + "step": 6357, + "token_acc": 0.2973652096590287 + }, + { + "epoch": 3.7270595133391966, + "grad_norm": 0.25680519445512295, + "learning_rate": 0.0002850098704375215, + "loss": 3.0739731788635254, + "step": 6358, + "token_acc": 0.288901028627056 + }, + { + "epoch": 3.7276458516564057, + "grad_norm": 0.24842415464280718, + "learning_rate": 0.00028500353475572406, + "loss": 3.051112651824951, + "step": 6359, + "token_acc": 0.29269594051185416 + }, + { + "epoch": 3.728232189973615, + "grad_norm": 0.25902259619998164, + "learning_rate": 0.0002849971978057504, + "loss": 3.069091796875, + "step": 6360, + "token_acc": 0.291835238818156 + }, + { + "epoch": 3.728818528290824, + "grad_norm": 0.2702791140852835, + "learning_rate": 0.0002849908595876599, + "loss": 3.053168296813965, + "step": 6361, + "token_acc": 0.2925977511121995 + }, + { + "epoch": 3.729404866608033, + "grad_norm": 0.24184871618343232, + "learning_rate": 0.0002849845201015122, + "loss": 3.051150321960449, + "step": 6362, + "token_acc": 0.2946499369077694 + }, + { + "epoch": 3.7299912049252417, + "grad_norm": 0.2600497621089124, + "learning_rate": 0.00028497817934736684, + "loss": 3.046060085296631, + "step": 6363, + "token_acc": 0.29459531077217854 + }, + { + "epoch": 3.730577543242451, + "grad_norm": 0.2530514816247341, + "learning_rate": 0.0002849718373252834, + "loss": 3.081827402114868, + "step": 6364, + "token_acc": 0.2881765902142732 + }, + { + "epoch": 3.73116388155966, + "grad_norm": 0.26290493113051316, + "learning_rate": 0.00028496549403532137, + "loss": 3.0431737899780273, + "step": 6365, + "token_acc": 0.29433937123485293 + }, + { + "epoch": 3.731750219876869, + "grad_norm": 0.22599488903163048, + "learning_rate": 0.0002849591494775404, + "loss": 3.0695924758911133, + "step": 6366, + "token_acc": 0.29004230460337516 + }, + { + "epoch": 3.7323365581940777, + "grad_norm": 0.25546874435239947, + "learning_rate": 0.00028495280365200014, + "loss": 3.0388965606689453, + "step": 6367, + "token_acc": 0.29544651561881535 + }, + { + "epoch": 3.732922896511287, + "grad_norm": 0.2554775953291527, + "learning_rate": 0.0002849464565587601, + "loss": 3.0486574172973633, + "step": 6368, + "token_acc": 0.29296297650321457 + }, + { + "epoch": 3.733509234828496, + "grad_norm": 0.25269194270285406, + "learning_rate": 0.0002849401081978799, + "loss": 3.0564098358154297, + "step": 6369, + "token_acc": 0.29282154208114913 + }, + { + "epoch": 3.734095573145705, + "grad_norm": 0.2628053181491472, + "learning_rate": 0.00028493375856941927, + "loss": 3.1092867851257324, + "step": 6370, + "token_acc": 0.2854127702257933 + }, + { + "epoch": 3.734681911462914, + "grad_norm": 0.23895046123958516, + "learning_rate": 0.00028492740767343774, + "loss": 3.0111608505249023, + "step": 6371, + "token_acc": 0.29776406272861494 + }, + { + "epoch": 3.7352682497801233, + "grad_norm": 0.2752433124044039, + "learning_rate": 0.0002849210555099951, + "loss": 3.046128273010254, + "step": 6372, + "token_acc": 0.294464993048451 + }, + { + "epoch": 3.7358545880973324, + "grad_norm": 0.23982772538034236, + "learning_rate": 0.00028491470207915094, + "loss": 2.9912474155426025, + "step": 6373, + "token_acc": 0.3004064020612423 + }, + { + "epoch": 3.736440926414541, + "grad_norm": 0.2507539485663484, + "learning_rate": 0.0002849083473809649, + "loss": 3.010636806488037, + "step": 6374, + "token_acc": 0.2973783735110242 + }, + { + "epoch": 3.73702726473175, + "grad_norm": 0.2090374503572626, + "learning_rate": 0.0002849019914154968, + "loss": 3.039000988006592, + "step": 6375, + "token_acc": 0.29455690000589996 + }, + { + "epoch": 3.7376136030489593, + "grad_norm": 0.2596178406343501, + "learning_rate": 0.0002848956341828063, + "loss": 3.0357675552368164, + "step": 6376, + "token_acc": 0.29370622075950953 + }, + { + "epoch": 3.7381999413661684, + "grad_norm": 0.24495939720971815, + "learning_rate": 0.00028488927568295306, + "loss": 3.1018295288085938, + "step": 6377, + "token_acc": 0.2845117713457473 + }, + { + "epoch": 3.738786279683377, + "grad_norm": 0.2424867944672574, + "learning_rate": 0.0002848829159159968, + "loss": 3.044104814529419, + "step": 6378, + "token_acc": 0.2938176201937749 + }, + { + "epoch": 3.739372618000586, + "grad_norm": 0.21686618941266966, + "learning_rate": 0.0002848765548819973, + "loss": 3.0173449516296387, + "step": 6379, + "token_acc": 0.2979773037721971 + }, + { + "epoch": 3.7399589563177953, + "grad_norm": 0.25439399360676873, + "learning_rate": 0.00028487019258101434, + "loss": 3.0611729621887207, + "step": 6380, + "token_acc": 0.29304729742102054 + }, + { + "epoch": 3.7405452946350044, + "grad_norm": 0.2514092173767438, + "learning_rate": 0.0002848638290131077, + "loss": 3.053497791290283, + "step": 6381, + "token_acc": 0.29288549247168627 + }, + { + "epoch": 3.7411316329522135, + "grad_norm": 0.2548914787929738, + "learning_rate": 0.00028485746417833706, + "loss": 3.1131839752197266, + "step": 6382, + "token_acc": 0.28481981392391875 + }, + { + "epoch": 3.7417179712694226, + "grad_norm": 0.24814612398835684, + "learning_rate": 0.0002848510980767623, + "loss": 3.052476644515991, + "step": 6383, + "token_acc": 0.2922432408174902 + }, + { + "epoch": 3.7423043095866317, + "grad_norm": 0.23686027209091398, + "learning_rate": 0.00028484473070844316, + "loss": 3.0030674934387207, + "step": 6384, + "token_acc": 0.30066860668391526 + }, + { + "epoch": 3.7428906479038404, + "grad_norm": 0.2736626989457586, + "learning_rate": 0.00028483836207343947, + "loss": 3.0640578269958496, + "step": 6385, + "token_acc": 0.29195518716798613 + }, + { + "epoch": 3.7434769862210495, + "grad_norm": 0.24992428664228156, + "learning_rate": 0.0002848319921718111, + "loss": 3.0661873817443848, + "step": 6386, + "token_acc": 0.2889865894183486 + }, + { + "epoch": 3.7440633245382586, + "grad_norm": 0.2752225351196584, + "learning_rate": 0.00028482562100361783, + "loss": 3.0281476974487305, + "step": 6387, + "token_acc": 0.29642614562001113 + }, + { + "epoch": 3.7446496628554677, + "grad_norm": 0.23917917465863353, + "learning_rate": 0.00028481924856891953, + "loss": 3.067188262939453, + "step": 6388, + "token_acc": 0.29242867167089154 + }, + { + "epoch": 3.7452360011726764, + "grad_norm": 0.25384496386963346, + "learning_rate": 0.0002848128748677761, + "loss": 3.054591655731201, + "step": 6389, + "token_acc": 0.2904961554365634 + }, + { + "epoch": 3.7458223394898855, + "grad_norm": 0.2709709902763423, + "learning_rate": 0.0002848064999002473, + "loss": 3.0426647663116455, + "step": 6390, + "token_acc": 0.2935410868440227 + }, + { + "epoch": 3.7464086778070946, + "grad_norm": 0.2366056656163999, + "learning_rate": 0.00028480012366639314, + "loss": 3.054283618927002, + "step": 6391, + "token_acc": 0.29268045981661955 + }, + { + "epoch": 3.7469950161243037, + "grad_norm": 0.2462531332293585, + "learning_rate": 0.00028479374616627344, + "loss": 3.09869384765625, + "step": 6392, + "token_acc": 0.2858542872101921 + }, + { + "epoch": 3.747581354441513, + "grad_norm": 0.23544219368219935, + "learning_rate": 0.0002847873673999482, + "loss": 3.0563108921051025, + "step": 6393, + "token_acc": 0.2947198436715474 + }, + { + "epoch": 3.748167692758722, + "grad_norm": 0.23223961505635574, + "learning_rate": 0.00028478098736747716, + "loss": 3.035799503326416, + "step": 6394, + "token_acc": 0.2962619938502575 + }, + { + "epoch": 3.748754031075931, + "grad_norm": 0.24297599340063644, + "learning_rate": 0.0002847746060689204, + "loss": 3.0335381031036377, + "step": 6395, + "token_acc": 0.29560658531579537 + }, + { + "epoch": 3.7493403693931397, + "grad_norm": 0.25120157199270393, + "learning_rate": 0.0002847682235043379, + "loss": 3.0461766719818115, + "step": 6396, + "token_acc": 0.29328470450543864 + }, + { + "epoch": 3.749926707710349, + "grad_norm": 0.25705045307561425, + "learning_rate": 0.00028476183967378945, + "loss": 3.061815023422241, + "step": 6397, + "token_acc": 0.2926278414779022 + }, + { + "epoch": 3.750513046027558, + "grad_norm": 0.2313712386816058, + "learning_rate": 0.00028475545457733517, + "loss": 3.0745768547058105, + "step": 6398, + "token_acc": 0.29098399419881665 + }, + { + "epoch": 3.751099384344767, + "grad_norm": 0.22846635013309133, + "learning_rate": 0.00028474906821503496, + "loss": 3.052258014678955, + "step": 6399, + "token_acc": 0.29240991038705166 + }, + { + "epoch": 3.7516857226619758, + "grad_norm": 0.26409885071022154, + "learning_rate": 0.0002847426805869488, + "loss": 3.018313407897949, + "step": 6400, + "token_acc": 0.298551751279024 + }, + { + "epoch": 3.752272060979185, + "grad_norm": 0.23608630858426222, + "learning_rate": 0.0002847362916931368, + "loss": 3.0369508266448975, + "step": 6401, + "token_acc": 0.29457809020697856 + }, + { + "epoch": 3.752858399296394, + "grad_norm": 0.24282613930187477, + "learning_rate": 0.0002847299015336588, + "loss": 3.061882495880127, + "step": 6402, + "token_acc": 0.2946495084409657 + }, + { + "epoch": 3.753444737613603, + "grad_norm": 0.2606626241276367, + "learning_rate": 0.000284723510108575, + "loss": 3.0644145011901855, + "step": 6403, + "token_acc": 0.289663515272646 + }, + { + "epoch": 3.754031075930812, + "grad_norm": 0.21666394049372992, + "learning_rate": 0.00028471711741794535, + "loss": 3.0359692573547363, + "step": 6404, + "token_acc": 0.29566102461642857 + }, + { + "epoch": 3.7546174142480213, + "grad_norm": 0.2204038364055347, + "learning_rate": 0.0002847107234618299, + "loss": 3.0340352058410645, + "step": 6405, + "token_acc": 0.2953189936867485 + }, + { + "epoch": 3.75520375256523, + "grad_norm": 0.24590685353033298, + "learning_rate": 0.0002847043282402888, + "loss": 3.0559425354003906, + "step": 6406, + "token_acc": 0.29301301477003455 + }, + { + "epoch": 3.755790090882439, + "grad_norm": 0.2453661244887871, + "learning_rate": 0.00028469793175338196, + "loss": 3.0458221435546875, + "step": 6407, + "token_acc": 0.2931477931085059 + }, + { + "epoch": 3.756376429199648, + "grad_norm": 0.22450336507868635, + "learning_rate": 0.00028469153400116957, + "loss": 3.0267984867095947, + "step": 6408, + "token_acc": 0.2961428480312952 + }, + { + "epoch": 3.7569627675168573, + "grad_norm": 0.2406599274072631, + "learning_rate": 0.0002846851349837117, + "loss": 3.0346498489379883, + "step": 6409, + "token_acc": 0.29565938508171963 + }, + { + "epoch": 3.757549105834066, + "grad_norm": 0.2266248558943796, + "learning_rate": 0.00028467873470106855, + "loss": 3.054716110229492, + "step": 6410, + "token_acc": 0.2938531237188099 + }, + { + "epoch": 3.758135444151275, + "grad_norm": 0.24346540358595822, + "learning_rate": 0.00028467233315330017, + "loss": 3.062631607055664, + "step": 6411, + "token_acc": 0.2923513038570567 + }, + { + "epoch": 3.758721782468484, + "grad_norm": 0.22536872848043504, + "learning_rate": 0.00028466593034046664, + "loss": 3.0225768089294434, + "step": 6412, + "token_acc": 0.29782877072106784 + }, + { + "epoch": 3.7593081207856933, + "grad_norm": 0.2772536681402159, + "learning_rate": 0.00028465952626262813, + "loss": 3.032212972640991, + "step": 6413, + "token_acc": 0.2963345896887421 + }, + { + "epoch": 3.7598944591029024, + "grad_norm": 0.29264129588827326, + "learning_rate": 0.0002846531209198449, + "loss": 3.0384366512298584, + "step": 6414, + "token_acc": 0.29443342548417484 + }, + { + "epoch": 3.7604807974201115, + "grad_norm": 0.22938356386841893, + "learning_rate": 0.000284646714312177, + "loss": 3.070744514465332, + "step": 6415, + "token_acc": 0.29095955590800954 + }, + { + "epoch": 3.7610671357373207, + "grad_norm": 0.2847067260309995, + "learning_rate": 0.0002846403064396847, + "loss": 3.0593061447143555, + "step": 6416, + "token_acc": 0.29246225660043446 + }, + { + "epoch": 3.7616534740545293, + "grad_norm": 0.28283883830815393, + "learning_rate": 0.0002846338973024281, + "loss": 3.0490171909332275, + "step": 6417, + "token_acc": 0.292628020357004 + }, + { + "epoch": 3.7622398123717384, + "grad_norm": 0.23334805599281574, + "learning_rate": 0.0002846274869004675, + "loss": 3.074446201324463, + "step": 6418, + "token_acc": 0.28952731609925586 + }, + { + "epoch": 3.7628261506889475, + "grad_norm": 0.240273699956498, + "learning_rate": 0.00028462107523386307, + "loss": 3.0408897399902344, + "step": 6419, + "token_acc": 0.2939683275314539 + }, + { + "epoch": 3.7634124890061567, + "grad_norm": 0.2818884646412402, + "learning_rate": 0.000284614662302675, + "loss": 3.09450364112854, + "step": 6420, + "token_acc": 0.2875641326665701 + }, + { + "epoch": 3.7639988273233653, + "grad_norm": 0.3033954108076701, + "learning_rate": 0.0002846082481069636, + "loss": 3.042027235031128, + "step": 6421, + "token_acc": 0.2939763852981156 + }, + { + "epoch": 3.7645851656405744, + "grad_norm": 0.24273642170049947, + "learning_rate": 0.00028460183264678913, + "loss": 3.060802936553955, + "step": 6422, + "token_acc": 0.29244712671899215 + }, + { + "epoch": 3.7651715039577835, + "grad_norm": 0.26299360229745156, + "learning_rate": 0.0002845954159222118, + "loss": 3.025076389312744, + "step": 6423, + "token_acc": 0.29623854217096274 + }, + { + "epoch": 3.7657578422749927, + "grad_norm": 0.24528267993543085, + "learning_rate": 0.00028458899793329184, + "loss": 3.011979103088379, + "step": 6424, + "token_acc": 0.298663008648267 + }, + { + "epoch": 3.7663441805922018, + "grad_norm": 0.25178545979686223, + "learning_rate": 0.00028458257868008965, + "loss": 3.0271973609924316, + "step": 6425, + "token_acc": 0.2956327965146579 + }, + { + "epoch": 3.766930518909411, + "grad_norm": 0.2637632497807345, + "learning_rate": 0.00028457615816266556, + "loss": 3.070158004760742, + "step": 6426, + "token_acc": 0.28954282573993145 + }, + { + "epoch": 3.76751685722662, + "grad_norm": 0.2544184252975495, + "learning_rate": 0.00028456973638107974, + "loss": 3.0141348838806152, + "step": 6427, + "token_acc": 0.29697565837792467 + }, + { + "epoch": 3.7681031955438287, + "grad_norm": 0.24115774032473214, + "learning_rate": 0.00028456331333539256, + "loss": 3.0248701572418213, + "step": 6428, + "token_acc": 0.2980353313698941 + }, + { + "epoch": 3.7686895338610378, + "grad_norm": 0.25593151109143103, + "learning_rate": 0.0002845568890256644, + "loss": 3.034853935241699, + "step": 6429, + "token_acc": 0.29537424696576425 + }, + { + "epoch": 3.769275872178247, + "grad_norm": 0.24726855518624943, + "learning_rate": 0.00028455046345195564, + "loss": 3.068155527114868, + "step": 6430, + "token_acc": 0.2902495721391974 + }, + { + "epoch": 3.769862210495456, + "grad_norm": 0.22265960546239366, + "learning_rate": 0.0002845440366143265, + "loss": 3.039623975753784, + "step": 6431, + "token_acc": 0.29595725948892937 + }, + { + "epoch": 3.7704485488126647, + "grad_norm": 0.2652900310559628, + "learning_rate": 0.00028453760851283747, + "loss": 3.075779914855957, + "step": 6432, + "token_acc": 0.2897386873938548 + }, + { + "epoch": 3.771034887129874, + "grad_norm": 0.23278231456836948, + "learning_rate": 0.0002845311791475489, + "loss": 3.0230231285095215, + "step": 6433, + "token_acc": 0.29580449403082726 + }, + { + "epoch": 3.771621225447083, + "grad_norm": 0.24816611134423577, + "learning_rate": 0.0002845247485185212, + "loss": 3.0687031745910645, + "step": 6434, + "token_acc": 0.29038213581276556 + }, + { + "epoch": 3.772207563764292, + "grad_norm": 0.21455168927030738, + "learning_rate": 0.0002845183166258147, + "loss": 3.035602331161499, + "step": 6435, + "token_acc": 0.29569096791977667 + }, + { + "epoch": 3.772793902081501, + "grad_norm": 0.24432139057861427, + "learning_rate": 0.00028451188346948993, + "loss": 3.0453948974609375, + "step": 6436, + "token_acc": 0.2946295136925438 + }, + { + "epoch": 3.7733802403987102, + "grad_norm": 0.2421936390638523, + "learning_rate": 0.0002845054490496073, + "loss": 3.0765252113342285, + "step": 6437, + "token_acc": 0.28885841458135125 + }, + { + "epoch": 3.7739665787159193, + "grad_norm": 0.25820734129475115, + "learning_rate": 0.0002844990133662272, + "loss": 3.046961784362793, + "step": 6438, + "token_acc": 0.29247162759805806 + }, + { + "epoch": 3.774552917033128, + "grad_norm": 0.23577313944541295, + "learning_rate": 0.0002844925764194101, + "loss": 3.047924041748047, + "step": 6439, + "token_acc": 0.2929797298540939 + }, + { + "epoch": 3.775139255350337, + "grad_norm": 0.2560432761791687, + "learning_rate": 0.00028448613820921645, + "loss": 3.0556278228759766, + "step": 6440, + "token_acc": 0.2931772153240942 + }, + { + "epoch": 3.7757255936675462, + "grad_norm": 0.2233623452701283, + "learning_rate": 0.0002844796987357068, + "loss": 3.036797046661377, + "step": 6441, + "token_acc": 0.2945163808792695 + }, + { + "epoch": 3.7763119319847553, + "grad_norm": 0.24501848887832836, + "learning_rate": 0.0002844732579989416, + "loss": 3.0581722259521484, + "step": 6442, + "token_acc": 0.290115338052144 + }, + { + "epoch": 3.776898270301964, + "grad_norm": 0.238232780501294, + "learning_rate": 0.0002844668159989813, + "loss": 3.066704750061035, + "step": 6443, + "token_acc": 0.2900033051127552 + }, + { + "epoch": 3.777484608619173, + "grad_norm": 0.22967718366115042, + "learning_rate": 0.0002844603727358865, + "loss": 3.0477566719055176, + "step": 6444, + "token_acc": 0.2925387640404926 + }, + { + "epoch": 3.7780709469363822, + "grad_norm": 0.21710124007849205, + "learning_rate": 0.0002844539282097177, + "loss": 3.077892780303955, + "step": 6445, + "token_acc": 0.28852728380992765 + }, + { + "epoch": 3.7786572852535913, + "grad_norm": 0.2162954666512737, + "learning_rate": 0.00028444748242053533, + "loss": 3.0255706310272217, + "step": 6446, + "token_acc": 0.29465866255701106 + }, + { + "epoch": 3.7792436235708005, + "grad_norm": 0.22442976405759818, + "learning_rate": 0.0002844410353684001, + "loss": 3.0664663314819336, + "step": 6447, + "token_acc": 0.29091161800263293 + }, + { + "epoch": 3.7798299618880096, + "grad_norm": 0.2244595182591214, + "learning_rate": 0.0002844345870533725, + "loss": 3.061717987060547, + "step": 6448, + "token_acc": 0.291406070820223 + }, + { + "epoch": 3.7804163002052187, + "grad_norm": 0.23517134813167523, + "learning_rate": 0.0002844281374755131, + "loss": 3.0814316272735596, + "step": 6449, + "token_acc": 0.28957951102956864 + }, + { + "epoch": 3.7810026385224274, + "grad_norm": 0.22336815453940145, + "learning_rate": 0.0002844216866348825, + "loss": 3.0562524795532227, + "step": 6450, + "token_acc": 0.2917234400274106 + }, + { + "epoch": 3.7815889768396365, + "grad_norm": 0.20703671576447452, + "learning_rate": 0.0002844152345315413, + "loss": 3.011404037475586, + "step": 6451, + "token_acc": 0.29775394782185644 + }, + { + "epoch": 3.7821753151568456, + "grad_norm": 0.2334966017862748, + "learning_rate": 0.0002844087811655501, + "loss": 3.0344905853271484, + "step": 6452, + "token_acc": 0.29646743880833787 + }, + { + "epoch": 3.7827616534740547, + "grad_norm": 0.21529533160813272, + "learning_rate": 0.0002844023265369695, + "loss": 3.0114846229553223, + "step": 6453, + "token_acc": 0.2979767471152388 + }, + { + "epoch": 3.7833479917912634, + "grad_norm": 0.23670448252195947, + "learning_rate": 0.00028439587064586015, + "loss": 3.0306639671325684, + "step": 6454, + "token_acc": 0.29540054685532224 + }, + { + "epoch": 3.7839343301084725, + "grad_norm": 0.2104277151462322, + "learning_rate": 0.0002843894134922827, + "loss": 3.0959527492523193, + "step": 6455, + "token_acc": 0.2862502476822159 + }, + { + "epoch": 3.7845206684256816, + "grad_norm": 0.2553915305388533, + "learning_rate": 0.00028438295507629785, + "loss": 3.0344364643096924, + "step": 6456, + "token_acc": 0.29503686976577603 + }, + { + "epoch": 3.7851070067428907, + "grad_norm": 0.2594435434190842, + "learning_rate": 0.0002843764953979661, + "loss": 3.0273489952087402, + "step": 6457, + "token_acc": 0.2947190003698589 + }, + { + "epoch": 3.7856933450601, + "grad_norm": 0.24300527670216832, + "learning_rate": 0.0002843700344573483, + "loss": 3.0300164222717285, + "step": 6458, + "token_acc": 0.2942813838550247 + }, + { + "epoch": 3.786279683377309, + "grad_norm": 0.22614606437977886, + "learning_rate": 0.00028436357225450516, + "loss": 3.039964437484741, + "step": 6459, + "token_acc": 0.294184095767643 + }, + { + "epoch": 3.7868660216945176, + "grad_norm": 0.23540313962612164, + "learning_rate": 0.00028435710878949727, + "loss": 3.035740375518799, + "step": 6460, + "token_acc": 0.2955252415646153 + }, + { + "epoch": 3.7874523600117267, + "grad_norm": 0.27215698184753095, + "learning_rate": 0.0002843506440623854, + "loss": 3.0343897342681885, + "step": 6461, + "token_acc": 0.2948392576900965 + }, + { + "epoch": 3.788038698328936, + "grad_norm": 0.23735971413185564, + "learning_rate": 0.0002843441780732303, + "loss": 3.0503010749816895, + "step": 6462, + "token_acc": 0.2909729592216943 + }, + { + "epoch": 3.788625036646145, + "grad_norm": 0.23159033072329363, + "learning_rate": 0.00028433771082209266, + "loss": 3.0826187133789062, + "step": 6463, + "token_acc": 0.2892745872172959 + }, + { + "epoch": 3.7892113749633536, + "grad_norm": 0.2488320411798707, + "learning_rate": 0.0002843312423090332, + "loss": 3.0095126628875732, + "step": 6464, + "token_acc": 0.2984580823036863 + }, + { + "epoch": 3.7897977132805627, + "grad_norm": 0.2580145832332817, + "learning_rate": 0.00028432477253411277, + "loss": 3.0778167247772217, + "step": 6465, + "token_acc": 0.2910244603442867 + }, + { + "epoch": 3.790384051597772, + "grad_norm": 0.23593527792498536, + "learning_rate": 0.0002843183014973921, + "loss": 3.0669336318969727, + "step": 6466, + "token_acc": 0.29168375531478585 + }, + { + "epoch": 3.790970389914981, + "grad_norm": 0.2144930487710654, + "learning_rate": 0.00028431182919893194, + "loss": 3.0286874771118164, + "step": 6467, + "token_acc": 0.2966722920900973 + }, + { + "epoch": 3.79155672823219, + "grad_norm": 0.2398808315952976, + "learning_rate": 0.0002843053556387932, + "loss": 3.003859519958496, + "step": 6468, + "token_acc": 0.2996412852672552 + }, + { + "epoch": 3.792143066549399, + "grad_norm": 0.22255860697861415, + "learning_rate": 0.0002842988808170366, + "loss": 2.9843082427978516, + "step": 6469, + "token_acc": 0.3018933523004735 + }, + { + "epoch": 3.7927294048666083, + "grad_norm": 0.24553254366153746, + "learning_rate": 0.000284292404733723, + "loss": 3.0547640323638916, + "step": 6470, + "token_acc": 0.29386423414563634 + }, + { + "epoch": 3.793315743183817, + "grad_norm": 0.22423763226762575, + "learning_rate": 0.00028428592738891323, + "loss": 3.035776376724243, + "step": 6471, + "token_acc": 0.2957616805969193 + }, + { + "epoch": 3.793902081501026, + "grad_norm": 0.25264490307938403, + "learning_rate": 0.00028427944878266806, + "loss": 3.0488390922546387, + "step": 6472, + "token_acc": 0.29412754049930045 + }, + { + "epoch": 3.794488419818235, + "grad_norm": 0.27483429483454197, + "learning_rate": 0.00028427296891504847, + "loss": 3.0293054580688477, + "step": 6473, + "token_acc": 0.296178912943845 + }, + { + "epoch": 3.7950747581354443, + "grad_norm": 0.2621509294896071, + "learning_rate": 0.00028426648778611524, + "loss": 3.0560498237609863, + "step": 6474, + "token_acc": 0.2920218964671246 + }, + { + "epoch": 3.795661096452653, + "grad_norm": 0.2843109639280927, + "learning_rate": 0.0002842600053959293, + "loss": 3.0406782627105713, + "step": 6475, + "token_acc": 0.29298797438379015 + }, + { + "epoch": 3.796247434769862, + "grad_norm": 0.2925644969396739, + "learning_rate": 0.0002842535217445515, + "loss": 3.070838212966919, + "step": 6476, + "token_acc": 0.2903781307112152 + }, + { + "epoch": 3.796833773087071, + "grad_norm": 0.25515216160340337, + "learning_rate": 0.00028424703683204285, + "loss": 3.05586838722229, + "step": 6477, + "token_acc": 0.29095701383774564 + }, + { + "epoch": 3.7974201114042803, + "grad_norm": 0.2410106031428619, + "learning_rate": 0.0002842405506584641, + "loss": 3.031773567199707, + "step": 6478, + "token_acc": 0.2963058891243492 + }, + { + "epoch": 3.7980064497214894, + "grad_norm": 0.24351233982252637, + "learning_rate": 0.0002842340632238763, + "loss": 3.045006275177002, + "step": 6479, + "token_acc": 0.29487010516421186 + }, + { + "epoch": 3.7985927880386985, + "grad_norm": 0.24132649031562983, + "learning_rate": 0.00028422757452834034, + "loss": 3.0639381408691406, + "step": 6480, + "token_acc": 0.29068289066201286 + }, + { + "epoch": 3.7991791263559076, + "grad_norm": 0.2514291527075244, + "learning_rate": 0.00028422108457191726, + "loss": 3.038318395614624, + "step": 6481, + "token_acc": 0.29253146361389465 + }, + { + "epoch": 3.7997654646731163, + "grad_norm": 0.26371432246007004, + "learning_rate": 0.0002842145933546679, + "loss": 3.021477460861206, + "step": 6482, + "token_acc": 0.2964851344922132 + }, + { + "epoch": 3.8003518029903254, + "grad_norm": 0.2359566190959969, + "learning_rate": 0.00028420810087665326, + "loss": 3.060835838317871, + "step": 6483, + "token_acc": 0.29099584463932254 + }, + { + "epoch": 3.8009381413075345, + "grad_norm": 0.2203606915876757, + "learning_rate": 0.0002842016071379344, + "loss": 3.0279574394226074, + "step": 6484, + "token_acc": 0.295239651193661 + }, + { + "epoch": 3.8015244796247436, + "grad_norm": 0.2585880866905589, + "learning_rate": 0.0002841951121385723, + "loss": 3.0840396881103516, + "step": 6485, + "token_acc": 0.2889995227838469 + }, + { + "epoch": 3.8021108179419523, + "grad_norm": 0.23776772573882637, + "learning_rate": 0.00028418861587862793, + "loss": 3.0965988636016846, + "step": 6486, + "token_acc": 0.28499383438201537 + }, + { + "epoch": 3.8026971562591614, + "grad_norm": 0.2265598711408663, + "learning_rate": 0.0002841821183581623, + "loss": 3.031135082244873, + "step": 6487, + "token_acc": 0.295494839732758 + }, + { + "epoch": 3.8032834945763705, + "grad_norm": 0.22902049968928967, + "learning_rate": 0.00028417561957723653, + "loss": 3.0620620250701904, + "step": 6488, + "token_acc": 0.2929412199786562 + }, + { + "epoch": 3.8038698328935796, + "grad_norm": 0.23826953501110648, + "learning_rate": 0.00028416911953591163, + "loss": 3.0391287803649902, + "step": 6489, + "token_acc": 0.2946972411322107 + }, + { + "epoch": 3.8044561712107887, + "grad_norm": 0.21449763283532503, + "learning_rate": 0.00028416261823424865, + "loss": 3.0535244941711426, + "step": 6490, + "token_acc": 0.29301729822959466 + }, + { + "epoch": 3.805042509527998, + "grad_norm": 0.2675763512311035, + "learning_rate": 0.0002841561156723086, + "loss": 3.0464394092559814, + "step": 6491, + "token_acc": 0.2936951130106537 + }, + { + "epoch": 3.805628847845207, + "grad_norm": 0.2755577913535871, + "learning_rate": 0.00028414961185015266, + "loss": 3.0616469383239746, + "step": 6492, + "token_acc": 0.29148671052492636 + }, + { + "epoch": 3.8062151861624156, + "grad_norm": 0.2548119208249405, + "learning_rate": 0.0002841431067678419, + "loss": 3.067382335662842, + "step": 6493, + "token_acc": 0.28984290378141986 + }, + { + "epoch": 3.8068015244796247, + "grad_norm": 0.25701191009551894, + "learning_rate": 0.0002841366004254374, + "loss": 3.02752685546875, + "step": 6494, + "token_acc": 0.2957345996542564 + }, + { + "epoch": 3.807387862796834, + "grad_norm": 0.2380749726948019, + "learning_rate": 0.00028413009282300027, + "loss": 3.0252604484558105, + "step": 6495, + "token_acc": 0.2964519116001991 + }, + { + "epoch": 3.807974201114043, + "grad_norm": 0.25607305813708314, + "learning_rate": 0.00028412358396059164, + "loss": 3.0116400718688965, + "step": 6496, + "token_acc": 0.29925378671443564 + }, + { + "epoch": 3.8085605394312516, + "grad_norm": 0.25046564668792964, + "learning_rate": 0.0002841170738382727, + "loss": 3.0679514408111572, + "step": 6497, + "token_acc": 0.2903453436862777 + }, + { + "epoch": 3.8091468777484607, + "grad_norm": 0.22118235754033555, + "learning_rate": 0.0002841105624561046, + "loss": 3.0552923679351807, + "step": 6498, + "token_acc": 0.29341797099965444 + }, + { + "epoch": 3.80973321606567, + "grad_norm": 0.23387149028809978, + "learning_rate": 0.00028410404981414844, + "loss": 3.027594566345215, + "step": 6499, + "token_acc": 0.29755377705009856 + }, + { + "epoch": 3.810319554382879, + "grad_norm": 0.23973350415852107, + "learning_rate": 0.0002840975359124654, + "loss": 3.052293300628662, + "step": 6500, + "token_acc": 0.29125434471478223 + }, + { + "epoch": 3.810905892700088, + "grad_norm": 0.23511353140805805, + "learning_rate": 0.00028409102075111685, + "loss": 3.0663585662841797, + "step": 6501, + "token_acc": 0.2901506931886679 + }, + { + "epoch": 3.811492231017297, + "grad_norm": 0.2375920723671012, + "learning_rate": 0.0002840845043301637, + "loss": 3.032205104827881, + "step": 6502, + "token_acc": 0.2949895610438436 + }, + { + "epoch": 3.8120785693345063, + "grad_norm": 0.26184012379987287, + "learning_rate": 0.0002840779866496674, + "loss": 3.060265064239502, + "step": 6503, + "token_acc": 0.2922478859278686 + }, + { + "epoch": 3.812664907651715, + "grad_norm": 0.23985899838496655, + "learning_rate": 0.0002840714677096891, + "loss": 3.0341007709503174, + "step": 6504, + "token_acc": 0.29550844813547456 + }, + { + "epoch": 3.813251245968924, + "grad_norm": 0.2813590564689873, + "learning_rate": 0.00028406494751029, + "loss": 3.06073260307312, + "step": 6505, + "token_acc": 0.2936139448951723 + }, + { + "epoch": 3.813837584286133, + "grad_norm": 0.2228921636201004, + "learning_rate": 0.00028405842605153136, + "loss": 3.0706372261047363, + "step": 6506, + "token_acc": 0.2906427271187876 + }, + { + "epoch": 3.8144239226033423, + "grad_norm": 0.24484077165948195, + "learning_rate": 0.00028405190333347444, + "loss": 3.0102782249450684, + "step": 6507, + "token_acc": 0.2960465502736724 + }, + { + "epoch": 3.815010260920551, + "grad_norm": 0.25411018658818735, + "learning_rate": 0.00028404537935618055, + "loss": 3.027517080307007, + "step": 6508, + "token_acc": 0.2979024928488132 + }, + { + "epoch": 3.81559659923776, + "grad_norm": 0.228530114895454, + "learning_rate": 0.00028403885411971096, + "loss": 3.075068950653076, + "step": 6509, + "token_acc": 0.2890450909314959 + }, + { + "epoch": 3.816182937554969, + "grad_norm": 0.24903481996537805, + "learning_rate": 0.0002840323276241269, + "loss": 3.0522079467773438, + "step": 6510, + "token_acc": 0.2933166053759727 + }, + { + "epoch": 3.8167692758721783, + "grad_norm": 0.22676293829231473, + "learning_rate": 0.0002840257998694898, + "loss": 3.0508840084075928, + "step": 6511, + "token_acc": 0.29262379427906204 + }, + { + "epoch": 3.8173556141893874, + "grad_norm": 0.26223626146474843, + "learning_rate": 0.00028401927085586084, + "loss": 3.0425639152526855, + "step": 6512, + "token_acc": 0.2934956948567413 + }, + { + "epoch": 3.8179419525065965, + "grad_norm": 0.2220260750220639, + "learning_rate": 0.0002840127405833015, + "loss": 3.0392003059387207, + "step": 6513, + "token_acc": 0.2936649517650744 + }, + { + "epoch": 3.818528290823805, + "grad_norm": 0.25706360985713084, + "learning_rate": 0.00028400620905187304, + "loss": 3.0403008460998535, + "step": 6514, + "token_acc": 0.29398352716603626 + }, + { + "epoch": 3.8191146291410143, + "grad_norm": 0.22380380442925485, + "learning_rate": 0.0002839996762616368, + "loss": 3.033729076385498, + "step": 6515, + "token_acc": 0.2963674100792127 + }, + { + "epoch": 3.8197009674582234, + "grad_norm": 0.21657871416713492, + "learning_rate": 0.00028399314221265416, + "loss": 3.0058112144470215, + "step": 6516, + "token_acc": 0.2987487276113746 + }, + { + "epoch": 3.8202873057754325, + "grad_norm": 0.2261427260937507, + "learning_rate": 0.0002839866069049865, + "loss": 3.080451488494873, + "step": 6517, + "token_acc": 0.28769606963048 + }, + { + "epoch": 3.820873644092641, + "grad_norm": 0.23740032818495918, + "learning_rate": 0.0002839800703386952, + "loss": 3.0359115600585938, + "step": 6518, + "token_acc": 0.2953185005519536 + }, + { + "epoch": 3.8214599824098503, + "grad_norm": 0.2291898773250533, + "learning_rate": 0.0002839735325138417, + "loss": 3.025585412979126, + "step": 6519, + "token_acc": 0.294110987317279 + }, + { + "epoch": 3.8220463207270594, + "grad_norm": 0.2364667564669498, + "learning_rate": 0.0002839669934304875, + "loss": 3.054555892944336, + "step": 6520, + "token_acc": 0.2926513571035829 + }, + { + "epoch": 3.8226326590442685, + "grad_norm": 0.22201012325243466, + "learning_rate": 0.0002839604530886938, + "loss": 3.0446062088012695, + "step": 6521, + "token_acc": 0.2935001607124658 + }, + { + "epoch": 3.8232189973614776, + "grad_norm": 0.2198711445737446, + "learning_rate": 0.0002839539114885222, + "loss": 3.0163462162017822, + "step": 6522, + "token_acc": 0.2970870650265452 + }, + { + "epoch": 3.8238053356786867, + "grad_norm": 0.24552332159739496, + "learning_rate": 0.00028394736863003405, + "loss": 3.0710837841033936, + "step": 6523, + "token_acc": 0.28817053073450394 + }, + { + "epoch": 3.824391673995896, + "grad_norm": 0.27632625025623053, + "learning_rate": 0.00028394082451329086, + "loss": 3.0504679679870605, + "step": 6524, + "token_acc": 0.2934688071324065 + }, + { + "epoch": 3.8249780123131045, + "grad_norm": 0.22437532149812653, + "learning_rate": 0.0002839342791383542, + "loss": 3.0849342346191406, + "step": 6525, + "token_acc": 0.2867604749699221 + }, + { + "epoch": 3.8255643506303136, + "grad_norm": 0.2299069603408196, + "learning_rate": 0.00028392773250528544, + "loss": 3.0610640048980713, + "step": 6526, + "token_acc": 0.29115947601565423 + }, + { + "epoch": 3.8261506889475227, + "grad_norm": 0.28249171469048323, + "learning_rate": 0.00028392118461414604, + "loss": 3.043231248855591, + "step": 6527, + "token_acc": 0.29332953173038384 + }, + { + "epoch": 3.826737027264732, + "grad_norm": 0.24041256744041373, + "learning_rate": 0.0002839146354649976, + "loss": 3.105743885040283, + "step": 6528, + "token_acc": 0.2856768962468987 + }, + { + "epoch": 3.8273233655819405, + "grad_norm": 0.26072902337251747, + "learning_rate": 0.0002839080850579017, + "loss": 3.0270261764526367, + "step": 6529, + "token_acc": 0.29712473995941957 + }, + { + "epoch": 3.8279097038991496, + "grad_norm": 0.28961981988578656, + "learning_rate": 0.00028390153339291965, + "loss": 3.0521256923675537, + "step": 6530, + "token_acc": 0.2930738972714096 + }, + { + "epoch": 3.8284960422163588, + "grad_norm": 0.27464138690630246, + "learning_rate": 0.0002838949804701132, + "loss": 3.0400891304016113, + "step": 6531, + "token_acc": 0.29284179744202865 + }, + { + "epoch": 3.829082380533568, + "grad_norm": 0.29897593806011746, + "learning_rate": 0.0002838884262895438, + "loss": 3.082455635070801, + "step": 6532, + "token_acc": 0.289261347705536 + }, + { + "epoch": 3.829668718850777, + "grad_norm": 0.24484068384177268, + "learning_rate": 0.00028388187085127313, + "loss": 3.011160373687744, + "step": 6533, + "token_acc": 0.2992803405474891 + }, + { + "epoch": 3.830255057167986, + "grad_norm": 0.24027257562765503, + "learning_rate": 0.0002838753141553626, + "loss": 2.9956626892089844, + "step": 6534, + "token_acc": 0.30118050226534493 + }, + { + "epoch": 3.830841395485195, + "grad_norm": 0.23625127813329788, + "learning_rate": 0.0002838687562018739, + "loss": 3.0871615409851074, + "step": 6535, + "token_acc": 0.28674924285383707 + }, + { + "epoch": 3.831427733802404, + "grad_norm": 0.2546060530380916, + "learning_rate": 0.0002838621969908687, + "loss": 3.0426220893859863, + "step": 6536, + "token_acc": 0.2954379418179907 + }, + { + "epoch": 3.832014072119613, + "grad_norm": 0.2338593927723718, + "learning_rate": 0.0002838556365224085, + "loss": 3.013202428817749, + "step": 6537, + "token_acc": 0.29727467498061017 + }, + { + "epoch": 3.832600410436822, + "grad_norm": 0.24328735372256707, + "learning_rate": 0.000283849074796555, + "loss": 3.0140700340270996, + "step": 6538, + "token_acc": 0.2944670814441689 + }, + { + "epoch": 3.833186748754031, + "grad_norm": 0.22363886437574604, + "learning_rate": 0.0002838425118133697, + "loss": 3.128633975982666, + "step": 6539, + "token_acc": 0.28217249129796657 + }, + { + "epoch": 3.83377308707124, + "grad_norm": 0.24082098391461015, + "learning_rate": 0.00028383594757291447, + "loss": 3.076694965362549, + "step": 6540, + "token_acc": 0.2886501223203549 + }, + { + "epoch": 3.834359425388449, + "grad_norm": 0.24792064969559943, + "learning_rate": 0.0002838293820752508, + "loss": 3.0116188526153564, + "step": 6541, + "token_acc": 0.2971055107646189 + }, + { + "epoch": 3.834945763705658, + "grad_norm": 0.22158772043492048, + "learning_rate": 0.00028382281532044043, + "loss": 3.062791109085083, + "step": 6542, + "token_acc": 0.2899432899968863 + }, + { + "epoch": 3.835532102022867, + "grad_norm": 0.26723617338002437, + "learning_rate": 0.00028381624730854506, + "loss": 3.088080406188965, + "step": 6543, + "token_acc": 0.28852527473689216 + }, + { + "epoch": 3.8361184403400763, + "grad_norm": 0.24723662733711946, + "learning_rate": 0.00028380967803962634, + "loss": 3.069936752319336, + "step": 6544, + "token_acc": 0.29049436475409834 + }, + { + "epoch": 3.8367047786572854, + "grad_norm": 0.25560667001572485, + "learning_rate": 0.000283803107513746, + "loss": 3.062473773956299, + "step": 6545, + "token_acc": 0.2924926140919662 + }, + { + "epoch": 3.8372911169744945, + "grad_norm": 0.2201425032186063, + "learning_rate": 0.00028379653573096583, + "loss": 3.0476179122924805, + "step": 6546, + "token_acc": 0.2943775035096563 + }, + { + "epoch": 3.837877455291703, + "grad_norm": 0.22879317999410065, + "learning_rate": 0.0002837899626913475, + "loss": 3.075305700302124, + "step": 6547, + "token_acc": 0.2887838385496555 + }, + { + "epoch": 3.8384637936089123, + "grad_norm": 0.231829566067993, + "learning_rate": 0.00028378338839495266, + "loss": 3.0726513862609863, + "step": 6548, + "token_acc": 0.29075507375585485 + }, + { + "epoch": 3.8390501319261214, + "grad_norm": 0.22922536546469777, + "learning_rate": 0.0002837768128418432, + "loss": 3.06248140335083, + "step": 6549, + "token_acc": 0.2908382215747166 + }, + { + "epoch": 3.8396364702433305, + "grad_norm": 0.2170888109301465, + "learning_rate": 0.0002837702360320809, + "loss": 3.0245556831359863, + "step": 6550, + "token_acc": 0.2957644140193173 + }, + { + "epoch": 3.840222808560539, + "grad_norm": 0.22510698391222092, + "learning_rate": 0.0002837636579657274, + "loss": 3.0730843544006348, + "step": 6551, + "token_acc": 0.29074954959625676 + }, + { + "epoch": 3.8408091468777483, + "grad_norm": 0.22832968319802974, + "learning_rate": 0.00028375707864284466, + "loss": 3.028977870941162, + "step": 6552, + "token_acc": 0.29615125021582506 + }, + { + "epoch": 3.8413954851949574, + "grad_norm": 0.2419614836297206, + "learning_rate": 0.0002837504980634944, + "loss": 3.0658440589904785, + "step": 6553, + "token_acc": 0.2919831799545376 + }, + { + "epoch": 3.8419818235121665, + "grad_norm": 0.2395143803848042, + "learning_rate": 0.0002837439162277384, + "loss": 3.073482036590576, + "step": 6554, + "token_acc": 0.29014902332714504 + }, + { + "epoch": 3.8425681618293757, + "grad_norm": 0.23555388501704738, + "learning_rate": 0.00028373733313563854, + "loss": 3.0484771728515625, + "step": 6555, + "token_acc": 0.2924699282736587 + }, + { + "epoch": 3.8431545001465848, + "grad_norm": 0.2602837275289091, + "learning_rate": 0.00028373074878725664, + "loss": 3.0312623977661133, + "step": 6556, + "token_acc": 0.29444929329772684 + }, + { + "epoch": 3.843740838463794, + "grad_norm": 0.24920855251526336, + "learning_rate": 0.00028372416318265463, + "loss": 3.004746437072754, + "step": 6557, + "token_acc": 0.29906336324808125 + }, + { + "epoch": 3.8443271767810026, + "grad_norm": 0.23820684883811902, + "learning_rate": 0.00028371757632189424, + "loss": 3.012692451477051, + "step": 6558, + "token_acc": 0.29896210691245206 + }, + { + "epoch": 3.8449135150982117, + "grad_norm": 0.23118051539468903, + "learning_rate": 0.00028371098820503745, + "loss": 3.0799238681793213, + "step": 6559, + "token_acc": 0.2893817772523972 + }, + { + "epoch": 3.8454998534154208, + "grad_norm": 0.24441139752035976, + "learning_rate": 0.00028370439883214604, + "loss": 3.059861660003662, + "step": 6560, + "token_acc": 0.29166084761239724 + }, + { + "epoch": 3.84608619173263, + "grad_norm": 0.26408986275883317, + "learning_rate": 0.000283697808203282, + "loss": 3.035032272338867, + "step": 6561, + "token_acc": 0.2957660644205461 + }, + { + "epoch": 3.8466725300498386, + "grad_norm": 0.23471125890296882, + "learning_rate": 0.0002836912163185072, + "loss": 3.061880111694336, + "step": 6562, + "token_acc": 0.290609470350439 + }, + { + "epoch": 3.8472588683670477, + "grad_norm": 0.23295793855579897, + "learning_rate": 0.0002836846231778836, + "loss": 3.0336246490478516, + "step": 6563, + "token_acc": 0.2975588977190205 + }, + { + "epoch": 3.847845206684257, + "grad_norm": 0.2270672510369036, + "learning_rate": 0.0002836780287814731, + "loss": 3.0157861709594727, + "step": 6564, + "token_acc": 0.29745962539484805 + }, + { + "epoch": 3.848431545001466, + "grad_norm": 0.23653561620008845, + "learning_rate": 0.00028367143312933767, + "loss": 3.043527841567993, + "step": 6565, + "token_acc": 0.2921853866582421 + }, + { + "epoch": 3.849017883318675, + "grad_norm": 0.2334127533213531, + "learning_rate": 0.0002836648362215392, + "loss": 3.072495937347412, + "step": 6566, + "token_acc": 0.28940172809799847 + }, + { + "epoch": 3.849604221635884, + "grad_norm": 0.2653310870410907, + "learning_rate": 0.00028365823805813977, + "loss": 3.0122547149658203, + "step": 6567, + "token_acc": 0.29722806625179116 + }, + { + "epoch": 3.850190559953093, + "grad_norm": 0.2196161804131879, + "learning_rate": 0.00028365163863920125, + "loss": 3.036485195159912, + "step": 6568, + "token_acc": 0.2938674009332802 + }, + { + "epoch": 3.850776898270302, + "grad_norm": 0.23293781740038608, + "learning_rate": 0.00028364503796478573, + "loss": 3.096240520477295, + "step": 6569, + "token_acc": 0.28593394751021806 + }, + { + "epoch": 3.851363236587511, + "grad_norm": 0.23721017324432295, + "learning_rate": 0.0002836384360349551, + "loss": 3.0066869258880615, + "step": 6570, + "token_acc": 0.2988688805491606 + }, + { + "epoch": 3.85194957490472, + "grad_norm": 0.24398114440684018, + "learning_rate": 0.0002836318328497715, + "loss": 3.0344302654266357, + "step": 6571, + "token_acc": 0.29558047526904657 + }, + { + "epoch": 3.852535913221929, + "grad_norm": 0.2785461609069765, + "learning_rate": 0.00028362522840929687, + "loss": 3.075298309326172, + "step": 6572, + "token_acc": 0.28743494105953893 + }, + { + "epoch": 3.853122251539138, + "grad_norm": 0.2441423321421813, + "learning_rate": 0.00028361862271359333, + "loss": 3.0563488006591797, + "step": 6573, + "token_acc": 0.2941047164313393 + }, + { + "epoch": 3.853708589856347, + "grad_norm": 0.24198112370701821, + "learning_rate": 0.00028361201576272287, + "loss": 3.0212860107421875, + "step": 6574, + "token_acc": 0.2979178230149563 + }, + { + "epoch": 3.854294928173556, + "grad_norm": 0.23408258799803067, + "learning_rate": 0.00028360540755674755, + "loss": 3.0577335357666016, + "step": 6575, + "token_acc": 0.2915138781611074 + }, + { + "epoch": 3.8548812664907652, + "grad_norm": 0.22085812956420903, + "learning_rate": 0.00028359879809572946, + "loss": 3.0607337951660156, + "step": 6576, + "token_acc": 0.291428508344626 + }, + { + "epoch": 3.8554676048079743, + "grad_norm": 0.23997824332040857, + "learning_rate": 0.00028359218737973065, + "loss": 3.0403435230255127, + "step": 6577, + "token_acc": 0.29587652118639723 + }, + { + "epoch": 3.8560539431251835, + "grad_norm": 0.22920300110721506, + "learning_rate": 0.0002835855754088133, + "loss": 3.029397487640381, + "step": 6578, + "token_acc": 0.29548002366642795 + }, + { + "epoch": 3.856640281442392, + "grad_norm": 0.24161968705865386, + "learning_rate": 0.0002835789621830395, + "loss": 3.0522141456604004, + "step": 6579, + "token_acc": 0.2932667122271358 + }, + { + "epoch": 3.8572266197596012, + "grad_norm": 0.2422886062200418, + "learning_rate": 0.00028357234770247125, + "loss": 3.025510787963867, + "step": 6580, + "token_acc": 0.2970521246488453 + }, + { + "epoch": 3.8578129580768104, + "grad_norm": 0.24668972253277688, + "learning_rate": 0.0002835657319671709, + "loss": 3.0520505905151367, + "step": 6581, + "token_acc": 0.29247619304540184 + }, + { + "epoch": 3.8583992963940195, + "grad_norm": 0.2281592570633168, + "learning_rate": 0.00028355911497720046, + "loss": 3.050171375274658, + "step": 6582, + "token_acc": 0.2925867722307111 + }, + { + "epoch": 3.858985634711228, + "grad_norm": 0.22129408477938184, + "learning_rate": 0.00028355249673262206, + "loss": 3.0273194313049316, + "step": 6583, + "token_acc": 0.2969939588425961 + }, + { + "epoch": 3.8595719730284372, + "grad_norm": 0.2563267316279477, + "learning_rate": 0.00028354587723349793, + "loss": 3.033951759338379, + "step": 6584, + "token_acc": 0.2962750219466993 + }, + { + "epoch": 3.8601583113456464, + "grad_norm": 0.25106010474037704, + "learning_rate": 0.0002835392564798903, + "loss": 3.064647912979126, + "step": 6585, + "token_acc": 0.29215564173216774 + }, + { + "epoch": 3.8607446496628555, + "grad_norm": 0.2642445226595879, + "learning_rate": 0.0002835326344718612, + "loss": 3.0457444190979004, + "step": 6586, + "token_acc": 0.2945502785609329 + }, + { + "epoch": 3.8613309879800646, + "grad_norm": 0.24833822176356746, + "learning_rate": 0.000283526011209473, + "loss": 3.0403056144714355, + "step": 6587, + "token_acc": 0.29512088813681 + }, + { + "epoch": 3.8619173262972737, + "grad_norm": 0.2442761448133746, + "learning_rate": 0.00028351938669278785, + "loss": 3.0538597106933594, + "step": 6588, + "token_acc": 0.2918890366295696 + }, + { + "epoch": 3.862503664614483, + "grad_norm": 0.25445983569170216, + "learning_rate": 0.000283512760921868, + "loss": 3.079535961151123, + "step": 6589, + "token_acc": 0.28856359728064673 + }, + { + "epoch": 3.8630900029316915, + "grad_norm": 0.26314473891964785, + "learning_rate": 0.00028350613389677566, + "loss": 3.050187587738037, + "step": 6590, + "token_acc": 0.2924566627070445 + }, + { + "epoch": 3.8636763412489006, + "grad_norm": 0.23840006166732203, + "learning_rate": 0.0002834995056175731, + "loss": 3.0651443004608154, + "step": 6591, + "token_acc": 0.2905859585840102 + }, + { + "epoch": 3.8642626795661097, + "grad_norm": 0.2666155385353076, + "learning_rate": 0.0002834928760843225, + "loss": 3.0429234504699707, + "step": 6592, + "token_acc": 0.2937864640973021 + }, + { + "epoch": 3.864849017883319, + "grad_norm": 0.27011910155036123, + "learning_rate": 0.0002834862452970863, + "loss": 3.0211944580078125, + "step": 6593, + "token_acc": 0.2956624166650979 + }, + { + "epoch": 3.8654353562005275, + "grad_norm": 0.24370965920831367, + "learning_rate": 0.0002834796132559266, + "loss": 3.0175890922546387, + "step": 6594, + "token_acc": 0.2987232791005468 + }, + { + "epoch": 3.8660216945177366, + "grad_norm": 0.25155780301573755, + "learning_rate": 0.0002834729799609059, + "loss": 3.0199384689331055, + "step": 6595, + "token_acc": 0.2954673419770993 + }, + { + "epoch": 3.8666080328349457, + "grad_norm": 0.23705558787808953, + "learning_rate": 0.0002834663454120864, + "loss": 3.058058977127075, + "step": 6596, + "token_acc": 0.29252557939335483 + }, + { + "epoch": 3.867194371152155, + "grad_norm": 0.24452065383412255, + "learning_rate": 0.0002834597096095304, + "loss": 3.0496368408203125, + "step": 6597, + "token_acc": 0.2914907599695066 + }, + { + "epoch": 3.867780709469364, + "grad_norm": 0.22064986875141399, + "learning_rate": 0.0002834530725533003, + "loss": 3.0335209369659424, + "step": 6598, + "token_acc": 0.2944046194149399 + }, + { + "epoch": 3.868367047786573, + "grad_norm": 0.23612306403429068, + "learning_rate": 0.0002834464342434584, + "loss": 2.99648380279541, + "step": 6599, + "token_acc": 0.3014428541055853 + }, + { + "epoch": 3.868953386103782, + "grad_norm": 0.2227464578256123, + "learning_rate": 0.00028343979468006705, + "loss": 3.0807061195373535, + "step": 6600, + "token_acc": 0.2886596029922338 + }, + { + "epoch": 3.869539724420991, + "grad_norm": 0.24062171324977696, + "learning_rate": 0.00028343315386318866, + "loss": 3.0487899780273438, + "step": 6601, + "token_acc": 0.2934997277764137 + }, + { + "epoch": 3.8701260627382, + "grad_norm": 0.22833682173483896, + "learning_rate": 0.00028342651179288556, + "loss": 3.0597872734069824, + "step": 6602, + "token_acc": 0.29292725686202226 + }, + { + "epoch": 3.870712401055409, + "grad_norm": 0.2460784792910518, + "learning_rate": 0.00028341986846922024, + "loss": 3.0221238136291504, + "step": 6603, + "token_acc": 0.2973992997224102 + }, + { + "epoch": 3.871298739372618, + "grad_norm": 0.2373824010213088, + "learning_rate": 0.00028341322389225504, + "loss": 3.001662015914917, + "step": 6604, + "token_acc": 0.2989401155995495 + }, + { + "epoch": 3.871885077689827, + "grad_norm": 0.2533704241208796, + "learning_rate": 0.0002834065780620523, + "loss": 3.0400359630584717, + "step": 6605, + "token_acc": 0.2952310975823099 + }, + { + "epoch": 3.872471416007036, + "grad_norm": 0.26152496031236205, + "learning_rate": 0.00028339993097867456, + "loss": 3.020984172821045, + "step": 6606, + "token_acc": 0.295472418829345 + }, + { + "epoch": 3.873057754324245, + "grad_norm": 0.2310290416501084, + "learning_rate": 0.0002833932826421843, + "loss": 3.012481212615967, + "step": 6607, + "token_acc": 0.29777241009125066 + }, + { + "epoch": 3.873644092641454, + "grad_norm": 0.24399349264956485, + "learning_rate": 0.00028338663305264383, + "loss": 3.0890214443206787, + "step": 6608, + "token_acc": 0.2859329061439879 + }, + { + "epoch": 3.8742304309586633, + "grad_norm": 0.24575260109750743, + "learning_rate": 0.00028337998221011565, + "loss": 3.0164601802825928, + "step": 6609, + "token_acc": 0.2988069530838732 + }, + { + "epoch": 3.8748167692758724, + "grad_norm": 0.2620952843350049, + "learning_rate": 0.00028337333011466234, + "loss": 3.037508964538574, + "step": 6610, + "token_acc": 0.29496751647508385 + }, + { + "epoch": 3.875403107593081, + "grad_norm": 0.25403054902000055, + "learning_rate": 0.00028336667676634626, + "loss": 3.0987722873687744, + "step": 6611, + "token_acc": 0.2855646543515521 + }, + { + "epoch": 3.87598944591029, + "grad_norm": 0.27660082799746244, + "learning_rate": 0.00028336002216523, + "loss": 3.0839157104492188, + "step": 6612, + "token_acc": 0.28824707707796016 + }, + { + "epoch": 3.8765757842274993, + "grad_norm": 0.2746152488750893, + "learning_rate": 0.00028335336631137606, + "loss": 3.0903825759887695, + "step": 6613, + "token_acc": 0.28755831284200783 + }, + { + "epoch": 3.8771621225447084, + "grad_norm": 0.26731093173228676, + "learning_rate": 0.0002833467092048469, + "loss": 3.054202079772949, + "step": 6614, + "token_acc": 0.29244808526804106 + }, + { + "epoch": 3.8777484608619175, + "grad_norm": 0.2356788986306146, + "learning_rate": 0.00028334005084570507, + "loss": 3.08854341506958, + "step": 6615, + "token_acc": 0.2858565916979574 + }, + { + "epoch": 3.878334799179126, + "grad_norm": 0.24847653646846418, + "learning_rate": 0.0002833333912340132, + "loss": 3.055333137512207, + "step": 6616, + "token_acc": 0.29032573331790185 + }, + { + "epoch": 3.8789211374963353, + "grad_norm": 0.2680996499132124, + "learning_rate": 0.00028332673036983376, + "loss": 3.033504009246826, + "step": 6617, + "token_acc": 0.2951832670794993 + }, + { + "epoch": 3.8795074758135444, + "grad_norm": 0.2373471970828621, + "learning_rate": 0.00028332006825322934, + "loss": 3.044600009918213, + "step": 6618, + "token_acc": 0.2925640502194549 + }, + { + "epoch": 3.8800938141307535, + "grad_norm": 0.2451909437184239, + "learning_rate": 0.00028331340488426255, + "loss": 3.0642850399017334, + "step": 6619, + "token_acc": 0.29007979263284456 + }, + { + "epoch": 3.8806801524479626, + "grad_norm": 0.2322820559028209, + "learning_rate": 0.00028330674026299596, + "loss": 3.0462963581085205, + "step": 6620, + "token_acc": 0.2942489288740281 + }, + { + "epoch": 3.8812664907651717, + "grad_norm": 0.2318134110493184, + "learning_rate": 0.0002833000743894922, + "loss": 3.0335474014282227, + "step": 6621, + "token_acc": 0.294932730367727 + }, + { + "epoch": 3.8818528290823804, + "grad_norm": 0.22933370111301155, + "learning_rate": 0.0002832934072638138, + "loss": 3.0827155113220215, + "step": 6622, + "token_acc": 0.2874256210055725 + }, + { + "epoch": 3.8824391673995895, + "grad_norm": 0.2459140686649064, + "learning_rate": 0.0002832867388860235, + "loss": 3.0600996017456055, + "step": 6623, + "token_acc": 0.29206578757082946 + }, + { + "epoch": 3.8830255057167986, + "grad_norm": 0.2552504096456068, + "learning_rate": 0.00028328006925618386, + "loss": 3.0486087799072266, + "step": 6624, + "token_acc": 0.29198817223596235 + }, + { + "epoch": 3.8836118440340077, + "grad_norm": 0.24279725935769475, + "learning_rate": 0.0002832733983743576, + "loss": 3.0308966636657715, + "step": 6625, + "token_acc": 0.2939305303498829 + }, + { + "epoch": 3.8841981823512164, + "grad_norm": 0.2330557417018749, + "learning_rate": 0.0002832667262406074, + "loss": 3.0677649974823, + "step": 6626, + "token_acc": 0.28943839104932473 + }, + { + "epoch": 3.8847845206684255, + "grad_norm": 0.2321188613082975, + "learning_rate": 0.0002832600528549958, + "loss": 3.0766537189483643, + "step": 6627, + "token_acc": 0.2888601946751986 + }, + { + "epoch": 3.8853708589856346, + "grad_norm": 0.24103151703933487, + "learning_rate": 0.0002832533782175856, + "loss": 3.073457717895508, + "step": 6628, + "token_acc": 0.2888510706724198 + }, + { + "epoch": 3.8859571973028437, + "grad_norm": 0.23142845261762923, + "learning_rate": 0.00028324670232843946, + "loss": 3.0297064781188965, + "step": 6629, + "token_acc": 0.2975073144078358 + }, + { + "epoch": 3.886543535620053, + "grad_norm": 0.2348195312800062, + "learning_rate": 0.0002832400251876201, + "loss": 3.0747756958007812, + "step": 6630, + "token_acc": 0.28996943944404907 + }, + { + "epoch": 3.887129873937262, + "grad_norm": 0.23133719353892598, + "learning_rate": 0.00028323334679519025, + "loss": 3.0564613342285156, + "step": 6631, + "token_acc": 0.2911819747783 + }, + { + "epoch": 3.887716212254471, + "grad_norm": 0.21218171517766488, + "learning_rate": 0.00028322666715121267, + "loss": 3.043491840362549, + "step": 6632, + "token_acc": 0.2940700398732921 + }, + { + "epoch": 3.8883025505716797, + "grad_norm": 0.2190640585921852, + "learning_rate": 0.0002832199862557501, + "loss": 3.0259041786193848, + "step": 6633, + "token_acc": 0.2967145502138798 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 0.22122582908901495, + "learning_rate": 0.0002832133041088652, + "loss": 3.01729416847229, + "step": 6634, + "token_acc": 0.29782185836735564 + }, + { + "epoch": 3.889475227206098, + "grad_norm": 0.23598118996322903, + "learning_rate": 0.00028320662071062085, + "loss": 3.1118011474609375, + "step": 6635, + "token_acc": 0.28577138709779215 + }, + { + "epoch": 3.890061565523307, + "grad_norm": 0.21292860098561023, + "learning_rate": 0.0002831999360610798, + "loss": 3.0199241638183594, + "step": 6636, + "token_acc": 0.295727015569823 + }, + { + "epoch": 3.8906479038405157, + "grad_norm": 0.2377487729979134, + "learning_rate": 0.00028319325016030485, + "loss": 3.0562400817871094, + "step": 6637, + "token_acc": 0.2933976184043725 + }, + { + "epoch": 3.891234242157725, + "grad_norm": 0.23839287887917152, + "learning_rate": 0.0002831865630083588, + "loss": 3.0656983852386475, + "step": 6638, + "token_acc": 0.29149322090124025 + }, + { + "epoch": 3.891820580474934, + "grad_norm": 0.2344282854684225, + "learning_rate": 0.00028317987460530446, + "loss": 3.053492546081543, + "step": 6639, + "token_acc": 0.2933444956048556 + }, + { + "epoch": 3.892406918792143, + "grad_norm": 0.240011301026954, + "learning_rate": 0.00028317318495120464, + "loss": 3.0419912338256836, + "step": 6640, + "token_acc": 0.29301302656064965 + }, + { + "epoch": 3.892993257109352, + "grad_norm": 0.2574692756331428, + "learning_rate": 0.0002831664940461222, + "loss": 3.056633949279785, + "step": 6641, + "token_acc": 0.2919304437720757 + }, + { + "epoch": 3.8935795954265613, + "grad_norm": 0.23571932464319284, + "learning_rate": 0.00028315980189012, + "loss": 3.031526565551758, + "step": 6642, + "token_acc": 0.29528465157070166 + }, + { + "epoch": 3.8941659337437704, + "grad_norm": 0.25688858352677796, + "learning_rate": 0.0002831531084832609, + "loss": 3.0825862884521484, + "step": 6643, + "token_acc": 0.28788494565858647 + }, + { + "epoch": 3.894752272060979, + "grad_norm": 0.2430288280200242, + "learning_rate": 0.0002831464138256078, + "loss": 3.008242607116699, + "step": 6644, + "token_acc": 0.2990035356753757 + }, + { + "epoch": 3.895338610378188, + "grad_norm": 0.21856867612893754, + "learning_rate": 0.0002831397179172235, + "loss": 3.0225906372070312, + "step": 6645, + "token_acc": 0.2971281419405728 + }, + { + "epoch": 3.8959249486953973, + "grad_norm": 0.23243212292273516, + "learning_rate": 0.000283133020758171, + "loss": 3.030179977416992, + "step": 6646, + "token_acc": 0.2954980339382215 + }, + { + "epoch": 3.8965112870126064, + "grad_norm": 0.20196987227985078, + "learning_rate": 0.00028312632234851314, + "loss": 3.0433619022369385, + "step": 6647, + "token_acc": 0.29319429975827577 + }, + { + "epoch": 3.897097625329815, + "grad_norm": 0.21897602745318612, + "learning_rate": 0.0002831196226883129, + "loss": 3.0525031089782715, + "step": 6648, + "token_acc": 0.294011519871521 + }, + { + "epoch": 3.897683963647024, + "grad_norm": 0.2200529194092275, + "learning_rate": 0.00028311292177763314, + "loss": 3.0778188705444336, + "step": 6649, + "token_acc": 0.2871573528333422 + }, + { + "epoch": 3.8982703019642333, + "grad_norm": 0.2267538829482773, + "learning_rate": 0.0002831062196165369, + "loss": 3.0224833488464355, + "step": 6650, + "token_acc": 0.2988349577753551 + }, + { + "epoch": 3.8988566402814424, + "grad_norm": 0.24861182609681137, + "learning_rate": 0.00028309951620508707, + "loss": 3.1081113815307617, + "step": 6651, + "token_acc": 0.28512684490152074 + }, + { + "epoch": 3.8994429785986515, + "grad_norm": 0.2263817309059537, + "learning_rate": 0.00028309281154334666, + "loss": 3.0390381813049316, + "step": 6652, + "token_acc": 0.2940059439178993 + }, + { + "epoch": 3.9000293169158606, + "grad_norm": 0.24415035959988496, + "learning_rate": 0.0002830861056313786, + "loss": 3.051593780517578, + "step": 6653, + "token_acc": 0.2917268461170762 + }, + { + "epoch": 3.9006156552330697, + "grad_norm": 0.24121070679955597, + "learning_rate": 0.0002830793984692459, + "loss": 3.0274524688720703, + "step": 6654, + "token_acc": 0.2951560144786358 + }, + { + "epoch": 3.9012019935502784, + "grad_norm": 0.21522579831353206, + "learning_rate": 0.00028307269005701163, + "loss": 3.0161027908325195, + "step": 6655, + "token_acc": 0.298262869711356 + }, + { + "epoch": 3.9017883318674875, + "grad_norm": 0.25189247809608206, + "learning_rate": 0.00028306598039473874, + "loss": 3.039217710494995, + "step": 6656, + "token_acc": 0.2941456313203525 + }, + { + "epoch": 3.9023746701846966, + "grad_norm": 0.23040877993184575, + "learning_rate": 0.00028305926948249025, + "loss": 3.077394962310791, + "step": 6657, + "token_acc": 0.2898218357629248 + }, + { + "epoch": 3.9029610085019057, + "grad_norm": 0.24157620149121253, + "learning_rate": 0.0002830525573203292, + "loss": 3.0572948455810547, + "step": 6658, + "token_acc": 0.2920584980869472 + }, + { + "epoch": 3.9035473468191144, + "grad_norm": 0.2559401477063671, + "learning_rate": 0.0002830458439083187, + "loss": 2.98641300201416, + "step": 6659, + "token_acc": 0.30239962641817814 + }, + { + "epoch": 3.9041336851363235, + "grad_norm": 0.2402687713045518, + "learning_rate": 0.0002830391292465218, + "loss": 3.0067455768585205, + "step": 6660, + "token_acc": 0.2990709078355795 + }, + { + "epoch": 3.9047200234535326, + "grad_norm": 0.24293552929677187, + "learning_rate": 0.00028303241333500154, + "loss": 3.026005744934082, + "step": 6661, + "token_acc": 0.29507943236318945 + }, + { + "epoch": 3.9053063617707418, + "grad_norm": 0.25220635120742474, + "learning_rate": 0.000283025696173821, + "loss": 3.0515077114105225, + "step": 6662, + "token_acc": 0.29288520413624786 + }, + { + "epoch": 3.905892700087951, + "grad_norm": 0.2422910642741095, + "learning_rate": 0.0002830189777630433, + "loss": 3.047861337661743, + "step": 6663, + "token_acc": 0.29250611053426595 + }, + { + "epoch": 3.90647903840516, + "grad_norm": 0.2658601922689883, + "learning_rate": 0.0002830122581027316, + "loss": 3.049682140350342, + "step": 6664, + "token_acc": 0.29529421425496494 + }, + { + "epoch": 3.9070653767223686, + "grad_norm": 0.2533509313923523, + "learning_rate": 0.0002830055371929489, + "loss": 3.0606789588928223, + "step": 6665, + "token_acc": 0.29164674361638754 + }, + { + "epoch": 3.9076517150395778, + "grad_norm": 0.23658192961767557, + "learning_rate": 0.00028299881503375844, + "loss": 3.066725492477417, + "step": 6666, + "token_acc": 0.2918875125931499 + }, + { + "epoch": 3.908238053356787, + "grad_norm": 0.23952168878554228, + "learning_rate": 0.00028299209162522334, + "loss": 3.0528571605682373, + "step": 6667, + "token_acc": 0.2923870185016682 + }, + { + "epoch": 3.908824391673996, + "grad_norm": 0.2514848020353997, + "learning_rate": 0.0002829853669674067, + "loss": 3.046949863433838, + "step": 6668, + "token_acc": 0.29156380198224774 + }, + { + "epoch": 3.909410729991205, + "grad_norm": 0.23267417649085526, + "learning_rate": 0.0002829786410603718, + "loss": 3.048292398452759, + "step": 6669, + "token_acc": 0.2930078427626901 + }, + { + "epoch": 3.9099970683084138, + "grad_norm": 0.23497326802674273, + "learning_rate": 0.00028297191390418174, + "loss": 3.0763468742370605, + "step": 6670, + "token_acc": 0.28934701925307355 + }, + { + "epoch": 3.910583406625623, + "grad_norm": 0.220059342250436, + "learning_rate": 0.0002829651854988997, + "loss": 3.0775368213653564, + "step": 6671, + "token_acc": 0.2896742716706242 + }, + { + "epoch": 3.911169744942832, + "grad_norm": 0.23823364822274698, + "learning_rate": 0.0002829584558445889, + "loss": 3.0467519760131836, + "step": 6672, + "token_acc": 0.2921551505882474 + }, + { + "epoch": 3.911756083260041, + "grad_norm": 0.21311213306909843, + "learning_rate": 0.00028295172494131264, + "loss": 3.062983989715576, + "step": 6673, + "token_acc": 0.2925994728868556 + }, + { + "epoch": 3.91234242157725, + "grad_norm": 0.21365425497274132, + "learning_rate": 0.00028294499278913403, + "loss": 3.0269436836242676, + "step": 6674, + "token_acc": 0.29536568084603876 + }, + { + "epoch": 3.9129287598944593, + "grad_norm": 0.2424438649726307, + "learning_rate": 0.0002829382593881164, + "loss": 3.036242961883545, + "step": 6675, + "token_acc": 0.29537313243377045 + }, + { + "epoch": 3.913515098211668, + "grad_norm": 0.229441906280608, + "learning_rate": 0.00028293152473832294, + "loss": 3.0574142932891846, + "step": 6676, + "token_acc": 0.2928149593744818 + }, + { + "epoch": 3.914101436528877, + "grad_norm": 0.2151297514005966, + "learning_rate": 0.00028292478883981686, + "loss": 3.011045455932617, + "step": 6677, + "token_acc": 0.29947034535286926 + }, + { + "epoch": 3.914687774846086, + "grad_norm": 0.22124619801365442, + "learning_rate": 0.0002829180516926616, + "loss": 2.9960131645202637, + "step": 6678, + "token_acc": 0.30058884507283085 + }, + { + "epoch": 3.9152741131632953, + "grad_norm": 0.21602277680782794, + "learning_rate": 0.00028291131329692036, + "loss": 3.0507607460021973, + "step": 6679, + "token_acc": 0.2923447635714685 + }, + { + "epoch": 3.915860451480504, + "grad_norm": 0.2301622597546687, + "learning_rate": 0.0002829045736526564, + "loss": 3.059081554412842, + "step": 6680, + "token_acc": 0.2915253636759361 + }, + { + "epoch": 3.916446789797713, + "grad_norm": 0.21828523206645992, + "learning_rate": 0.00028289783275993306, + "loss": 3.0572023391723633, + "step": 6681, + "token_acc": 0.29058084607210866 + }, + { + "epoch": 3.917033128114922, + "grad_norm": 0.22377395322168317, + "learning_rate": 0.0002828910906188137, + "loss": 3.071209669113159, + "step": 6682, + "token_acc": 0.2894880138815431 + }, + { + "epoch": 3.9176194664321313, + "grad_norm": 0.2436337013254541, + "learning_rate": 0.00028288434722936154, + "loss": 3.084959030151367, + "step": 6683, + "token_acc": 0.28897769674355495 + }, + { + "epoch": 3.9182058047493404, + "grad_norm": 0.2153056830038619, + "learning_rate": 0.00028287760259164, + "loss": 3.0685982704162598, + "step": 6684, + "token_acc": 0.2904121484077443 + }, + { + "epoch": 3.9187921430665495, + "grad_norm": 0.247665457436989, + "learning_rate": 0.0002828708567057125, + "loss": 3.0498600006103516, + "step": 6685, + "token_acc": 0.29285000936271144 + }, + { + "epoch": 3.9193784813837587, + "grad_norm": 0.21787675831426945, + "learning_rate": 0.00028286410957164236, + "loss": 3.0233664512634277, + "step": 6686, + "token_acc": 0.2992248062015504 + }, + { + "epoch": 3.9199648197009673, + "grad_norm": 0.23429970808200723, + "learning_rate": 0.0002828573611894929, + "loss": 3.039573907852173, + "step": 6687, + "token_acc": 0.2936452677249227 + }, + { + "epoch": 3.9205511580181764, + "grad_norm": 0.24268162443909458, + "learning_rate": 0.00028285061155932756, + "loss": 3.0115113258361816, + "step": 6688, + "token_acc": 0.2984331315550459 + }, + { + "epoch": 3.9211374963353856, + "grad_norm": 0.2377186158223472, + "learning_rate": 0.0002828438606812098, + "loss": 3.03489089012146, + "step": 6689, + "token_acc": 0.2942436405227374 + }, + { + "epoch": 3.9217238346525947, + "grad_norm": 0.22840378749967277, + "learning_rate": 0.0002828371085552029, + "loss": 3.066713809967041, + "step": 6690, + "token_acc": 0.28850421889630984 + }, + { + "epoch": 3.9223101729698033, + "grad_norm": 0.22149131932269528, + "learning_rate": 0.0002828303551813704, + "loss": 3.077237844467163, + "step": 6691, + "token_acc": 0.2905670777923241 + }, + { + "epoch": 3.9228965112870124, + "grad_norm": 0.2619601614382145, + "learning_rate": 0.0002828236005597757, + "loss": 3.077540159225464, + "step": 6692, + "token_acc": 0.2898365925257431 + }, + { + "epoch": 3.9234828496042216, + "grad_norm": 0.24872294504738326, + "learning_rate": 0.00028281684469048224, + "loss": 3.041816234588623, + "step": 6693, + "token_acc": 0.29410757965523326 + }, + { + "epoch": 3.9240691879214307, + "grad_norm": 0.22309713252723834, + "learning_rate": 0.00028281008757355354, + "loss": 3.0555336475372314, + "step": 6694, + "token_acc": 0.29141607435757244 + }, + { + "epoch": 3.92465552623864, + "grad_norm": 0.23591044867040722, + "learning_rate": 0.000282803329209053, + "loss": 3.0738399028778076, + "step": 6695, + "token_acc": 0.2887602361454961 + }, + { + "epoch": 3.925241864555849, + "grad_norm": 0.24836022703718452, + "learning_rate": 0.00028279656959704415, + "loss": 3.0278615951538086, + "step": 6696, + "token_acc": 0.2959969600482199 + }, + { + "epoch": 3.925828202873058, + "grad_norm": 0.23225732879979902, + "learning_rate": 0.0002827898087375905, + "loss": 3.044938087463379, + "step": 6697, + "token_acc": 0.29305999351826806 + }, + { + "epoch": 3.9264145411902667, + "grad_norm": 0.23969872674106554, + "learning_rate": 0.00028278304663075546, + "loss": 3.036458730697632, + "step": 6698, + "token_acc": 0.2935539590219149 + }, + { + "epoch": 3.927000879507476, + "grad_norm": 0.22358460034022487, + "learning_rate": 0.0002827762832766027, + "loss": 3.0479438304901123, + "step": 6699, + "token_acc": 0.29341431304383603 + }, + { + "epoch": 3.927587217824685, + "grad_norm": 0.22974965522157517, + "learning_rate": 0.0002827695186751956, + "loss": 3.0245275497436523, + "step": 6700, + "token_acc": 0.29747486804063555 + }, + { + "epoch": 3.928173556141894, + "grad_norm": 0.23460071253228973, + "learning_rate": 0.00028276275282659787, + "loss": 3.077690839767456, + "step": 6701, + "token_acc": 0.28635902958130105 + }, + { + "epoch": 3.9287598944591027, + "grad_norm": 0.24000515527506108, + "learning_rate": 0.0002827559857308729, + "loss": 3.107961654663086, + "step": 6702, + "token_acc": 0.2855098693362941 + }, + { + "epoch": 3.929346232776312, + "grad_norm": 0.2673207884776005, + "learning_rate": 0.00028274921738808437, + "loss": 3.0990211963653564, + "step": 6703, + "token_acc": 0.2870714370080854 + }, + { + "epoch": 3.929932571093521, + "grad_norm": 0.2208192082560746, + "learning_rate": 0.00028274244779829584, + "loss": 3.0569448471069336, + "step": 6704, + "token_acc": 0.29163594127736586 + }, + { + "epoch": 3.93051890941073, + "grad_norm": 0.25270103199251015, + "learning_rate": 0.00028273567696157083, + "loss": 3.0339744091033936, + "step": 6705, + "token_acc": 0.2953227827250867 + }, + { + "epoch": 3.931105247727939, + "grad_norm": 0.21903473677944438, + "learning_rate": 0.0002827289048779731, + "loss": 3.049659252166748, + "step": 6706, + "token_acc": 0.29200966655526134 + }, + { + "epoch": 3.9316915860451482, + "grad_norm": 0.2224460949350582, + "learning_rate": 0.0002827221315475661, + "loss": 3.0640735626220703, + "step": 6707, + "token_acc": 0.2895073136970784 + }, + { + "epoch": 3.9322779243623573, + "grad_norm": 0.2659978445207278, + "learning_rate": 0.00028271535697041347, + "loss": 3.0741114616394043, + "step": 6708, + "token_acc": 0.28919554480808246 + }, + { + "epoch": 3.932864262679566, + "grad_norm": 0.21538648632539104, + "learning_rate": 0.00028270858114657894, + "loss": 3.0534486770629883, + "step": 6709, + "token_acc": 0.29199621766557426 + }, + { + "epoch": 3.933450600996775, + "grad_norm": 0.2643604392624956, + "learning_rate": 0.00028270180407612616, + "loss": 3.0857062339782715, + "step": 6710, + "token_acc": 0.2891350005526985 + }, + { + "epoch": 3.9340369393139842, + "grad_norm": 0.23725361249887983, + "learning_rate": 0.0002826950257591187, + "loss": 3.066263198852539, + "step": 6711, + "token_acc": 0.29015242773632033 + }, + { + "epoch": 3.9346232776311933, + "grad_norm": 0.24037034977374608, + "learning_rate": 0.0002826882461956203, + "loss": 3.0116324424743652, + "step": 6712, + "token_acc": 0.2982022425949882 + }, + { + "epoch": 3.935209615948402, + "grad_norm": 0.24112948827113753, + "learning_rate": 0.0002826814653856946, + "loss": 3.0219054222106934, + "step": 6713, + "token_acc": 0.2963651780522232 + }, + { + "epoch": 3.935795954265611, + "grad_norm": 0.24683971154497653, + "learning_rate": 0.00028267468332940533, + "loss": 3.0734739303588867, + "step": 6714, + "token_acc": 0.29043177413412924 + }, + { + "epoch": 3.9363822925828202, + "grad_norm": 0.2548656739647278, + "learning_rate": 0.0002826679000268162, + "loss": 3.035954236984253, + "step": 6715, + "token_acc": 0.29439086460745334 + }, + { + "epoch": 3.9369686309000294, + "grad_norm": 0.2530366248111408, + "learning_rate": 0.0002826611154779909, + "loss": 3.0658559799194336, + "step": 6716, + "token_acc": 0.28949949910880396 + }, + { + "epoch": 3.9375549692172385, + "grad_norm": 0.24643513463827965, + "learning_rate": 0.0002826543296829933, + "loss": 3.034306287765503, + "step": 6717, + "token_acc": 0.29474731105576796 + }, + { + "epoch": 3.9381413075344476, + "grad_norm": 0.24602544672348126, + "learning_rate": 0.0002826475426418869, + "loss": 3.092775821685791, + "step": 6718, + "token_acc": 0.2878916682142885 + }, + { + "epoch": 3.9387276458516562, + "grad_norm": 0.2636126528223883, + "learning_rate": 0.0002826407543547356, + "loss": 2.9968972206115723, + "step": 6719, + "token_acc": 0.2993412675283486 + }, + { + "epoch": 3.9393139841688654, + "grad_norm": 0.2540048650595831, + "learning_rate": 0.00028263396482160316, + "loss": 3.062431812286377, + "step": 6720, + "token_acc": 0.28976514474859455 + }, + { + "epoch": 3.9399003224860745, + "grad_norm": 0.2492990567503665, + "learning_rate": 0.00028262717404255335, + "loss": 3.090872049331665, + "step": 6721, + "token_acc": 0.2874532609885051 + }, + { + "epoch": 3.9404866608032836, + "grad_norm": 0.2571005349092623, + "learning_rate": 0.00028262038201764996, + "loss": 3.0898656845092773, + "step": 6722, + "token_acc": 0.28764725523545187 + }, + { + "epoch": 3.9410729991204922, + "grad_norm": 0.2683856831346269, + "learning_rate": 0.0002826135887469568, + "loss": 3.0476696491241455, + "step": 6723, + "token_acc": 0.2931884832415876 + }, + { + "epoch": 3.9416593374377014, + "grad_norm": 0.22763003596053677, + "learning_rate": 0.00028260679423053764, + "loss": 3.079838275909424, + "step": 6724, + "token_acc": 0.2879815719414467 + }, + { + "epoch": 3.9422456757549105, + "grad_norm": 0.24956691581811927, + "learning_rate": 0.0002825999984684564, + "loss": 3.0442395210266113, + "step": 6725, + "token_acc": 0.29456341436703165 + }, + { + "epoch": 3.9428320140721196, + "grad_norm": 0.2338324957094939, + "learning_rate": 0.00028259320146077675, + "loss": 3.0346221923828125, + "step": 6726, + "token_acc": 0.2945282817140875 + }, + { + "epoch": 3.9434183523893287, + "grad_norm": 0.2003148380291176, + "learning_rate": 0.00028258640320756275, + "loss": 3.077000617980957, + "step": 6727, + "token_acc": 0.28916574925444105 + }, + { + "epoch": 3.944004690706538, + "grad_norm": 0.22970754977012314, + "learning_rate": 0.0002825796037088781, + "loss": 3.0199272632598877, + "step": 6728, + "token_acc": 0.296481563021596 + }, + { + "epoch": 3.944591029023747, + "grad_norm": 0.20966191895594977, + "learning_rate": 0.00028257280296478676, + "loss": 3.0140631198883057, + "step": 6729, + "token_acc": 0.2978401722279798 + }, + { + "epoch": 3.9451773673409556, + "grad_norm": 0.24261714395519132, + "learning_rate": 0.00028256600097535255, + "loss": 3.0535202026367188, + "step": 6730, + "token_acc": 0.29231966182134106 + }, + { + "epoch": 3.9457637056581647, + "grad_norm": 0.24356005486577106, + "learning_rate": 0.00028255919774063944, + "loss": 3.0494778156280518, + "step": 6731, + "token_acc": 0.29256322945897734 + }, + { + "epoch": 3.946350043975374, + "grad_norm": 0.22007016055548814, + "learning_rate": 0.00028255239326071126, + "loss": 3.0662083625793457, + "step": 6732, + "token_acc": 0.28933919687545495 + }, + { + "epoch": 3.946936382292583, + "grad_norm": 0.24723471292230187, + "learning_rate": 0.00028254558753563195, + "loss": 3.0725972652435303, + "step": 6733, + "token_acc": 0.2899561651287845 + }, + { + "epoch": 3.9475227206097916, + "grad_norm": 0.22379044997419228, + "learning_rate": 0.00028253878056546544, + "loss": 3.030632972717285, + "step": 6734, + "token_acc": 0.2946472541765055 + }, + { + "epoch": 3.9481090589270007, + "grad_norm": 0.26448024299971945, + "learning_rate": 0.0002825319723502757, + "loss": 3.016324043273926, + "step": 6735, + "token_acc": 0.29764991573667376 + }, + { + "epoch": 3.94869539724421, + "grad_norm": 0.20742505329074623, + "learning_rate": 0.0002825251628901267, + "loss": 3.032114028930664, + "step": 6736, + "token_acc": 0.2954364323967613 + }, + { + "epoch": 3.949281735561419, + "grad_norm": 0.22612951347272905, + "learning_rate": 0.0002825183521850823, + "loss": 3.0563697814941406, + "step": 6737, + "token_acc": 0.29301349914773533 + }, + { + "epoch": 3.949868073878628, + "grad_norm": 0.21894697569714666, + "learning_rate": 0.00028251154023520666, + "loss": 3.0927939414978027, + "step": 6738, + "token_acc": 0.2872158259652694 + }, + { + "epoch": 3.950454412195837, + "grad_norm": 0.22694070792351537, + "learning_rate": 0.0002825047270405636, + "loss": 3.068131446838379, + "step": 6739, + "token_acc": 0.2902431478038964 + }, + { + "epoch": 3.9510407505130463, + "grad_norm": 0.22449641283921373, + "learning_rate": 0.00028249791260121713, + "loss": 3.0866427421569824, + "step": 6740, + "token_acc": 0.2868734296083856 + }, + { + "epoch": 3.951627088830255, + "grad_norm": 0.23892090333774155, + "learning_rate": 0.0002824910969172314, + "loss": 3.0920932292938232, + "step": 6741, + "token_acc": 0.2856788265880571 + }, + { + "epoch": 3.952213427147464, + "grad_norm": 0.2215545460394356, + "learning_rate": 0.0002824842799886703, + "loss": 3.056422233581543, + "step": 6742, + "token_acc": 0.2921752030638441 + }, + { + "epoch": 3.952799765464673, + "grad_norm": 0.22046291260055736, + "learning_rate": 0.00028247746181559797, + "loss": 3.0602200031280518, + "step": 6743, + "token_acc": 0.29340861262053586 + }, + { + "epoch": 3.9533861037818823, + "grad_norm": 0.2172021756251513, + "learning_rate": 0.00028247064239807836, + "loss": 3.085911750793457, + "step": 6744, + "token_acc": 0.28657611711545067 + }, + { + "epoch": 3.953972442099091, + "grad_norm": 0.2038759613470377, + "learning_rate": 0.0002824638217361756, + "loss": 3.0290327072143555, + "step": 6745, + "token_acc": 0.29515317437254335 + }, + { + "epoch": 3.9545587804163, + "grad_norm": 0.20382682934560034, + "learning_rate": 0.0002824569998299537, + "loss": 3.0598413944244385, + "step": 6746, + "token_acc": 0.2921572943507141 + }, + { + "epoch": 3.955145118733509, + "grad_norm": 0.23065389452544796, + "learning_rate": 0.0002824501766794768, + "loss": 3.057029962539673, + "step": 6747, + "token_acc": 0.2927866257366862 + }, + { + "epoch": 3.9557314570507183, + "grad_norm": 0.2292896347765202, + "learning_rate": 0.0002824433522848089, + "loss": 3.0636110305786133, + "step": 6748, + "token_acc": 0.29130186852679457 + }, + { + "epoch": 3.9563177953679274, + "grad_norm": 0.21588582234504008, + "learning_rate": 0.00028243652664601424, + "loss": 3.063899517059326, + "step": 6749, + "token_acc": 0.2915826544420954 + }, + { + "epoch": 3.9569041336851365, + "grad_norm": 0.22809188685895732, + "learning_rate": 0.0002824296997631569, + "loss": 3.066152811050415, + "step": 6750, + "token_acc": 0.2892397546567405 + }, + { + "epoch": 3.9574904720023456, + "grad_norm": 0.24281547053455135, + "learning_rate": 0.0002824228716363009, + "loss": 3.0969042778015137, + "step": 6751, + "token_acc": 0.2870656946868269 + }, + { + "epoch": 3.9580768103195543, + "grad_norm": 0.25536229483991557, + "learning_rate": 0.0002824160422655105, + "loss": 3.0408742427825928, + "step": 6752, + "token_acc": 0.2921956865777896 + }, + { + "epoch": 3.9586631486367634, + "grad_norm": 0.24482224174153644, + "learning_rate": 0.0002824092116508498, + "loss": 3.0348386764526367, + "step": 6753, + "token_acc": 0.2960987391302055 + }, + { + "epoch": 3.9592494869539725, + "grad_norm": 0.2209886531571347, + "learning_rate": 0.000282402379792383, + "loss": 3.0129623413085938, + "step": 6754, + "token_acc": 0.29878544019471304 + }, + { + "epoch": 3.9598358252711816, + "grad_norm": 0.24886883430813175, + "learning_rate": 0.00028239554669017426, + "loss": 3.0871403217315674, + "step": 6755, + "token_acc": 0.2859506002543268 + }, + { + "epoch": 3.9604221635883903, + "grad_norm": 0.2617009560290949, + "learning_rate": 0.0002823887123442878, + "loss": 3.058096408843994, + "step": 6756, + "token_acc": 0.293069045567124 + }, + { + "epoch": 3.9610085019055994, + "grad_norm": 0.26339141345354955, + "learning_rate": 0.00028238187675478775, + "loss": 3.0440804958343506, + "step": 6757, + "token_acc": 0.2928679319629605 + }, + { + "epoch": 3.9615948402228085, + "grad_norm": 0.2766444856924938, + "learning_rate": 0.00028237503992173835, + "loss": 3.019359827041626, + "step": 6758, + "token_acc": 0.296670774903163 + }, + { + "epoch": 3.9621811785400176, + "grad_norm": 0.2182704398338543, + "learning_rate": 0.00028236820184520376, + "loss": 3.038116455078125, + "step": 6759, + "token_acc": 0.2940398511960773 + }, + { + "epoch": 3.9627675168572267, + "grad_norm": 0.24775618947801742, + "learning_rate": 0.0002823613625252484, + "loss": 3.04744553565979, + "step": 6760, + "token_acc": 0.2922109727946809 + }, + { + "epoch": 3.963353855174436, + "grad_norm": 0.26219953207804036, + "learning_rate": 0.00028235452196193636, + "loss": 3.0423405170440674, + "step": 6761, + "token_acc": 0.29404705963559713 + }, + { + "epoch": 3.963940193491645, + "grad_norm": 0.2220418614641057, + "learning_rate": 0.0002823476801553319, + "loss": 3.0230350494384766, + "step": 6762, + "token_acc": 0.29711144919832305 + }, + { + "epoch": 3.9645265318088536, + "grad_norm": 0.24710636080384274, + "learning_rate": 0.00028234083710549935, + "loss": 3.052283525466919, + "step": 6763, + "token_acc": 0.2929190069602352 + }, + { + "epoch": 3.9651128701260627, + "grad_norm": 0.2550042049378017, + "learning_rate": 0.00028233399281250295, + "loss": 3.040428638458252, + "step": 6764, + "token_acc": 0.29244324456988585 + }, + { + "epoch": 3.965699208443272, + "grad_norm": 0.24840276683217738, + "learning_rate": 0.000282327147276407, + "loss": 3.0092415809631348, + "step": 6765, + "token_acc": 0.29776719481739994 + }, + { + "epoch": 3.966285546760481, + "grad_norm": 0.20819458405542243, + "learning_rate": 0.00028232030049727585, + "loss": 2.9837443828582764, + "step": 6766, + "token_acc": 0.3022064371763849 + }, + { + "epoch": 3.9668718850776896, + "grad_norm": 0.24306443732799674, + "learning_rate": 0.00028231345247517383, + "loss": 3.0735344886779785, + "step": 6767, + "token_acc": 0.28807418646018623 + }, + { + "epoch": 3.9674582233948987, + "grad_norm": 0.25195504799063106, + "learning_rate": 0.0002823066032101651, + "loss": 3.084482192993164, + "step": 6768, + "token_acc": 0.2886679849369734 + }, + { + "epoch": 3.968044561712108, + "grad_norm": 0.2447755461825122, + "learning_rate": 0.0002822997527023142, + "loss": 3.080294132232666, + "step": 6769, + "token_acc": 0.2895466390910437 + }, + { + "epoch": 3.968630900029317, + "grad_norm": 0.2707098064708686, + "learning_rate": 0.0002822929009516854, + "loss": 3.0475926399230957, + "step": 6770, + "token_acc": 0.2932736771746989 + }, + { + "epoch": 3.969217238346526, + "grad_norm": 0.2397874020903097, + "learning_rate": 0.000282286047958343, + "loss": 2.9995970726013184, + "step": 6771, + "token_acc": 0.29842250026965805 + }, + { + "epoch": 3.969803576663735, + "grad_norm": 0.2532080909041808, + "learning_rate": 0.0002822791937223515, + "loss": 3.0110816955566406, + "step": 6772, + "token_acc": 0.2982383849412358 + }, + { + "epoch": 3.970389914980944, + "grad_norm": 0.23527375119915844, + "learning_rate": 0.0002822723382437752, + "loss": 3.0472917556762695, + "step": 6773, + "token_acc": 0.2930018491049113 + }, + { + "epoch": 3.970976253298153, + "grad_norm": 0.27297847923919916, + "learning_rate": 0.00028226548152267847, + "loss": 3.00388765335083, + "step": 6774, + "token_acc": 0.2988401617583037 + }, + { + "epoch": 3.971562591615362, + "grad_norm": 0.23163793971605773, + "learning_rate": 0.00028225862355912585, + "loss": 2.9881153106689453, + "step": 6775, + "token_acc": 0.30237324336279714 + }, + { + "epoch": 3.972148929932571, + "grad_norm": 0.2750650026383408, + "learning_rate": 0.0002822517643531817, + "loss": 3.047593832015991, + "step": 6776, + "token_acc": 0.29272037010144597 + }, + { + "epoch": 3.97273526824978, + "grad_norm": 0.23138577226141568, + "learning_rate": 0.00028224490390491036, + "loss": 3.0961873531341553, + "step": 6777, + "token_acc": 0.2854853260883614 + }, + { + "epoch": 3.973321606566989, + "grad_norm": 0.2589282498922663, + "learning_rate": 0.0002822380422143764, + "loss": 3.0568552017211914, + "step": 6778, + "token_acc": 0.29347016951756766 + }, + { + "epoch": 3.973907944884198, + "grad_norm": 0.24069344142515253, + "learning_rate": 0.0002822311792816442, + "loss": 3.0322418212890625, + "step": 6779, + "token_acc": 0.2938101052263572 + }, + { + "epoch": 3.974494283201407, + "grad_norm": 0.23617331941176517, + "learning_rate": 0.0002822243151067782, + "loss": 3.077845573425293, + "step": 6780, + "token_acc": 0.2880276408827941 + }, + { + "epoch": 3.9750806215186163, + "grad_norm": 0.25616509608122656, + "learning_rate": 0.000282217449689843, + "loss": 3.0179669857025146, + "step": 6781, + "token_acc": 0.29785113830901705 + }, + { + "epoch": 3.9756669598358254, + "grad_norm": 0.25183341893354994, + "learning_rate": 0.00028221058303090304, + "loss": 3.127781867980957, + "step": 6782, + "token_acc": 0.2830237457158142 + }, + { + "epoch": 3.9762532981530345, + "grad_norm": 0.22894376745665704, + "learning_rate": 0.0002822037151300228, + "loss": 3.043041706085205, + "step": 6783, + "token_acc": 0.2932302450801726 + }, + { + "epoch": 3.976839636470243, + "grad_norm": 0.23249955084840923, + "learning_rate": 0.0002821968459872668, + "loss": 3.039473533630371, + "step": 6784, + "token_acc": 0.2930569047363446 + }, + { + "epoch": 3.9774259747874523, + "grad_norm": 0.23642861366278178, + "learning_rate": 0.00028218997560269956, + "loss": 3.0834927558898926, + "step": 6785, + "token_acc": 0.2877023920819716 + }, + { + "epoch": 3.9780123131046614, + "grad_norm": 0.21646606616616612, + "learning_rate": 0.00028218310397638564, + "loss": 3.063615560531616, + "step": 6786, + "token_acc": 0.29074605564854217 + }, + { + "epoch": 3.9785986514218705, + "grad_norm": 0.242648606032343, + "learning_rate": 0.00028217623110838956, + "loss": 3.03950834274292, + "step": 6787, + "token_acc": 0.29262699472981074 + }, + { + "epoch": 3.979184989739079, + "grad_norm": 0.23327309791884834, + "learning_rate": 0.0002821693569987759, + "loss": 3.057147979736328, + "step": 6788, + "token_acc": 0.2917884813607042 + }, + { + "epoch": 3.9797713280562883, + "grad_norm": 0.23442939592708562, + "learning_rate": 0.0002821624816476092, + "loss": 3.0209579467773438, + "step": 6789, + "token_acc": 0.29614456162313724 + }, + { + "epoch": 3.9803576663734974, + "grad_norm": 0.22232839904852242, + "learning_rate": 0.00028215560505495414, + "loss": 3.0328869819641113, + "step": 6790, + "token_acc": 0.2960408493945079 + }, + { + "epoch": 3.9809440046907065, + "grad_norm": 0.224847599001003, + "learning_rate": 0.00028214872722087523, + "loss": 3.0409202575683594, + "step": 6791, + "token_acc": 0.2930351789286394 + }, + { + "epoch": 3.9815303430079156, + "grad_norm": 0.22511505498637185, + "learning_rate": 0.0002821418481454371, + "loss": 3.0554141998291016, + "step": 6792, + "token_acc": 0.2912018460148241 + }, + { + "epoch": 3.9821166813251248, + "grad_norm": 0.2517873437894732, + "learning_rate": 0.00028213496782870435, + "loss": 3.0371201038360596, + "step": 6793, + "token_acc": 0.2940547251557694 + }, + { + "epoch": 3.982703019642334, + "grad_norm": 0.22287730256972133, + "learning_rate": 0.00028212808627074167, + "loss": 3.0753560066223145, + "step": 6794, + "token_acc": 0.29199729111562595 + }, + { + "epoch": 3.9832893579595425, + "grad_norm": 0.22673259262250867, + "learning_rate": 0.00028212120347161367, + "loss": 3.033461093902588, + "step": 6795, + "token_acc": 0.29428491621517566 + }, + { + "epoch": 3.9838756962767516, + "grad_norm": 0.23363708940968483, + "learning_rate": 0.0002821143194313849, + "loss": 3.0398786067962646, + "step": 6796, + "token_acc": 0.2942784756676871 + }, + { + "epoch": 3.9844620345939608, + "grad_norm": 0.23625510437521774, + "learning_rate": 0.00028210743415012023, + "loss": 3.0347304344177246, + "step": 6797, + "token_acc": 0.2959248940298993 + }, + { + "epoch": 3.98504837291117, + "grad_norm": 0.2608626052020712, + "learning_rate": 0.0002821005476278842, + "loss": 3.053231716156006, + "step": 6798, + "token_acc": 0.2930150246581637 + }, + { + "epoch": 3.9856347112283785, + "grad_norm": 0.2522376154282012, + "learning_rate": 0.00028209365986474154, + "loss": 3.0779242515563965, + "step": 6799, + "token_acc": 0.2882339406473726 + }, + { + "epoch": 3.9862210495455876, + "grad_norm": 0.2261273395977032, + "learning_rate": 0.00028208677086075687, + "loss": 3.038626194000244, + "step": 6800, + "token_acc": 0.29485817843516327 + }, + { + "epoch": 3.9868073878627968, + "grad_norm": 0.24019710677047218, + "learning_rate": 0.0002820798806159951, + "loss": 3.1072871685028076, + "step": 6801, + "token_acc": 0.28583235229244675 + }, + { + "epoch": 3.987393726180006, + "grad_norm": 0.2512887802242321, + "learning_rate": 0.00028207298913052073, + "loss": 3.0856523513793945, + "step": 6802, + "token_acc": 0.28872333988806537 + }, + { + "epoch": 3.987980064497215, + "grad_norm": 0.23261684995397142, + "learning_rate": 0.00028206609640439866, + "loss": 3.059176445007324, + "step": 6803, + "token_acc": 0.29180901970219475 + }, + { + "epoch": 3.988566402814424, + "grad_norm": 0.2328778392300914, + "learning_rate": 0.00028205920243769354, + "loss": 3.0244178771972656, + "step": 6804, + "token_acc": 0.29575560543901747 + }, + { + "epoch": 3.989152741131633, + "grad_norm": 0.22089351747239355, + "learning_rate": 0.0002820523072304702, + "loss": 2.9568567276000977, + "step": 6805, + "token_acc": 0.30743478687322867 + }, + { + "epoch": 3.989739079448842, + "grad_norm": 0.21493245419057083, + "learning_rate": 0.0002820454107827934, + "loss": 3.0491855144500732, + "step": 6806, + "token_acc": 0.2921758956990125 + }, + { + "epoch": 3.990325417766051, + "grad_norm": 0.21334340386383263, + "learning_rate": 0.0002820385130947278, + "loss": 3.050236701965332, + "step": 6807, + "token_acc": 0.2908767636306971 + }, + { + "epoch": 3.99091175608326, + "grad_norm": 0.2375055989795221, + "learning_rate": 0.00028203161416633836, + "loss": 3.1043057441711426, + "step": 6808, + "token_acc": 0.2866864353246428 + }, + { + "epoch": 3.991498094400469, + "grad_norm": 0.20744213953214477, + "learning_rate": 0.0002820247139976898, + "loss": 3.0365242958068848, + "step": 6809, + "token_acc": 0.2955002008838891 + }, + { + "epoch": 3.992084432717678, + "grad_norm": 0.2316783924806943, + "learning_rate": 0.000282017812588847, + "loss": 3.03525447845459, + "step": 6810, + "token_acc": 0.2949135826552979 + }, + { + "epoch": 3.992670771034887, + "grad_norm": 0.23384033697557163, + "learning_rate": 0.00028201090993987466, + "loss": 3.025934934616089, + "step": 6811, + "token_acc": 0.2963743030252664 + }, + { + "epoch": 3.993257109352096, + "grad_norm": 0.2369093605530815, + "learning_rate": 0.0002820040060508378, + "loss": 3.0263609886169434, + "step": 6812, + "token_acc": 0.2979733603620206 + }, + { + "epoch": 3.993843447669305, + "grad_norm": 0.23293367127104092, + "learning_rate": 0.0002819971009218011, + "loss": 3.031742811203003, + "step": 6813, + "token_acc": 0.29453537357087256 + }, + { + "epoch": 3.9944297859865143, + "grad_norm": 0.21007110523762135, + "learning_rate": 0.0002819901945528296, + "loss": 3.032787322998047, + "step": 6814, + "token_acc": 0.2954955480023631 + }, + { + "epoch": 3.9950161243037234, + "grad_norm": 0.23303161242081089, + "learning_rate": 0.000281983286943988, + "loss": 3.039156913757324, + "step": 6815, + "token_acc": 0.2967778374880732 + }, + { + "epoch": 3.9956024626209325, + "grad_norm": 0.22806358375156352, + "learning_rate": 0.0002819763780953413, + "loss": 3.0107882022857666, + "step": 6816, + "token_acc": 0.3007929802952424 + }, + { + "epoch": 3.996188800938141, + "grad_norm": 0.24446090428430606, + "learning_rate": 0.0002819694680069544, + "loss": 3.000490665435791, + "step": 6817, + "token_acc": 0.2986192968498469 + }, + { + "epoch": 3.9967751392553503, + "grad_norm": 0.235042736585659, + "learning_rate": 0.00028196255667889213, + "loss": 3.060365915298462, + "step": 6818, + "token_acc": 0.29064286807735795 + }, + { + "epoch": 3.9973614775725594, + "grad_norm": 0.25854175721791595, + "learning_rate": 0.0002819556441112195, + "loss": 3.085367441177368, + "step": 6819, + "token_acc": 0.28709749942531493 + }, + { + "epoch": 3.9979478158897686, + "grad_norm": 0.27743564912082924, + "learning_rate": 0.0002819487303040014, + "loss": 3.0660033226013184, + "step": 6820, + "token_acc": 0.28893453495334753 + }, + { + "epoch": 3.998534154206977, + "grad_norm": 0.2407415566890626, + "learning_rate": 0.0002819418152573027, + "loss": 3.048217296600342, + "step": 6821, + "token_acc": 0.292888224748988 + }, + { + "epoch": 3.9991204925241863, + "grad_norm": 0.24866435752706523, + "learning_rate": 0.0002819348989711885, + "loss": 3.043728828430176, + "step": 6822, + "token_acc": 0.29371006598931604 + }, + { + "epoch": 3.9997068308413954, + "grad_norm": 0.23171331540245313, + "learning_rate": 0.00028192798144572375, + "loss": 3.0476438999176025, + "step": 6823, + "token_acc": 0.29490519648710994 + }, + { + "epoch": 4.0, + "grad_norm": 0.28439850357277063, + "learning_rate": 0.00028192106268097334, + "loss": 3.066741466522217, + "step": 6824, + "token_acc": 0.28813739645869746 + }, + { + "epoch": 4.0, + "eval_loss": 3.0832579135894775, + "eval_runtime": 8.6908, + "eval_samples_per_second": 29.456, + "eval_steps_per_second": 3.682, + "eval_token_acc": 0.2887123355594045, + "step": 6824 + }, + { + "epoch": 4.000586338317209, + "grad_norm": 0.28216336788056445, + "learning_rate": 0.00028191414267700235, + "loss": 2.944533348083496, + "step": 6825, + "token_acc": 0.3062234452842389 + }, + { + "epoch": 4.001172676634418, + "grad_norm": 0.27395479903507763, + "learning_rate": 0.0002819072214338757, + "loss": 2.967478036880493, + "step": 6826, + "token_acc": 0.30317066271160004 + }, + { + "epoch": 4.001759014951627, + "grad_norm": 0.27788378274825914, + "learning_rate": 0.0002819002989516585, + "loss": 2.9264707565307617, + "step": 6827, + "token_acc": 0.30926321163618753 + }, + { + "epoch": 4.0023453532688364, + "grad_norm": 0.2714292277794991, + "learning_rate": 0.00028189337523041566, + "loss": 2.9909849166870117, + "step": 6828, + "token_acc": 0.29895499045375146 + }, + { + "epoch": 4.002931691586046, + "grad_norm": 0.265384067148991, + "learning_rate": 0.0002818864502702123, + "loss": 2.9543075561523438, + "step": 6829, + "token_acc": 0.30500837891674837 + }, + { + "epoch": 4.003518029903254, + "grad_norm": 0.26683949037227045, + "learning_rate": 0.00028187952407111356, + "loss": 2.8907968997955322, + "step": 6830, + "token_acc": 0.3160368425123969 + }, + { + "epoch": 4.004104368220463, + "grad_norm": 0.23893140333650503, + "learning_rate": 0.00028187259663318433, + "loss": 2.916276454925537, + "step": 6831, + "token_acc": 0.3101819668247062 + }, + { + "epoch": 4.004690706537672, + "grad_norm": 0.2502459052824102, + "learning_rate": 0.0002818656679564897, + "loss": 2.937521457672119, + "step": 6832, + "token_acc": 0.30728913315319123 + }, + { + "epoch": 4.005277044854881, + "grad_norm": 0.26280534259129135, + "learning_rate": 0.0002818587380410949, + "loss": 2.925959587097168, + "step": 6833, + "token_acc": 0.30921082819930107 + }, + { + "epoch": 4.00586338317209, + "grad_norm": 0.26351394380053844, + "learning_rate": 0.0002818518068870649, + "loss": 2.9871985912323, + "step": 6834, + "token_acc": 0.3008091904541924 + }, + { + "epoch": 4.006449721489299, + "grad_norm": 0.2728771235082952, + "learning_rate": 0.00028184487449446486, + "loss": 3.002471446990967, + "step": 6835, + "token_acc": 0.3005390233454138 + }, + { + "epoch": 4.0070360598065085, + "grad_norm": 0.31299569159245527, + "learning_rate": 0.00028183794086335983, + "loss": 2.9059829711914062, + "step": 6836, + "token_acc": 0.3108463978809253 + }, + { + "epoch": 4.007622398123718, + "grad_norm": 0.23232042205554926, + "learning_rate": 0.0002818310059938151, + "loss": 2.955127239227295, + "step": 6837, + "token_acc": 0.30381741175512367 + }, + { + "epoch": 4.008208736440927, + "grad_norm": 0.25988279385537993, + "learning_rate": 0.00028182406988589565, + "loss": 2.9458322525024414, + "step": 6838, + "token_acc": 0.3072939456493202 + }, + { + "epoch": 4.008795074758136, + "grad_norm": 0.2503989358406708, + "learning_rate": 0.0002818171325396667, + "loss": 2.919551372528076, + "step": 6839, + "token_acc": 0.3103773938262158 + }, + { + "epoch": 4.009381413075345, + "grad_norm": 0.23452859189331524, + "learning_rate": 0.00028181019395519345, + "loss": 2.9691507816314697, + "step": 6840, + "token_acc": 0.30378627465297214 + }, + { + "epoch": 4.009967751392553, + "grad_norm": 0.2888972510298122, + "learning_rate": 0.00028180325413254103, + "loss": 2.9465975761413574, + "step": 6841, + "token_acc": 0.3060956688871564 + }, + { + "epoch": 4.010554089709762, + "grad_norm": 0.24449485094201281, + "learning_rate": 0.00028179631307177457, + "loss": 2.951639413833618, + "step": 6842, + "token_acc": 0.30501685361359027 + }, + { + "epoch": 4.011140428026971, + "grad_norm": 0.25895866770562065, + "learning_rate": 0.00028178937077295944, + "loss": 2.992367744445801, + "step": 6843, + "token_acc": 0.3001132561000352 + }, + { + "epoch": 4.0117267663441805, + "grad_norm": 0.2321962739273638, + "learning_rate": 0.00028178242723616074, + "loss": 2.8931946754455566, + "step": 6844, + "token_acc": 0.31423757579345696 + }, + { + "epoch": 4.01231310466139, + "grad_norm": 0.2343966391566824, + "learning_rate": 0.0002817754824614437, + "loss": 2.9754481315612793, + "step": 6845, + "token_acc": 0.30297965638703184 + }, + { + "epoch": 4.012899442978599, + "grad_norm": 0.23062670460327683, + "learning_rate": 0.00028176853644887355, + "loss": 2.92448091506958, + "step": 6846, + "token_acc": 0.30941318977119786 + }, + { + "epoch": 4.013485781295808, + "grad_norm": 0.24968046097314264, + "learning_rate": 0.00028176158919851566, + "loss": 2.961799144744873, + "step": 6847, + "token_acc": 0.3039912726178385 + }, + { + "epoch": 4.014072119613017, + "grad_norm": 0.24445860737859293, + "learning_rate": 0.00028175464071043506, + "loss": 2.9927375316619873, + "step": 6848, + "token_acc": 0.30192968303406864 + }, + { + "epoch": 4.014658457930226, + "grad_norm": 0.22608646061513976, + "learning_rate": 0.00028174769098469725, + "loss": 2.9448094367980957, + "step": 6849, + "token_acc": 0.30507045187249315 + }, + { + "epoch": 4.015244796247435, + "grad_norm": 0.22150582025533833, + "learning_rate": 0.0002817407400213673, + "loss": 2.9568686485290527, + "step": 6850, + "token_acc": 0.3041502725454569 + }, + { + "epoch": 4.015831134564644, + "grad_norm": 0.23244199406580396, + "learning_rate": 0.00028173378782051075, + "loss": 2.942444086074829, + "step": 6851, + "token_acc": 0.3057414582599507 + }, + { + "epoch": 4.0164174728818525, + "grad_norm": 0.23118338525190385, + "learning_rate": 0.0002817268343821927, + "loss": 2.9376115798950195, + "step": 6852, + "token_acc": 0.3093613222369053 + }, + { + "epoch": 4.017003811199062, + "grad_norm": 0.24938039788287913, + "learning_rate": 0.0002817198797064786, + "loss": 2.980619192123413, + "step": 6853, + "token_acc": 0.30216388995494525 + }, + { + "epoch": 4.017590149516271, + "grad_norm": 0.2098607385288621, + "learning_rate": 0.00028171292379343367, + "loss": 2.9677047729492188, + "step": 6854, + "token_acc": 0.3022785838506163 + }, + { + "epoch": 4.01817648783348, + "grad_norm": 0.23299093960293868, + "learning_rate": 0.00028170596664312333, + "loss": 2.9541516304016113, + "step": 6855, + "token_acc": 0.30500204937415265 + }, + { + "epoch": 4.018762826150689, + "grad_norm": 0.21243887508578263, + "learning_rate": 0.00028169900825561294, + "loss": 2.9389421939849854, + "step": 6856, + "token_acc": 0.30743810483255485 + }, + { + "epoch": 4.019349164467898, + "grad_norm": 0.23343911908070072, + "learning_rate": 0.0002816920486309678, + "loss": 2.966653823852539, + "step": 6857, + "token_acc": 0.30342511207118383 + }, + { + "epoch": 4.019935502785107, + "grad_norm": 0.23591169720119587, + "learning_rate": 0.0002816850877692533, + "loss": 2.9488651752471924, + "step": 6858, + "token_acc": 0.30680579056213575 + }, + { + "epoch": 4.020521841102316, + "grad_norm": 0.21111065589458275, + "learning_rate": 0.0002816781256705349, + "loss": 2.9734513759613037, + "step": 6859, + "token_acc": 0.3011205279898878 + }, + { + "epoch": 4.021108179419525, + "grad_norm": 0.24698705205671348, + "learning_rate": 0.0002816711623348779, + "loss": 2.923494815826416, + "step": 6860, + "token_acc": 0.31093772166264716 + }, + { + "epoch": 4.0216945177367345, + "grad_norm": 0.22884400357792228, + "learning_rate": 0.0002816641977623478, + "loss": 2.952188014984131, + "step": 6861, + "token_acc": 0.30517447895871597 + }, + { + "epoch": 4.022280856053943, + "grad_norm": 0.2261622138764315, + "learning_rate": 0.00028165723195300996, + "loss": 2.9609763622283936, + "step": 6862, + "token_acc": 0.3032411543662107 + }, + { + "epoch": 4.022867194371152, + "grad_norm": 0.24293061996220033, + "learning_rate": 0.0002816502649069298, + "loss": 2.9293220043182373, + "step": 6863, + "token_acc": 0.3087857093209147 + }, + { + "epoch": 4.023453532688361, + "grad_norm": 0.24767818354102958, + "learning_rate": 0.00028164329662417286, + "loss": 2.9248855113983154, + "step": 6864, + "token_acc": 0.3082075706586778 + }, + { + "epoch": 4.02403987100557, + "grad_norm": 0.22793501718735645, + "learning_rate": 0.0002816363271048045, + "loss": 2.9628825187683105, + "step": 6865, + "token_acc": 0.30342474999740393 + }, + { + "epoch": 4.024626209322779, + "grad_norm": 0.2357400941888537, + "learning_rate": 0.00028162935634889027, + "loss": 2.9487009048461914, + "step": 6866, + "token_acc": 0.3064385853574107 + }, + { + "epoch": 4.025212547639988, + "grad_norm": 0.25716743022408983, + "learning_rate": 0.00028162238435649556, + "loss": 2.9794740676879883, + "step": 6867, + "token_acc": 0.3018006855821253 + }, + { + "epoch": 4.025798885957197, + "grad_norm": 0.2541800291026612, + "learning_rate": 0.00028161541112768597, + "loss": 2.9488229751586914, + "step": 6868, + "token_acc": 0.30483346922186066 + }, + { + "epoch": 4.0263852242744065, + "grad_norm": 0.2627002261975489, + "learning_rate": 0.0002816084366625269, + "loss": 2.94063401222229, + "step": 6869, + "token_acc": 0.3051032542072119 + }, + { + "epoch": 4.026971562591616, + "grad_norm": 0.2390431291299368, + "learning_rate": 0.0002816014609610839, + "loss": 2.9164950847625732, + "step": 6870, + "token_acc": 0.3100041887740855 + }, + { + "epoch": 4.027557900908825, + "grad_norm": 0.25149967986428645, + "learning_rate": 0.00028159448402342255, + "loss": 2.9283103942871094, + "step": 6871, + "token_acc": 0.30773154335030145 + }, + { + "epoch": 4.028144239226034, + "grad_norm": 0.26899724204387904, + "learning_rate": 0.0002815875058496084, + "loss": 2.950249671936035, + "step": 6872, + "token_acc": 0.30503656105176574 + }, + { + "epoch": 4.028730577543242, + "grad_norm": 0.2514777683898177, + "learning_rate": 0.00028158052643970685, + "loss": 2.941936731338501, + "step": 6873, + "token_acc": 0.30690293628461146 + }, + { + "epoch": 4.029316915860451, + "grad_norm": 0.24725644212320996, + "learning_rate": 0.0002815735457937836, + "loss": 2.9809818267822266, + "step": 6874, + "token_acc": 0.3009461825242916 + }, + { + "epoch": 4.02990325417766, + "grad_norm": 0.2557611248726055, + "learning_rate": 0.00028156656391190417, + "loss": 2.931403875350952, + "step": 6875, + "token_acc": 0.3082515920363773 + }, + { + "epoch": 4.030489592494869, + "grad_norm": 0.26653458449535766, + "learning_rate": 0.00028155958079413416, + "loss": 2.978273868560791, + "step": 6876, + "token_acc": 0.3020928378148734 + }, + { + "epoch": 4.0310759308120785, + "grad_norm": 0.25368195208694644, + "learning_rate": 0.0002815525964405392, + "loss": 2.9217934608459473, + "step": 6877, + "token_acc": 0.308928412348228 + }, + { + "epoch": 4.031662269129288, + "grad_norm": 0.24439218556045106, + "learning_rate": 0.0002815456108511848, + "loss": 2.9629507064819336, + "step": 6878, + "token_acc": 0.305746858703107 + }, + { + "epoch": 4.032248607446497, + "grad_norm": 0.25185516113403245, + "learning_rate": 0.0002815386240261367, + "loss": 2.9473233222961426, + "step": 6879, + "token_acc": 0.3063880994578269 + }, + { + "epoch": 4.032834945763706, + "grad_norm": 0.2421124145847601, + "learning_rate": 0.00028153163596546054, + "loss": 2.9624361991882324, + "step": 6880, + "token_acc": 0.30239046730822333 + }, + { + "epoch": 4.033421284080915, + "grad_norm": 0.2539310943188892, + "learning_rate": 0.00028152464666922176, + "loss": 2.9481663703918457, + "step": 6881, + "token_acc": 0.30659783596704393 + }, + { + "epoch": 4.034007622398124, + "grad_norm": 0.2184570666864419, + "learning_rate": 0.00028151765613748626, + "loss": 2.9816980361938477, + "step": 6882, + "token_acc": 0.3012252797935473 + }, + { + "epoch": 4.034593960715333, + "grad_norm": 0.2389289269437876, + "learning_rate": 0.00028151066437031956, + "loss": 2.928229331970215, + "step": 6883, + "token_acc": 0.3085266827399777 + }, + { + "epoch": 4.035180299032541, + "grad_norm": 0.25975013857731755, + "learning_rate": 0.0002815036713677874, + "loss": 2.9651870727539062, + "step": 6884, + "token_acc": 0.3028297357021018 + }, + { + "epoch": 4.0357666373497505, + "grad_norm": 0.2542800726881421, + "learning_rate": 0.0002814966771299554, + "loss": 2.899287700653076, + "step": 6885, + "token_acc": 0.3132434702186693 + }, + { + "epoch": 4.03635297566696, + "grad_norm": 0.2466108736272474, + "learning_rate": 0.00028148968165688936, + "loss": 2.9474711418151855, + "step": 6886, + "token_acc": 0.3069161423481007 + }, + { + "epoch": 4.036939313984169, + "grad_norm": 0.26595522224939017, + "learning_rate": 0.000281482684948655, + "loss": 2.972527503967285, + "step": 6887, + "token_acc": 0.30060869497154813 + }, + { + "epoch": 4.037525652301378, + "grad_norm": 0.23235069213361323, + "learning_rate": 0.00028147568700531787, + "loss": 2.9345703125, + "step": 6888, + "token_acc": 0.3095714097322959 + }, + { + "epoch": 4.038111990618587, + "grad_norm": 0.253754163181534, + "learning_rate": 0.0002814686878269439, + "loss": 2.955718994140625, + "step": 6889, + "token_acc": 0.3040995795303046 + }, + { + "epoch": 4.038698328935796, + "grad_norm": 0.22127013980095941, + "learning_rate": 0.00028146168741359875, + "loss": 2.961019992828369, + "step": 6890, + "token_acc": 0.30375535193712744 + }, + { + "epoch": 4.039284667253005, + "grad_norm": 0.22898829945286564, + "learning_rate": 0.0002814546857653482, + "loss": 2.9535651206970215, + "step": 6891, + "token_acc": 0.3059546345532539 + }, + { + "epoch": 4.039871005570214, + "grad_norm": 0.22509580601947154, + "learning_rate": 0.000281447682882258, + "loss": 2.9435219764709473, + "step": 6892, + "token_acc": 0.3054716339114475 + }, + { + "epoch": 4.040457343887423, + "grad_norm": 0.22913555838447983, + "learning_rate": 0.00028144067876439396, + "loss": 2.956338405609131, + "step": 6893, + "token_acc": 0.3050677177412517 + }, + { + "epoch": 4.0410436822046325, + "grad_norm": 0.212019453446366, + "learning_rate": 0.0002814336734118218, + "loss": 2.951082706451416, + "step": 6894, + "token_acc": 0.3045080595077145 + }, + { + "epoch": 4.041630020521841, + "grad_norm": 0.2316710447880236, + "learning_rate": 0.0002814266668246075, + "loss": 2.9304442405700684, + "step": 6895, + "token_acc": 0.3088595771403543 + }, + { + "epoch": 4.04221635883905, + "grad_norm": 0.23815131880844087, + "learning_rate": 0.00028141965900281666, + "loss": 2.982321262359619, + "step": 6896, + "token_acc": 0.30016633824191674 + }, + { + "epoch": 4.042802697156259, + "grad_norm": 0.231105456707983, + "learning_rate": 0.0002814126499465153, + "loss": 2.967602252960205, + "step": 6897, + "token_acc": 0.3022644214266966 + }, + { + "epoch": 4.043389035473468, + "grad_norm": 0.23221661805069357, + "learning_rate": 0.00028140563965576914, + "loss": 2.9378325939178467, + "step": 6898, + "token_acc": 0.3071820897440015 + }, + { + "epoch": 4.043975373790677, + "grad_norm": 0.23631529893571196, + "learning_rate": 0.00028139862813064405, + "loss": 2.9074361324310303, + "step": 6899, + "token_acc": 0.31095913735835057 + }, + { + "epoch": 4.044561712107886, + "grad_norm": 0.21895844765962938, + "learning_rate": 0.0002813916153712059, + "loss": 2.9249377250671387, + "step": 6900, + "token_acc": 0.30870772655208256 + }, + { + "epoch": 4.045148050425095, + "grad_norm": 0.2308639660276988, + "learning_rate": 0.0002813846013775206, + "loss": 2.985396385192871, + "step": 6901, + "token_acc": 0.30179490014584553 + }, + { + "epoch": 4.0457343887423045, + "grad_norm": 0.240050473848098, + "learning_rate": 0.000281377586149654, + "loss": 2.9237637519836426, + "step": 6902, + "token_acc": 0.309829310716753 + }, + { + "epoch": 4.046320727059514, + "grad_norm": 0.23318000128119437, + "learning_rate": 0.00028137056968767206, + "loss": 2.941197395324707, + "step": 6903, + "token_acc": 0.30751803910218667 + }, + { + "epoch": 4.046907065376723, + "grad_norm": 0.22853206903610426, + "learning_rate": 0.0002813635519916406, + "loss": 2.9341156482696533, + "step": 6904, + "token_acc": 0.3063069499695076 + }, + { + "epoch": 4.047493403693931, + "grad_norm": 0.23423981529941945, + "learning_rate": 0.00028135653306162557, + "loss": 2.920598030090332, + "step": 6905, + "token_acc": 0.30918320258421783 + }, + { + "epoch": 4.04807974201114, + "grad_norm": 0.23447817882395006, + "learning_rate": 0.0002813495128976929, + "loss": 2.944331169128418, + "step": 6906, + "token_acc": 0.30536422193909923 + }, + { + "epoch": 4.048666080328349, + "grad_norm": 0.2368179296052367, + "learning_rate": 0.00028134249149990866, + "loss": 2.968672275543213, + "step": 6907, + "token_acc": 0.30193295179202095 + }, + { + "epoch": 4.049252418645558, + "grad_norm": 0.2220827766640887, + "learning_rate": 0.00028133546886833865, + "loss": 2.9314537048339844, + "step": 6908, + "token_acc": 0.3081834332667111 + }, + { + "epoch": 4.049838756962767, + "grad_norm": 0.22807748679717127, + "learning_rate": 0.00028132844500304886, + "loss": 2.9324493408203125, + "step": 6909, + "token_acc": 0.3074675445746035 + }, + { + "epoch": 4.0504250952799765, + "grad_norm": 0.23036939759035227, + "learning_rate": 0.00028132141990410526, + "loss": 2.939117670059204, + "step": 6910, + "token_acc": 0.3072159403772522 + }, + { + "epoch": 4.051011433597186, + "grad_norm": 0.21525497370072308, + "learning_rate": 0.00028131439357157394, + "loss": 2.9474310874938965, + "step": 6911, + "token_acc": 0.30482529960390276 + }, + { + "epoch": 4.051597771914395, + "grad_norm": 0.23171626008961732, + "learning_rate": 0.0002813073660055208, + "loss": 2.976034164428711, + "step": 6912, + "token_acc": 0.30085298279487416 + }, + { + "epoch": 4.052184110231604, + "grad_norm": 0.24059671071013447, + "learning_rate": 0.0002813003372060119, + "loss": 2.9680233001708984, + "step": 6913, + "token_acc": 0.3046745453109783 + }, + { + "epoch": 4.052770448548813, + "grad_norm": 0.250940495116202, + "learning_rate": 0.0002812933071731133, + "loss": 2.9491381645202637, + "step": 6914, + "token_acc": 0.30599131025595383 + }, + { + "epoch": 4.053356786866022, + "grad_norm": 0.2633041123693953, + "learning_rate": 0.000281286275906891, + "loss": 2.9330320358276367, + "step": 6915, + "token_acc": 0.3073229923693056 + }, + { + "epoch": 4.05394312518323, + "grad_norm": 0.2500502796933771, + "learning_rate": 0.000281279243407411, + "loss": 2.8771369457244873, + "step": 6916, + "token_acc": 0.3159533853018131 + }, + { + "epoch": 4.054529463500439, + "grad_norm": 0.22923756716949134, + "learning_rate": 0.00028127220967473943, + "loss": 2.9559926986694336, + "step": 6917, + "token_acc": 0.3051138087231931 + }, + { + "epoch": 4.0551158018176485, + "grad_norm": 0.24755360767433782, + "learning_rate": 0.0002812651747089423, + "loss": 2.9593422412872314, + "step": 6918, + "token_acc": 0.30339995899199357 + }, + { + "epoch": 4.055702140134858, + "grad_norm": 0.2428586202005314, + "learning_rate": 0.00028125813851008583, + "loss": 2.9512016773223877, + "step": 6919, + "token_acc": 0.3047375361456364 + }, + { + "epoch": 4.056288478452067, + "grad_norm": 0.22353498608021963, + "learning_rate": 0.00028125110107823594, + "loss": 2.9419007301330566, + "step": 6920, + "token_acc": 0.3068314504117864 + }, + { + "epoch": 4.056874816769276, + "grad_norm": 0.22876681894709675, + "learning_rate": 0.0002812440624134589, + "loss": 2.936403751373291, + "step": 6921, + "token_acc": 0.3079590766137681 + }, + { + "epoch": 4.057461155086485, + "grad_norm": 0.2113837598970633, + "learning_rate": 0.0002812370225158207, + "loss": 2.967212677001953, + "step": 6922, + "token_acc": 0.3027097238833803 + }, + { + "epoch": 4.058047493403694, + "grad_norm": 0.256385156589934, + "learning_rate": 0.0002812299813853875, + "loss": 2.9763388633728027, + "step": 6923, + "token_acc": 0.3010738055133314 + }, + { + "epoch": 4.058633831720903, + "grad_norm": 0.25883922401192666, + "learning_rate": 0.00028122293902222545, + "loss": 2.988863945007324, + "step": 6924, + "token_acc": 0.2997903992638413 + }, + { + "epoch": 4.059220170038112, + "grad_norm": 0.22083192845593885, + "learning_rate": 0.00028121589542640075, + "loss": 2.990067481994629, + "step": 6925, + "token_acc": 0.2986797881207819 + }, + { + "epoch": 4.059806508355321, + "grad_norm": 0.24633861199790094, + "learning_rate": 0.0002812088505979795, + "loss": 2.9325003623962402, + "step": 6926, + "token_acc": 0.3082074919147419 + }, + { + "epoch": 4.06039284667253, + "grad_norm": 0.21730724404120141, + "learning_rate": 0.0002812018045370279, + "loss": 2.909435749053955, + "step": 6927, + "token_acc": 0.31196311976466695 + }, + { + "epoch": 4.060979184989739, + "grad_norm": 0.23327928903727468, + "learning_rate": 0.0002811947572436122, + "loss": 2.9470596313476562, + "step": 6928, + "token_acc": 0.3061533169433737 + }, + { + "epoch": 4.061565523306948, + "grad_norm": 0.23329139830763307, + "learning_rate": 0.0002811877087177985, + "loss": 2.976271390914917, + "step": 6929, + "token_acc": 0.3022678171431601 + }, + { + "epoch": 4.062151861624157, + "grad_norm": 0.2579642625941111, + "learning_rate": 0.0002811806589596531, + "loss": 2.902055263519287, + "step": 6930, + "token_acc": 0.31295412474342343 + }, + { + "epoch": 4.062738199941366, + "grad_norm": 0.24602644035909682, + "learning_rate": 0.0002811736079692421, + "loss": 2.964625358581543, + "step": 6931, + "token_acc": 0.3030681436723733 + }, + { + "epoch": 4.063324538258575, + "grad_norm": 0.22567880039766103, + "learning_rate": 0.00028116655574663183, + "loss": 2.941694498062134, + "step": 6932, + "token_acc": 0.30744580200819965 + }, + { + "epoch": 4.063910876575784, + "grad_norm": 0.2576117346148285, + "learning_rate": 0.00028115950229188854, + "loss": 2.9105496406555176, + "step": 6933, + "token_acc": 0.3105480809642655 + }, + { + "epoch": 4.064497214892993, + "grad_norm": 0.23742791168201235, + "learning_rate": 0.00028115244760507844, + "loss": 2.997262954711914, + "step": 6934, + "token_acc": 0.29682367233510293 + }, + { + "epoch": 4.0650835532102025, + "grad_norm": 0.25004121322333744, + "learning_rate": 0.0002811453916862679, + "loss": 2.9160399436950684, + "step": 6935, + "token_acc": 0.3093443810118452 + }, + { + "epoch": 4.065669891527412, + "grad_norm": 0.24799324978322812, + "learning_rate": 0.00028113833453552304, + "loss": 2.944429397583008, + "step": 6936, + "token_acc": 0.30514306410720754 + }, + { + "epoch": 4.066256229844621, + "grad_norm": 0.26184406268904226, + "learning_rate": 0.0002811312761529103, + "loss": 2.9519710540771484, + "step": 6937, + "token_acc": 0.30582702530219036 + }, + { + "epoch": 4.066842568161829, + "grad_norm": 0.25173615768490015, + "learning_rate": 0.0002811242165384959, + "loss": 2.958865165710449, + "step": 6938, + "token_acc": 0.3057311147259032 + }, + { + "epoch": 4.067428906479038, + "grad_norm": 0.2132416606100034, + "learning_rate": 0.00028111715569234617, + "loss": 2.9657392501831055, + "step": 6939, + "token_acc": 0.30266488774828126 + }, + { + "epoch": 4.068015244796247, + "grad_norm": 0.2517744518025153, + "learning_rate": 0.0002811100936145274, + "loss": 2.9701757431030273, + "step": 6940, + "token_acc": 0.303782709852822 + }, + { + "epoch": 4.068601583113456, + "grad_norm": 0.22932701214736192, + "learning_rate": 0.000281103030305106, + "loss": 2.9627833366394043, + "step": 6941, + "token_acc": 0.30162417304690403 + }, + { + "epoch": 4.069187921430665, + "grad_norm": 0.2843282222511165, + "learning_rate": 0.00028109596576414837, + "loss": 2.9619956016540527, + "step": 6942, + "token_acc": 0.30449611743657773 + }, + { + "epoch": 4.0697742597478745, + "grad_norm": 0.2569670015757998, + "learning_rate": 0.0002810888999917207, + "loss": 2.9476637840270996, + "step": 6943, + "token_acc": 0.3055021063271914 + }, + { + "epoch": 4.070360598065084, + "grad_norm": 0.2772409250860447, + "learning_rate": 0.0002810818329878895, + "loss": 2.914224624633789, + "step": 6944, + "token_acc": 0.3102136493474775 + }, + { + "epoch": 4.070946936382293, + "grad_norm": 0.23562750983034342, + "learning_rate": 0.00028107476475272114, + "loss": 2.9132649898529053, + "step": 6945, + "token_acc": 0.3117679153636303 + }, + { + "epoch": 4.071533274699502, + "grad_norm": 0.23420760795161652, + "learning_rate": 0.00028106769528628197, + "loss": 2.9094314575195312, + "step": 6946, + "token_acc": 0.3108844700207016 + }, + { + "epoch": 4.072119613016711, + "grad_norm": 0.22399538568737767, + "learning_rate": 0.00028106062458863843, + "loss": 2.9470975399017334, + "step": 6947, + "token_acc": 0.30716677341581655 + }, + { + "epoch": 4.07270595133392, + "grad_norm": 0.22794376746347073, + "learning_rate": 0.0002810535526598569, + "loss": 2.9407453536987305, + "step": 6948, + "token_acc": 0.3066781800066423 + }, + { + "epoch": 4.073292289651128, + "grad_norm": 0.23367575859109102, + "learning_rate": 0.00028104647950000385, + "loss": 2.9658050537109375, + "step": 6949, + "token_acc": 0.3040869502856642 + }, + { + "epoch": 4.073878627968337, + "grad_norm": 0.21948915807040575, + "learning_rate": 0.0002810394051091457, + "loss": 2.9171390533447266, + "step": 6950, + "token_acc": 0.30975133210185546 + }, + { + "epoch": 4.0744649662855466, + "grad_norm": 0.24601820474275396, + "learning_rate": 0.00028103232948734893, + "loss": 2.965160846710205, + "step": 6951, + "token_acc": 0.3036856325909604 + }, + { + "epoch": 4.075051304602756, + "grad_norm": 0.24710536972874503, + "learning_rate": 0.00028102525263467995, + "loss": 2.94419527053833, + "step": 6952, + "token_acc": 0.3058493831176623 + }, + { + "epoch": 4.075637642919965, + "grad_norm": 0.2248494191552353, + "learning_rate": 0.00028101817455120537, + "loss": 2.988813877105713, + "step": 6953, + "token_acc": 0.30025053397818297 + }, + { + "epoch": 4.076223981237174, + "grad_norm": 0.24983539539000194, + "learning_rate": 0.0002810110952369915, + "loss": 2.9237418174743652, + "step": 6954, + "token_acc": 0.30906816925142844 + }, + { + "epoch": 4.076810319554383, + "grad_norm": 0.24027861714622115, + "learning_rate": 0.000281004014692105, + "loss": 2.930018901824951, + "step": 6955, + "token_acc": 0.3077422513075516 + }, + { + "epoch": 4.077396657871592, + "grad_norm": 0.23440704285176886, + "learning_rate": 0.0002809969329166123, + "loss": 2.9123759269714355, + "step": 6956, + "token_acc": 0.3108210079485067 + }, + { + "epoch": 4.077982996188801, + "grad_norm": 0.2574909275167172, + "learning_rate": 0.0002809898499105799, + "loss": 2.959304094314575, + "step": 6957, + "token_acc": 0.30463539714942334 + }, + { + "epoch": 4.07856933450601, + "grad_norm": 0.25477023014933414, + "learning_rate": 0.00028098276567407437, + "loss": 2.9802112579345703, + "step": 6958, + "token_acc": 0.3019641772455579 + }, + { + "epoch": 4.0791556728232194, + "grad_norm": 0.23018660726368353, + "learning_rate": 0.0002809756802071623, + "loss": 2.9313008785247803, + "step": 6959, + "token_acc": 0.30868448351239 + }, + { + "epoch": 4.079742011140428, + "grad_norm": 0.27146401439459206, + "learning_rate": 0.0002809685935099102, + "loss": 2.9311976432800293, + "step": 6960, + "token_acc": 0.308977992345828 + }, + { + "epoch": 4.080328349457637, + "grad_norm": 0.22274603415499342, + "learning_rate": 0.0002809615055823846, + "loss": 2.9252712726593018, + "step": 6961, + "token_acc": 0.30986631983595464 + }, + { + "epoch": 4.080914687774846, + "grad_norm": 0.23408572265134073, + "learning_rate": 0.0002809544164246522, + "loss": 2.955280065536499, + "step": 6962, + "token_acc": 0.3052462650317878 + }, + { + "epoch": 4.081501026092055, + "grad_norm": 0.2373170483958627, + "learning_rate": 0.0002809473260367794, + "loss": 2.941588878631592, + "step": 6963, + "token_acc": 0.30850422039919767 + }, + { + "epoch": 4.082087364409264, + "grad_norm": 0.25316730489776573, + "learning_rate": 0.000280940234418833, + "loss": 2.9353485107421875, + "step": 6964, + "token_acc": 0.3075964268402465 + }, + { + "epoch": 4.082673702726473, + "grad_norm": 0.23253274045248168, + "learning_rate": 0.00028093314157087956, + "loss": 2.950018882751465, + "step": 6965, + "token_acc": 0.3055967276479184 + }, + { + "epoch": 4.083260041043682, + "grad_norm": 0.2440045285991042, + "learning_rate": 0.00028092604749298575, + "loss": 2.911876678466797, + "step": 6966, + "token_acc": 0.3110312311822726 + }, + { + "epoch": 4.0838463793608915, + "grad_norm": 0.2618890281012526, + "learning_rate": 0.00028091895218521805, + "loss": 2.9440929889678955, + "step": 6967, + "token_acc": 0.30677873817464096 + }, + { + "epoch": 4.084432717678101, + "grad_norm": 0.23161350121640223, + "learning_rate": 0.00028091185564764324, + "loss": 2.9329781532287598, + "step": 6968, + "token_acc": 0.3074822359991199 + }, + { + "epoch": 4.08501905599531, + "grad_norm": 0.2655355416123265, + "learning_rate": 0.00028090475788032795, + "loss": 2.942594051361084, + "step": 6969, + "token_acc": 0.30590818084712373 + }, + { + "epoch": 4.085605394312518, + "grad_norm": 0.2214483964503748, + "learning_rate": 0.0002808976588833388, + "loss": 2.976132392883301, + "step": 6970, + "token_acc": 0.3022902464471299 + }, + { + "epoch": 4.086191732629727, + "grad_norm": 0.24649806550535966, + "learning_rate": 0.0002808905586567426, + "loss": 2.9484410285949707, + "step": 6971, + "token_acc": 0.3043380722577279 + }, + { + "epoch": 4.086778070946936, + "grad_norm": 0.21342176360141749, + "learning_rate": 0.000280883457200606, + "loss": 2.9396610260009766, + "step": 6972, + "token_acc": 0.30588673330434535 + }, + { + "epoch": 4.087364409264145, + "grad_norm": 0.24467962114483774, + "learning_rate": 0.0002808763545149956, + "loss": 2.9299819469451904, + "step": 6973, + "token_acc": 0.3079387967635985 + }, + { + "epoch": 4.087950747581354, + "grad_norm": 0.24134583800497117, + "learning_rate": 0.00028086925059997827, + "loss": 2.9529669284820557, + "step": 6974, + "token_acc": 0.3059884818116744 + }, + { + "epoch": 4.0885370858985635, + "grad_norm": 0.2197654080326428, + "learning_rate": 0.0002808621454556207, + "loss": 2.955392837524414, + "step": 6975, + "token_acc": 0.3049167893980836 + }, + { + "epoch": 4.089123424215773, + "grad_norm": 0.2308425818519057, + "learning_rate": 0.00028085503908198954, + "loss": 2.9469759464263916, + "step": 6976, + "token_acc": 0.30431890381397103 + }, + { + "epoch": 4.089709762532982, + "grad_norm": 0.23909669167255249, + "learning_rate": 0.00028084793147915165, + "loss": 2.918078660964966, + "step": 6977, + "token_acc": 0.309660649068767 + }, + { + "epoch": 4.090296100850191, + "grad_norm": 0.2311736844574702, + "learning_rate": 0.0002808408226471738, + "loss": 2.915369987487793, + "step": 6978, + "token_acc": 0.3104114556793403 + }, + { + "epoch": 4.0908824391674, + "grad_norm": 0.2394403381467421, + "learning_rate": 0.0002808337125861227, + "loss": 2.99251651763916, + "step": 6979, + "token_acc": 0.29774545904951627 + }, + { + "epoch": 4.091468777484609, + "grad_norm": 0.22923229086532657, + "learning_rate": 0.00028082660129606516, + "loss": 2.9468069076538086, + "step": 6980, + "token_acc": 0.30680760002593865 + }, + { + "epoch": 4.092055115801817, + "grad_norm": 0.22222499039945026, + "learning_rate": 0.00028081948877706805, + "loss": 2.968657970428467, + "step": 6981, + "token_acc": 0.3029411687046342 + }, + { + "epoch": 4.092641454119026, + "grad_norm": 0.21198660468617503, + "learning_rate": 0.0002808123750291981, + "loss": 2.932565689086914, + "step": 6982, + "token_acc": 0.308610524208208 + }, + { + "epoch": 4.0932277924362355, + "grad_norm": 0.21607844782255936, + "learning_rate": 0.0002808052600525221, + "loss": 2.9474844932556152, + "step": 6983, + "token_acc": 0.3069443300243028 + }, + { + "epoch": 4.093814130753445, + "grad_norm": 0.22468553144386327, + "learning_rate": 0.000280798143847107, + "loss": 2.9423489570617676, + "step": 6984, + "token_acc": 0.3075592949804654 + }, + { + "epoch": 4.094400469070654, + "grad_norm": 0.23463957823947734, + "learning_rate": 0.0002807910264130195, + "loss": 2.948122501373291, + "step": 6985, + "token_acc": 0.305525175138996 + }, + { + "epoch": 4.094986807387863, + "grad_norm": 0.2235503887760045, + "learning_rate": 0.0002807839077503267, + "loss": 2.94474458694458, + "step": 6986, + "token_acc": 0.30710059326310396 + }, + { + "epoch": 4.095573145705072, + "grad_norm": 0.2416409927231637, + "learning_rate": 0.0002807767878590952, + "loss": 2.901216745376587, + "step": 6987, + "token_acc": 0.3115882783252291 + }, + { + "epoch": 4.096159484022281, + "grad_norm": 0.23121359914777234, + "learning_rate": 0.00028076966673939204, + "loss": 2.9537413120269775, + "step": 6988, + "token_acc": 0.304296623103717 + }, + { + "epoch": 4.09674582233949, + "grad_norm": 0.24593833094922632, + "learning_rate": 0.0002807625443912841, + "loss": 2.9365625381469727, + "step": 6989, + "token_acc": 0.30787411336049847 + }, + { + "epoch": 4.097332160656699, + "grad_norm": 0.23073527671601043, + "learning_rate": 0.00028075542081483826, + "loss": 2.958406925201416, + "step": 6990, + "token_acc": 0.3029237544785389 + }, + { + "epoch": 4.097918498973908, + "grad_norm": 0.22255770005073805, + "learning_rate": 0.00028074829601012135, + "loss": 2.9231908321380615, + "step": 6991, + "token_acc": 0.3090659690234182 + }, + { + "epoch": 4.098504837291117, + "grad_norm": 0.22661711114172467, + "learning_rate": 0.0002807411699772005, + "loss": 2.9605636596679688, + "step": 6992, + "token_acc": 0.30387789497005135 + }, + { + "epoch": 4.099091175608326, + "grad_norm": 0.2349355074050412, + "learning_rate": 0.00028073404271614246, + "loss": 2.9717674255371094, + "step": 6993, + "token_acc": 0.30238029871119537 + }, + { + "epoch": 4.099677513925535, + "grad_norm": 0.2517513559080339, + "learning_rate": 0.0002807269142270143, + "loss": 2.959568738937378, + "step": 6994, + "token_acc": 0.30339439627044495 + }, + { + "epoch": 4.100263852242744, + "grad_norm": 0.24567950484387097, + "learning_rate": 0.0002807197845098829, + "loss": 2.954444169998169, + "step": 6995, + "token_acc": 0.3045302390218256 + }, + { + "epoch": 4.100850190559953, + "grad_norm": 0.21968298114852824, + "learning_rate": 0.0002807126535648153, + "loss": 2.9386672973632812, + "step": 6996, + "token_acc": 0.30573486249073 + }, + { + "epoch": 4.101436528877162, + "grad_norm": 0.22531659145905045, + "learning_rate": 0.0002807055213918785, + "loss": 2.9417734146118164, + "step": 6997, + "token_acc": 0.3070468150625838 + }, + { + "epoch": 4.102022867194371, + "grad_norm": 0.2394505185396973, + "learning_rate": 0.0002806983879911394, + "loss": 2.9634926319122314, + "step": 6998, + "token_acc": 0.30283758377855907 + }, + { + "epoch": 4.10260920551158, + "grad_norm": 0.2584810154417215, + "learning_rate": 0.000280691253362665, + "loss": 2.9657249450683594, + "step": 6999, + "token_acc": 0.3026017205419656 + }, + { + "epoch": 4.1031955438287895, + "grad_norm": 0.236165942874937, + "learning_rate": 0.0002806841175065225, + "loss": 2.9570493698120117, + "step": 7000, + "token_acc": 0.304639390460286 + }, + { + "epoch": 4.103781882145999, + "grad_norm": 0.24832873786933202, + "learning_rate": 0.0002806769804227787, + "loss": 2.969536304473877, + "step": 7001, + "token_acc": 0.3036872356287593 + }, + { + "epoch": 4.104368220463208, + "grad_norm": 0.2526444661160461, + "learning_rate": 0.00028066984211150086, + "loss": 2.958831548690796, + "step": 7002, + "token_acc": 0.304220326153561 + }, + { + "epoch": 4.104954558780416, + "grad_norm": 0.21207038322400587, + "learning_rate": 0.0002806627025727559, + "loss": 2.9533753395080566, + "step": 7003, + "token_acc": 0.30400468585105256 + }, + { + "epoch": 4.105540897097625, + "grad_norm": 0.26832401180344534, + "learning_rate": 0.00028065556180661093, + "loss": 2.9689018726348877, + "step": 7004, + "token_acc": 0.30206641220613295 + }, + { + "epoch": 4.106127235414834, + "grad_norm": 0.24374313216129817, + "learning_rate": 0.000280648419813133, + "loss": 2.997386932373047, + "step": 7005, + "token_acc": 0.3012756427156285 + }, + { + "epoch": 4.106713573732043, + "grad_norm": 0.24559914525244286, + "learning_rate": 0.00028064127659238917, + "loss": 2.938683032989502, + "step": 7006, + "token_acc": 0.30888912957510206 + }, + { + "epoch": 4.107299912049252, + "grad_norm": 0.2677457960943747, + "learning_rate": 0.0002806341321444467, + "loss": 2.9458703994750977, + "step": 7007, + "token_acc": 0.3058554483180654 + }, + { + "epoch": 4.1078862503664615, + "grad_norm": 0.2518637497901332, + "learning_rate": 0.00028062698646937246, + "loss": 2.9873874187469482, + "step": 7008, + "token_acc": 0.30060431921879555 + }, + { + "epoch": 4.108472588683671, + "grad_norm": 0.23117696542780328, + "learning_rate": 0.0002806198395672338, + "loss": 2.9272685050964355, + "step": 7009, + "token_acc": 0.30968000658084155 + }, + { + "epoch": 4.10905892700088, + "grad_norm": 0.2676533556202926, + "learning_rate": 0.0002806126914380977, + "loss": 2.9826345443725586, + "step": 7010, + "token_acc": 0.3010104373439201 + }, + { + "epoch": 4.109645265318089, + "grad_norm": 0.22230772657588319, + "learning_rate": 0.0002806055420820314, + "loss": 2.9499261379241943, + "step": 7011, + "token_acc": 0.30689628661489965 + }, + { + "epoch": 4.110231603635298, + "grad_norm": 0.24191875398136786, + "learning_rate": 0.00028059839149910203, + "loss": 2.9512939453125, + "step": 7012, + "token_acc": 0.3051099679176901 + }, + { + "epoch": 4.110817941952506, + "grad_norm": 0.22337166676077516, + "learning_rate": 0.00028059123968937676, + "loss": 2.9200072288513184, + "step": 7013, + "token_acc": 0.3106512366202621 + }, + { + "epoch": 4.111404280269715, + "grad_norm": 0.23946198837486707, + "learning_rate": 0.00028058408665292275, + "loss": 2.991720676422119, + "step": 7014, + "token_acc": 0.30046864330242085 + }, + { + "epoch": 4.111990618586924, + "grad_norm": 0.24714873620394268, + "learning_rate": 0.0002805769323898072, + "loss": 2.974813461303711, + "step": 7015, + "token_acc": 0.3014735078625161 + }, + { + "epoch": 4.1125769569041335, + "grad_norm": 0.2203441639150923, + "learning_rate": 0.00028056977690009736, + "loss": 2.9447214603424072, + "step": 7016, + "token_acc": 0.30750746414888075 + }, + { + "epoch": 4.113163295221343, + "grad_norm": 0.23990041870033801, + "learning_rate": 0.0002805626201838604, + "loss": 2.9701313972473145, + "step": 7017, + "token_acc": 0.30236100334831306 + }, + { + "epoch": 4.113749633538552, + "grad_norm": 0.21635971070710705, + "learning_rate": 0.0002805554622411635, + "loss": 2.9463934898376465, + "step": 7018, + "token_acc": 0.30310996399324475 + }, + { + "epoch": 4.114335971855761, + "grad_norm": 0.22763762534898171, + "learning_rate": 0.00028054830307207404, + "loss": 2.9523544311523438, + "step": 7019, + "token_acc": 0.3061413722070902 + }, + { + "epoch": 4.11492231017297, + "grad_norm": 0.22657027485409625, + "learning_rate": 0.00028054114267665915, + "loss": 2.92579984664917, + "step": 7020, + "token_acc": 0.30727782984403357 + }, + { + "epoch": 4.115508648490179, + "grad_norm": 0.23159538530595852, + "learning_rate": 0.00028053398105498613, + "loss": 2.9265389442443848, + "step": 7021, + "token_acc": 0.31014779888842975 + }, + { + "epoch": 4.116094986807388, + "grad_norm": 0.23168884475736576, + "learning_rate": 0.0002805268182071223, + "loss": 2.947672128677368, + "step": 7022, + "token_acc": 0.30478707034690605 + }, + { + "epoch": 4.116681325124597, + "grad_norm": 0.24236124560948394, + "learning_rate": 0.00028051965413313483, + "loss": 2.9204933643341064, + "step": 7023, + "token_acc": 0.31057923806826226 + }, + { + "epoch": 4.1172676634418055, + "grad_norm": 0.22627282958571734, + "learning_rate": 0.00028051248883309115, + "loss": 2.969742774963379, + "step": 7024, + "token_acc": 0.30365686589934926 + }, + { + "epoch": 4.117854001759015, + "grad_norm": 0.267871241652411, + "learning_rate": 0.00028050532230705844, + "loss": 2.881059169769287, + "step": 7025, + "token_acc": 0.3169502706225511 + }, + { + "epoch": 4.118440340076224, + "grad_norm": 0.25489030902860266, + "learning_rate": 0.00028049815455510413, + "loss": 2.9760515689849854, + "step": 7026, + "token_acc": 0.2999971148070724 + }, + { + "epoch": 4.119026678393433, + "grad_norm": 0.27679805834243426, + "learning_rate": 0.0002804909855772955, + "loss": 2.978717803955078, + "step": 7027, + "token_acc": 0.3010443553190644 + }, + { + "epoch": 4.119613016710642, + "grad_norm": 0.23507745164652938, + "learning_rate": 0.0002804838153736999, + "loss": 2.95672869682312, + "step": 7028, + "token_acc": 0.30455859951600767 + }, + { + "epoch": 4.120199355027851, + "grad_norm": 0.22614213004090142, + "learning_rate": 0.0002804766439443847, + "loss": 2.9913949966430664, + "step": 7029, + "token_acc": 0.2988339017226569 + }, + { + "epoch": 4.12078569334506, + "grad_norm": 0.2479979734389295, + "learning_rate": 0.0002804694712894172, + "loss": 2.9279022216796875, + "step": 7030, + "token_acc": 0.308490558730044 + }, + { + "epoch": 4.121372031662269, + "grad_norm": 0.21831398989775666, + "learning_rate": 0.00028046229740886483, + "loss": 2.9356937408447266, + "step": 7031, + "token_acc": 0.3068591017774321 + }, + { + "epoch": 4.121958369979478, + "grad_norm": 0.25508115668536735, + "learning_rate": 0.00028045512230279505, + "loss": 2.92872953414917, + "step": 7032, + "token_acc": 0.30877569726332327 + }, + { + "epoch": 4.1225447082966875, + "grad_norm": 0.23704200293915115, + "learning_rate": 0.0002804479459712751, + "loss": 2.9174094200134277, + "step": 7033, + "token_acc": 0.31116756191779493 + }, + { + "epoch": 4.123131046613897, + "grad_norm": 0.2605022210442283, + "learning_rate": 0.0002804407684143725, + "loss": 2.9697089195251465, + "step": 7034, + "token_acc": 0.3022893052643762 + }, + { + "epoch": 4.123717384931105, + "grad_norm": 0.22742753369859667, + "learning_rate": 0.0002804335896321547, + "loss": 2.9532570838928223, + "step": 7035, + "token_acc": 0.30286329187394223 + }, + { + "epoch": 4.124303723248314, + "grad_norm": 0.24214065157301334, + "learning_rate": 0.00028042640962468906, + "loss": 2.9967823028564453, + "step": 7036, + "token_acc": 0.2985971328037369 + }, + { + "epoch": 4.124890061565523, + "grad_norm": 0.2326218495025064, + "learning_rate": 0.00028041922839204303, + "loss": 2.941636562347412, + "step": 7037, + "token_acc": 0.3060871648257797 + }, + { + "epoch": 4.125476399882732, + "grad_norm": 0.2541204139686519, + "learning_rate": 0.0002804120459342841, + "loss": 2.964323043823242, + "step": 7038, + "token_acc": 0.3049063842075841 + }, + { + "epoch": 4.126062738199941, + "grad_norm": 0.26932132221580224, + "learning_rate": 0.0002804048622514798, + "loss": 2.937098503112793, + "step": 7039, + "token_acc": 0.30726133036834535 + }, + { + "epoch": 4.12664907651715, + "grad_norm": 0.26133225286895573, + "learning_rate": 0.00028039767734369745, + "loss": 2.9305028915405273, + "step": 7040, + "token_acc": 0.30853444958415466 + }, + { + "epoch": 4.1272354148343595, + "grad_norm": 0.25331248054799116, + "learning_rate": 0.0002803904912110047, + "loss": 2.936984062194824, + "step": 7041, + "token_acc": 0.3071333005822036 + }, + { + "epoch": 4.127821753151569, + "grad_norm": 0.26815808385911194, + "learning_rate": 0.0002803833038534689, + "loss": 2.999026298522949, + "step": 7042, + "token_acc": 0.2985480990978328 + }, + { + "epoch": 4.128408091468778, + "grad_norm": 0.2511223325206849, + "learning_rate": 0.00028037611527115773, + "loss": 2.954646587371826, + "step": 7043, + "token_acc": 0.3070369030390738 + }, + { + "epoch": 4.128994429785987, + "grad_norm": 0.2551933052948444, + "learning_rate": 0.00028036892546413856, + "loss": 2.9374358654022217, + "step": 7044, + "token_acc": 0.3073796172391087 + }, + { + "epoch": 4.129580768103196, + "grad_norm": 0.2574986024833367, + "learning_rate": 0.0002803617344324791, + "loss": 2.9563074111938477, + "step": 7045, + "token_acc": 0.3039713204741065 + }, + { + "epoch": 4.130167106420404, + "grad_norm": 0.261607554709604, + "learning_rate": 0.0002803545421762468, + "loss": 2.924598455429077, + "step": 7046, + "token_acc": 0.30951524523922674 + }, + { + "epoch": 4.130753444737613, + "grad_norm": 0.2594858690872754, + "learning_rate": 0.00028034734869550917, + "loss": 2.931480884552002, + "step": 7047, + "token_acc": 0.30810158310331304 + }, + { + "epoch": 4.131339783054822, + "grad_norm": 0.25950538188557626, + "learning_rate": 0.0002803401539903339, + "loss": 2.957671642303467, + "step": 7048, + "token_acc": 0.30473850567080135 + }, + { + "epoch": 4.1319261213720315, + "grad_norm": 0.26676664138606776, + "learning_rate": 0.0002803329580607885, + "loss": 3.025540828704834, + "step": 7049, + "token_acc": 0.29556255966938255 + }, + { + "epoch": 4.132512459689241, + "grad_norm": 0.2428995759440316, + "learning_rate": 0.00028032576090694064, + "loss": 2.9836220741271973, + "step": 7050, + "token_acc": 0.29984319843876783 + }, + { + "epoch": 4.13309879800645, + "grad_norm": 0.24274853593252, + "learning_rate": 0.0002803185625288578, + "loss": 2.9807164669036865, + "step": 7051, + "token_acc": 0.30284529552108186 + }, + { + "epoch": 4.133685136323659, + "grad_norm": 0.2232729070243147, + "learning_rate": 0.0002803113629266077, + "loss": 2.9615988731384277, + "step": 7052, + "token_acc": 0.3028200839428192 + }, + { + "epoch": 4.134271474640868, + "grad_norm": 0.22309885025648937, + "learning_rate": 0.0002803041621002579, + "loss": 3.0009429454803467, + "step": 7053, + "token_acc": 0.2987090803066536 + }, + { + "epoch": 4.134857812958077, + "grad_norm": 0.23118697000946808, + "learning_rate": 0.0002802969600498761, + "loss": 2.9297561645507812, + "step": 7054, + "token_acc": 0.3097233299562415 + }, + { + "epoch": 4.135444151275286, + "grad_norm": 0.24633332716304196, + "learning_rate": 0.00028028975677552996, + "loss": 2.9691290855407715, + "step": 7055, + "token_acc": 0.30230859070174243 + }, + { + "epoch": 4.136030489592494, + "grad_norm": 0.24210784554013462, + "learning_rate": 0.00028028255227728713, + "loss": 3.0154945850372314, + "step": 7056, + "token_acc": 0.295490711895075 + }, + { + "epoch": 4.1366168279097035, + "grad_norm": 0.25311856661514065, + "learning_rate": 0.0002802753465552153, + "loss": 2.91676664352417, + "step": 7057, + "token_acc": 0.3108535442233308 + }, + { + "epoch": 4.137203166226913, + "grad_norm": 0.22818253388629445, + "learning_rate": 0.0002802681396093821, + "loss": 2.961775779724121, + "step": 7058, + "token_acc": 0.3044568237704372 + }, + { + "epoch": 4.137789504544122, + "grad_norm": 0.2264347300078047, + "learning_rate": 0.00028026093143985526, + "loss": 3.0084028244018555, + "step": 7059, + "token_acc": 0.29910979694780043 + }, + { + "epoch": 4.138375842861331, + "grad_norm": 0.22188733061543833, + "learning_rate": 0.00028025372204670254, + "loss": 2.976555824279785, + "step": 7060, + "token_acc": 0.3017328374131419 + }, + { + "epoch": 4.13896218117854, + "grad_norm": 0.20972460257703235, + "learning_rate": 0.00028024651142999156, + "loss": 2.899305820465088, + "step": 7061, + "token_acc": 0.31322386154988086 + }, + { + "epoch": 4.139548519495749, + "grad_norm": 0.2284321127092416, + "learning_rate": 0.00028023929958979015, + "loss": 2.9407315254211426, + "step": 7062, + "token_acc": 0.30848145886235306 + }, + { + "epoch": 4.140134857812958, + "grad_norm": 0.2251948627095996, + "learning_rate": 0.000280232086526166, + "loss": 2.9364328384399414, + "step": 7063, + "token_acc": 0.307368871869804 + }, + { + "epoch": 4.140721196130167, + "grad_norm": 0.2214741640435235, + "learning_rate": 0.00028022487223918694, + "loss": 3.009774684906006, + "step": 7064, + "token_acc": 0.2979091022275976 + }, + { + "epoch": 4.141307534447376, + "grad_norm": 0.22189053718363874, + "learning_rate": 0.0002802176567289206, + "loss": 2.99110746383667, + "step": 7065, + "token_acc": 0.30059114339078985 + }, + { + "epoch": 4.1418938727645855, + "grad_norm": 0.222933484263725, + "learning_rate": 0.0002802104399954349, + "loss": 2.9572460651397705, + "step": 7066, + "token_acc": 0.30354219882187355 + }, + { + "epoch": 4.142480211081795, + "grad_norm": 0.2161261199293689, + "learning_rate": 0.0002802032220387976, + "loss": 2.931248188018799, + "step": 7067, + "token_acc": 0.3075161964742018 + }, + { + "epoch": 4.143066549399003, + "grad_norm": 0.23061846200269812, + "learning_rate": 0.00028019600285907645, + "loss": 2.9586434364318848, + "step": 7068, + "token_acc": 0.30535956032146405 + }, + { + "epoch": 4.143652887716212, + "grad_norm": 0.22502234555649936, + "learning_rate": 0.00028018878245633926, + "loss": 2.97121524810791, + "step": 7069, + "token_acc": 0.30230687135703144 + }, + { + "epoch": 4.144239226033421, + "grad_norm": 0.2204685403627289, + "learning_rate": 0.00028018156083065395, + "loss": 2.9642229080200195, + "step": 7070, + "token_acc": 0.30380702191103176 + }, + { + "epoch": 4.14482556435063, + "grad_norm": 0.22681193702318006, + "learning_rate": 0.0002801743379820883, + "loss": 2.9790120124816895, + "step": 7071, + "token_acc": 0.2999884299433067 + }, + { + "epoch": 4.145411902667839, + "grad_norm": 0.22753099396102439, + "learning_rate": 0.00028016711391071013, + "loss": 2.948507785797119, + "step": 7072, + "token_acc": 0.30713466547193913 + }, + { + "epoch": 4.145998240985048, + "grad_norm": 0.23762351548633373, + "learning_rate": 0.0002801598886165873, + "loss": 2.93526029586792, + "step": 7073, + "token_acc": 0.3076610662220512 + }, + { + "epoch": 4.1465845793022575, + "grad_norm": 0.23350424503284453, + "learning_rate": 0.00028015266209978774, + "loss": 2.965040683746338, + "step": 7074, + "token_acc": 0.3040210248280352 + }, + { + "epoch": 4.147170917619467, + "grad_norm": 0.24965724553199728, + "learning_rate": 0.0002801454343603793, + "loss": 2.9607763290405273, + "step": 7075, + "token_acc": 0.3040059833882248 + }, + { + "epoch": 4.147757255936676, + "grad_norm": 0.2217545354075916, + "learning_rate": 0.0002801382053984299, + "loss": 2.9728918075561523, + "step": 7076, + "token_acc": 0.30271535580524345 + }, + { + "epoch": 4.148343594253885, + "grad_norm": 0.22762392624710606, + "learning_rate": 0.0002801309752140074, + "loss": 2.9778435230255127, + "step": 7077, + "token_acc": 0.300032694797923 + }, + { + "epoch": 4.148929932571093, + "grad_norm": 0.22916105769679446, + "learning_rate": 0.0002801237438071797, + "loss": 2.9443044662475586, + "step": 7078, + "token_acc": 0.30466865590950676 + }, + { + "epoch": 4.149516270888302, + "grad_norm": 0.23011379946959531, + "learning_rate": 0.0002801165111780148, + "loss": 2.973602294921875, + "step": 7079, + "token_acc": 0.3030043830521982 + }, + { + "epoch": 4.150102609205511, + "grad_norm": 0.23387135508406948, + "learning_rate": 0.00028010927732658066, + "loss": 2.9712934494018555, + "step": 7080, + "token_acc": 0.302414244594123 + }, + { + "epoch": 4.15068894752272, + "grad_norm": 0.25445804985885206, + "learning_rate": 0.00028010204225294513, + "loss": 2.9957268238067627, + "step": 7081, + "token_acc": 0.29876220560509625 + }, + { + "epoch": 4.1512752858399296, + "grad_norm": 0.24829901991419118, + "learning_rate": 0.00028009480595717626, + "loss": 2.930509567260742, + "step": 7082, + "token_acc": 0.3104601387598959 + }, + { + "epoch": 4.151861624157139, + "grad_norm": 0.22691444173188832, + "learning_rate": 0.000280087568439342, + "loss": 2.972425699234009, + "step": 7083, + "token_acc": 0.3011443682451383 + }, + { + "epoch": 4.152447962474348, + "grad_norm": 0.25637480601914125, + "learning_rate": 0.00028008032969951025, + "loss": 2.952868938446045, + "step": 7084, + "token_acc": 0.304035063890253 + }, + { + "epoch": 4.153034300791557, + "grad_norm": 0.24183091062628748, + "learning_rate": 0.0002800730897377492, + "loss": 2.972439765930176, + "step": 7085, + "token_acc": 0.3035490733700452 + }, + { + "epoch": 4.153620639108766, + "grad_norm": 0.2392878230619902, + "learning_rate": 0.0002800658485541267, + "loss": 2.9905807971954346, + "step": 7086, + "token_acc": 0.3005035913950329 + }, + { + "epoch": 4.154206977425975, + "grad_norm": 0.2444217927559297, + "learning_rate": 0.0002800586061487108, + "loss": 2.925438642501831, + "step": 7087, + "token_acc": 0.3091620129800125 + }, + { + "epoch": 4.154793315743184, + "grad_norm": 0.21797320716174756, + "learning_rate": 0.00028005136252156953, + "loss": 2.952061891555786, + "step": 7088, + "token_acc": 0.30380722891566264 + }, + { + "epoch": 4.1553796540603924, + "grad_norm": 0.23439335581191156, + "learning_rate": 0.000280044117672771, + "loss": 2.9105563163757324, + "step": 7089, + "token_acc": 0.3107666005318924 + }, + { + "epoch": 4.155965992377602, + "grad_norm": 0.240632879032838, + "learning_rate": 0.0002800368716023832, + "loss": 2.947885513305664, + "step": 7090, + "token_acc": 0.304783145233106 + }, + { + "epoch": 4.156552330694811, + "grad_norm": 0.244042502175311, + "learning_rate": 0.00028002962431047425, + "loss": 2.937822103500366, + "step": 7091, + "token_acc": 0.3075243479294802 + }, + { + "epoch": 4.15713866901202, + "grad_norm": 0.26747605260300134, + "learning_rate": 0.0002800223757971122, + "loss": 2.9654717445373535, + "step": 7092, + "token_acc": 0.30359673409942817 + }, + { + "epoch": 4.157725007329229, + "grad_norm": 0.23634852609128923, + "learning_rate": 0.0002800151260623651, + "loss": 2.937488317489624, + "step": 7093, + "token_acc": 0.30765783825853166 + }, + { + "epoch": 4.158311345646438, + "grad_norm": 0.23847859466719065, + "learning_rate": 0.0002800078751063011, + "loss": 2.989058017730713, + "step": 7094, + "token_acc": 0.3001433581315918 + }, + { + "epoch": 4.158897683963647, + "grad_norm": 0.26843347481760865, + "learning_rate": 0.0002800006229289883, + "loss": 2.9836111068725586, + "step": 7095, + "token_acc": 0.30146643305576987 + }, + { + "epoch": 4.159484022280856, + "grad_norm": 0.2221002321510319, + "learning_rate": 0.00027999336953049483, + "loss": 2.9927072525024414, + "step": 7096, + "token_acc": 0.2994058913521783 + }, + { + "epoch": 4.160070360598065, + "grad_norm": 0.22148263504205806, + "learning_rate": 0.00027998611491088883, + "loss": 2.95967698097229, + "step": 7097, + "token_acc": 0.30393760878996806 + }, + { + "epoch": 4.1606566989152745, + "grad_norm": 0.2472598839608634, + "learning_rate": 0.0002799788590702384, + "loss": 2.9445595741271973, + "step": 7098, + "token_acc": 0.3066633844240312 + }, + { + "epoch": 4.161243037232484, + "grad_norm": 0.2306243136796015, + "learning_rate": 0.00027997160200861175, + "loss": 2.964796543121338, + "step": 7099, + "token_acc": 0.30244676145982136 + }, + { + "epoch": 4.161829375549692, + "grad_norm": 0.2547094045215424, + "learning_rate": 0.00027996434372607707, + "loss": 2.9427552223205566, + "step": 7100, + "token_acc": 0.30708439904299056 + }, + { + "epoch": 4.162415713866901, + "grad_norm": 0.23370596441884453, + "learning_rate": 0.0002799570842227025, + "loss": 2.956815481185913, + "step": 7101, + "token_acc": 0.3044716817506903 + }, + { + "epoch": 4.16300205218411, + "grad_norm": 0.2429097091197528, + "learning_rate": 0.0002799498234985562, + "loss": 2.949885606765747, + "step": 7102, + "token_acc": 0.3063343717549325 + }, + { + "epoch": 4.163588390501319, + "grad_norm": 0.2475838052855693, + "learning_rate": 0.00027994256155370646, + "loss": 2.9774158000946045, + "step": 7103, + "token_acc": 0.3022645376536852 + }, + { + "epoch": 4.164174728818528, + "grad_norm": 0.2547814826972486, + "learning_rate": 0.0002799352983882215, + "loss": 2.9701404571533203, + "step": 7104, + "token_acc": 0.303403080862684 + }, + { + "epoch": 4.164761067135737, + "grad_norm": 0.24521281986178634, + "learning_rate": 0.00027992803400216944, + "loss": 2.9859836101531982, + "step": 7105, + "token_acc": 0.29887775591955723 + }, + { + "epoch": 4.1653474054529465, + "grad_norm": 0.2647998910863689, + "learning_rate": 0.0002799207683956186, + "loss": 2.954998731613159, + "step": 7106, + "token_acc": 0.30600121014320025 + }, + { + "epoch": 4.165933743770156, + "grad_norm": 0.2629432528514209, + "learning_rate": 0.00027991350156863717, + "loss": 2.9690797328948975, + "step": 7107, + "token_acc": 0.3027115677261151 + }, + { + "epoch": 4.166520082087365, + "grad_norm": 0.24680571741192905, + "learning_rate": 0.00027990623352129346, + "loss": 2.9474008083343506, + "step": 7108, + "token_acc": 0.30641482427325434 + }, + { + "epoch": 4.167106420404574, + "grad_norm": 0.23644255257035737, + "learning_rate": 0.00027989896425365576, + "loss": 2.9576873779296875, + "step": 7109, + "token_acc": 0.30443801056815334 + }, + { + "epoch": 4.167692758721783, + "grad_norm": 0.2619561570670442, + "learning_rate": 0.00027989169376579237, + "loss": 2.9569711685180664, + "step": 7110, + "token_acc": 0.3042408353488225 + }, + { + "epoch": 4.168279097038991, + "grad_norm": 0.21634723956185892, + "learning_rate": 0.0002798844220577715, + "loss": 2.9592514038085938, + "step": 7111, + "token_acc": 0.30343157900176476 + }, + { + "epoch": 4.1688654353562, + "grad_norm": 0.26147269816837365, + "learning_rate": 0.0002798771491296615, + "loss": 2.969801425933838, + "step": 7112, + "token_acc": 0.30307761332603717 + }, + { + "epoch": 4.169451773673409, + "grad_norm": 0.21805415342833442, + "learning_rate": 0.0002798698749815307, + "loss": 2.9321141242980957, + "step": 7113, + "token_acc": 0.3085903164853442 + }, + { + "epoch": 4.1700381119906185, + "grad_norm": 0.25158614798935675, + "learning_rate": 0.0002798625996134475, + "loss": 2.9781296253204346, + "step": 7114, + "token_acc": 0.3012214455483732 + }, + { + "epoch": 4.170624450307828, + "grad_norm": 0.21055065657956562, + "learning_rate": 0.0002798553230254801, + "loss": 2.94368052482605, + "step": 7115, + "token_acc": 0.3075166462977791 + }, + { + "epoch": 4.171210788625037, + "grad_norm": 0.23576910755775563, + "learning_rate": 0.0002798480452176969, + "loss": 2.9640750885009766, + "step": 7116, + "token_acc": 0.30338828286781133 + }, + { + "epoch": 4.171797126942246, + "grad_norm": 0.235043132161141, + "learning_rate": 0.00027984076619016633, + "loss": 2.970208168029785, + "step": 7117, + "token_acc": 0.3022410455614306 + }, + { + "epoch": 4.172383465259455, + "grad_norm": 0.21938000103478963, + "learning_rate": 0.0002798334859429567, + "loss": 2.9788246154785156, + "step": 7118, + "token_acc": 0.3015351227140714 + }, + { + "epoch": 4.172969803576664, + "grad_norm": 0.2322315098196772, + "learning_rate": 0.00027982620447613644, + "loss": 2.9677741527557373, + "step": 7119, + "token_acc": 0.30360736076853173 + }, + { + "epoch": 4.173556141893873, + "grad_norm": 0.22000374386222343, + "learning_rate": 0.00027981892178977394, + "loss": 2.986588954925537, + "step": 7120, + "token_acc": 0.30002392929413785 + }, + { + "epoch": 4.174142480211081, + "grad_norm": 0.2260902705951297, + "learning_rate": 0.0002798116378839376, + "loss": 2.972031593322754, + "step": 7121, + "token_acc": 0.30150214867551794 + }, + { + "epoch": 4.1747288185282905, + "grad_norm": 0.2227465686895031, + "learning_rate": 0.0002798043527586958, + "loss": 2.927219867706299, + "step": 7122, + "token_acc": 0.3094250920566751 + }, + { + "epoch": 4.1753151568455, + "grad_norm": 0.21935979084719215, + "learning_rate": 0.0002797970664141171, + "loss": 2.9706227779388428, + "step": 7123, + "token_acc": 0.30268446553733735 + }, + { + "epoch": 4.175901495162709, + "grad_norm": 0.2204873715575863, + "learning_rate": 0.00027978977885026983, + "loss": 2.945462703704834, + "step": 7124, + "token_acc": 0.3064844391961385 + }, + { + "epoch": 4.176487833479918, + "grad_norm": 0.22956624948708773, + "learning_rate": 0.00027978249006722244, + "loss": 2.9830143451690674, + "step": 7125, + "token_acc": 0.30172838609614017 + }, + { + "epoch": 4.177074171797127, + "grad_norm": 0.21350312543426075, + "learning_rate": 0.0002797752000650435, + "loss": 2.9473366737365723, + "step": 7126, + "token_acc": 0.30502939255388634 + }, + { + "epoch": 4.177660510114336, + "grad_norm": 0.23924067253008593, + "learning_rate": 0.0002797679088438014, + "loss": 2.9383702278137207, + "step": 7127, + "token_acc": 0.3086238339848964 + }, + { + "epoch": 4.178246848431545, + "grad_norm": 0.22888709026859433, + "learning_rate": 0.0002797606164035647, + "loss": 2.967724323272705, + "step": 7128, + "token_acc": 0.30075341123567695 + }, + { + "epoch": 4.178833186748754, + "grad_norm": 0.22299672441223056, + "learning_rate": 0.0002797533227444018, + "loss": 2.9012012481689453, + "step": 7129, + "token_acc": 0.3134994489384267 + }, + { + "epoch": 4.179419525065963, + "grad_norm": 0.2267764326101299, + "learning_rate": 0.0002797460278663813, + "loss": 2.9398555755615234, + "step": 7130, + "token_acc": 0.30675164920241504 + }, + { + "epoch": 4.1800058633831725, + "grad_norm": 0.24350531200783265, + "learning_rate": 0.00027973873176957167, + "loss": 2.9520580768585205, + "step": 7131, + "token_acc": 0.30463218358119826 + }, + { + "epoch": 4.180592201700381, + "grad_norm": 0.24714987235594757, + "learning_rate": 0.0002797314344540415, + "loss": 2.9722585678100586, + "step": 7132, + "token_acc": 0.3023703838365475 + }, + { + "epoch": 4.18117854001759, + "grad_norm": 0.2527182244850833, + "learning_rate": 0.00027972413591985937, + "loss": 2.9668326377868652, + "step": 7133, + "token_acc": 0.3039479054197168 + }, + { + "epoch": 4.181764878334799, + "grad_norm": 0.22874539615960185, + "learning_rate": 0.00027971683616709374, + "loss": 2.9069981575012207, + "step": 7134, + "token_acc": 0.31072032639991737 + }, + { + "epoch": 4.182351216652008, + "grad_norm": 0.22820477298100342, + "learning_rate": 0.0002797095351958133, + "loss": 2.9717440605163574, + "step": 7135, + "token_acc": 0.3011552724780373 + }, + { + "epoch": 4.182937554969217, + "grad_norm": 0.2373630390894112, + "learning_rate": 0.00027970223300608643, + "loss": 2.9145684242248535, + "step": 7136, + "token_acc": 0.3091124373460199 + }, + { + "epoch": 4.183523893286426, + "grad_norm": 0.22602484445093982, + "learning_rate": 0.00027969492959798196, + "loss": 2.9773716926574707, + "step": 7137, + "token_acc": 0.3019175021672046 + }, + { + "epoch": 4.184110231603635, + "grad_norm": 0.22848192729257139, + "learning_rate": 0.00027968762497156835, + "loss": 2.9658541679382324, + "step": 7138, + "token_acc": 0.3032980463631827 + }, + { + "epoch": 4.1846965699208445, + "grad_norm": 0.24610876966895218, + "learning_rate": 0.0002796803191269143, + "loss": 2.932302474975586, + "step": 7139, + "token_acc": 0.3081606760838522 + }, + { + "epoch": 4.185282908238054, + "grad_norm": 0.23334521099448174, + "learning_rate": 0.00027967301206408837, + "loss": 2.9689481258392334, + "step": 7140, + "token_acc": 0.30098418101985464 + }, + { + "epoch": 4.185869246555263, + "grad_norm": 0.25264448409772233, + "learning_rate": 0.00027966570378315926, + "loss": 2.965956449508667, + "step": 7141, + "token_acc": 0.3024810880727472 + }, + { + "epoch": 4.186455584872472, + "grad_norm": 0.22368728813867814, + "learning_rate": 0.00027965839428419553, + "loss": 2.9376280307769775, + "step": 7142, + "token_acc": 0.30766104554854773 + }, + { + "epoch": 4.18704192318968, + "grad_norm": 0.23440779366134426, + "learning_rate": 0.0002796510835672659, + "loss": 2.963715076446533, + "step": 7143, + "token_acc": 0.3031145921018922 + }, + { + "epoch": 4.187628261506889, + "grad_norm": 0.2569461271408038, + "learning_rate": 0.00027964377163243914, + "loss": 2.9298667907714844, + "step": 7144, + "token_acc": 0.3081967213114754 + }, + { + "epoch": 4.188214599824098, + "grad_norm": 0.2146023937850612, + "learning_rate": 0.00027963645847978375, + "loss": 2.9771406650543213, + "step": 7145, + "token_acc": 0.30071813744281733 + }, + { + "epoch": 4.188800938141307, + "grad_norm": 0.23064107450841975, + "learning_rate": 0.0002796291441093686, + "loss": 2.93847393989563, + "step": 7146, + "token_acc": 0.30797406299517727 + }, + { + "epoch": 4.1893872764585165, + "grad_norm": 0.21873435236853192, + "learning_rate": 0.0002796218285212622, + "loss": 3.0000576972961426, + "step": 7147, + "token_acc": 0.2975899149083081 + }, + { + "epoch": 4.189973614775726, + "grad_norm": 0.23362462088238492, + "learning_rate": 0.0002796145117155335, + "loss": 2.945387363433838, + "step": 7148, + "token_acc": 0.3074763911532191 + }, + { + "epoch": 4.190559953092935, + "grad_norm": 0.21900884207080107, + "learning_rate": 0.00027960719369225106, + "loss": 2.9527676105499268, + "step": 7149, + "token_acc": 0.3034883254164715 + }, + { + "epoch": 4.191146291410144, + "grad_norm": 0.23252562681536362, + "learning_rate": 0.0002795998744514837, + "loss": 2.9716498851776123, + "step": 7150, + "token_acc": 0.30208920486571994 + }, + { + "epoch": 4.191732629727353, + "grad_norm": 0.2613067061748528, + "learning_rate": 0.0002795925539933002, + "loss": 2.989401340484619, + "step": 7151, + "token_acc": 0.29963326524535183 + }, + { + "epoch": 4.192318968044562, + "grad_norm": 0.2459518909128689, + "learning_rate": 0.0002795852323177692, + "loss": 2.9221770763397217, + "step": 7152, + "token_acc": 0.3112821097825298 + }, + { + "epoch": 4.192905306361771, + "grad_norm": 0.2205318046437699, + "learning_rate": 0.00027957790942495964, + "loss": 3.0039522647857666, + "step": 7153, + "token_acc": 0.296598592721575 + }, + { + "epoch": 4.193491644678979, + "grad_norm": 0.24371516259060283, + "learning_rate": 0.0002795705853149402, + "loss": 3.002087354660034, + "step": 7154, + "token_acc": 0.29926761143504393 + }, + { + "epoch": 4.1940779829961885, + "grad_norm": 0.23099993737736058, + "learning_rate": 0.0002795632599877797, + "loss": 2.9613800048828125, + "step": 7155, + "token_acc": 0.3060259817345438 + }, + { + "epoch": 4.194664321313398, + "grad_norm": 0.21666149844607271, + "learning_rate": 0.000279555933443547, + "loss": 2.9645919799804688, + "step": 7156, + "token_acc": 0.30355589824268997 + }, + { + "epoch": 4.195250659630607, + "grad_norm": 0.22560474757390725, + "learning_rate": 0.0002795486056823108, + "loss": 2.93660306930542, + "step": 7157, + "token_acc": 0.307644438195372 + }, + { + "epoch": 4.195836997947816, + "grad_norm": 0.2143182202227415, + "learning_rate": 0.0002795412767041401, + "loss": 2.9247260093688965, + "step": 7158, + "token_acc": 0.308627515462744 + }, + { + "epoch": 4.196423336265025, + "grad_norm": 0.24378449410246744, + "learning_rate": 0.0002795339465091036, + "loss": 2.9704389572143555, + "step": 7159, + "token_acc": 0.3022703013694482 + }, + { + "epoch": 4.197009674582234, + "grad_norm": 0.2393462724897528, + "learning_rate": 0.00027952661509727026, + "loss": 2.9679412841796875, + "step": 7160, + "token_acc": 0.30400949478681466 + }, + { + "epoch": 4.197596012899443, + "grad_norm": 0.25074212708065247, + "learning_rate": 0.0002795192824687089, + "loss": 2.940260410308838, + "step": 7161, + "token_acc": 0.3066548210955075 + }, + { + "epoch": 4.198182351216652, + "grad_norm": 0.2055295308303941, + "learning_rate": 0.00027951194862348844, + "loss": 2.965881109237671, + "step": 7162, + "token_acc": 0.30385203098632724 + }, + { + "epoch": 4.198768689533861, + "grad_norm": 0.2795800503669608, + "learning_rate": 0.00027950461356167773, + "loss": 2.928539276123047, + "step": 7163, + "token_acc": 0.3080695414419093 + }, + { + "epoch": 4.19935502785107, + "grad_norm": 0.2574245167648106, + "learning_rate": 0.0002794972772833456, + "loss": 2.9717133045196533, + "step": 7164, + "token_acc": 0.3018946823445076 + }, + { + "epoch": 4.199941366168279, + "grad_norm": 0.20455830980264006, + "learning_rate": 0.0002794899397885612, + "loss": 2.9435808658599854, + "step": 7165, + "token_acc": 0.30615077803143614 + }, + { + "epoch": 4.200527704485488, + "grad_norm": 0.26263796607918494, + "learning_rate": 0.0002794826010773932, + "loss": 2.929746150970459, + "step": 7166, + "token_acc": 0.30981280113741083 + }, + { + "epoch": 4.201114042802697, + "grad_norm": 0.2161661501522452, + "learning_rate": 0.00027947526114991073, + "loss": 2.9303746223449707, + "step": 7167, + "token_acc": 0.3091720624520303 + }, + { + "epoch": 4.201700381119906, + "grad_norm": 0.2522378803582865, + "learning_rate": 0.0002794679200061825, + "loss": 3.014087677001953, + "step": 7168, + "token_acc": 0.29651893819850417 + }, + { + "epoch": 4.202286719437115, + "grad_norm": 0.24439679799342348, + "learning_rate": 0.00027946057764627775, + "loss": 2.906731605529785, + "step": 7169, + "token_acc": 0.3118171523773575 + }, + { + "epoch": 4.202873057754324, + "grad_norm": 0.22583642152211197, + "learning_rate": 0.0002794532340702653, + "loss": 2.937023162841797, + "step": 7170, + "token_acc": 0.3073085429203312 + }, + { + "epoch": 4.203459396071533, + "grad_norm": 0.23110355674714725, + "learning_rate": 0.00027944588927821413, + "loss": 2.9656410217285156, + "step": 7171, + "token_acc": 0.30344160906945383 + }, + { + "epoch": 4.2040457343887425, + "grad_norm": 0.2502207298018072, + "learning_rate": 0.0002794385432701933, + "loss": 2.938009262084961, + "step": 7172, + "token_acc": 0.308593295770479 + }, + { + "epoch": 4.204632072705952, + "grad_norm": 0.24808591023320029, + "learning_rate": 0.00027943119604627173, + "loss": 2.981632709503174, + "step": 7173, + "token_acc": 0.3021054006016953 + }, + { + "epoch": 4.205218411023161, + "grad_norm": 0.23886907557550716, + "learning_rate": 0.0002794238476065185, + "loss": 2.975621461868286, + "step": 7174, + "token_acc": 0.3005077746724803 + }, + { + "epoch": 4.205804749340369, + "grad_norm": 0.22924291178181072, + "learning_rate": 0.00027941649795100264, + "loss": 2.9458160400390625, + "step": 7175, + "token_acc": 0.3059799574981215 + }, + { + "epoch": 4.206391087657578, + "grad_norm": 0.2227007321915543, + "learning_rate": 0.00027940914707979316, + "loss": 2.951711654663086, + "step": 7176, + "token_acc": 0.30477109006945124 + }, + { + "epoch": 4.206977425974787, + "grad_norm": 0.2300017522391354, + "learning_rate": 0.00027940179499295914, + "loss": 2.935321807861328, + "step": 7177, + "token_acc": 0.30842687477212605 + }, + { + "epoch": 4.207563764291996, + "grad_norm": 0.2342032710115526, + "learning_rate": 0.0002793944416905696, + "loss": 2.9410626888275146, + "step": 7178, + "token_acc": 0.3066074361048511 + }, + { + "epoch": 4.208150102609205, + "grad_norm": 0.23937518119630974, + "learning_rate": 0.0002793870871726936, + "loss": 2.9870901107788086, + "step": 7179, + "token_acc": 0.2996194736228443 + }, + { + "epoch": 4.2087364409264145, + "grad_norm": 0.2435238510899763, + "learning_rate": 0.0002793797314394004, + "loss": 2.9651312828063965, + "step": 7180, + "token_acc": 0.30357758286735614 + }, + { + "epoch": 4.209322779243624, + "grad_norm": 0.24063754098713017, + "learning_rate": 0.0002793723744907589, + "loss": 2.937845468521118, + "step": 7181, + "token_acc": 0.3066822166331961 + }, + { + "epoch": 4.209909117560833, + "grad_norm": 0.2336205600179901, + "learning_rate": 0.0002793650163268382, + "loss": 2.944732904434204, + "step": 7182, + "token_acc": 0.3071571702043063 + }, + { + "epoch": 4.210495455878042, + "grad_norm": 0.22337784377271147, + "learning_rate": 0.00027935765694770754, + "loss": 2.9650626182556152, + "step": 7183, + "token_acc": 0.30338061081578804 + }, + { + "epoch": 4.211081794195251, + "grad_norm": 0.22940496114069442, + "learning_rate": 0.000279350296353436, + "loss": 2.954596757888794, + "step": 7184, + "token_acc": 0.30548322627440133 + }, + { + "epoch": 4.21166813251246, + "grad_norm": 0.2368833138661059, + "learning_rate": 0.0002793429345440928, + "loss": 2.9842207431793213, + "step": 7185, + "token_acc": 0.2995793049385942 + }, + { + "epoch": 4.212254470829668, + "grad_norm": 0.2410989701531458, + "learning_rate": 0.00027933557151974697, + "loss": 2.9854331016540527, + "step": 7186, + "token_acc": 0.300787216242699 + }, + { + "epoch": 4.212840809146877, + "grad_norm": 0.2195840763549594, + "learning_rate": 0.0002793282072804677, + "loss": 2.977243423461914, + "step": 7187, + "token_acc": 0.30053291360727385 + }, + { + "epoch": 4.2134271474640865, + "grad_norm": 0.23243841516018374, + "learning_rate": 0.00027932084182632425, + "loss": 3.0346317291259766, + "step": 7188, + "token_acc": 0.2930134274582094 + }, + { + "epoch": 4.214013485781296, + "grad_norm": 0.2283962479897357, + "learning_rate": 0.0002793134751573857, + "loss": 2.9303455352783203, + "step": 7189, + "token_acc": 0.309084792464316 + }, + { + "epoch": 4.214599824098505, + "grad_norm": 0.23880408473174367, + "learning_rate": 0.0002793061072737213, + "loss": 2.9933369159698486, + "step": 7190, + "token_acc": 0.2982695983493462 + }, + { + "epoch": 4.215186162415714, + "grad_norm": 0.22206798767023753, + "learning_rate": 0.0002792987381754003, + "loss": 2.921048641204834, + "step": 7191, + "token_acc": 0.310069056367941 + }, + { + "epoch": 4.215772500732923, + "grad_norm": 0.2548641533782326, + "learning_rate": 0.00027929136786249186, + "loss": 2.933879852294922, + "step": 7192, + "token_acc": 0.30738909725072855 + }, + { + "epoch": 4.216358839050132, + "grad_norm": 0.22598315336124358, + "learning_rate": 0.00027928399633506525, + "loss": 2.967158317565918, + "step": 7193, + "token_acc": 0.303506335929188 + }, + { + "epoch": 4.216945177367341, + "grad_norm": 0.24078647759021363, + "learning_rate": 0.00027927662359318974, + "loss": 2.9204459190368652, + "step": 7194, + "token_acc": 0.3096900270940819 + }, + { + "epoch": 4.21753151568455, + "grad_norm": 0.24184321318659951, + "learning_rate": 0.0002792692496369345, + "loss": 3.000326633453369, + "step": 7195, + "token_acc": 0.2992154433777227 + }, + { + "epoch": 4.218117854001759, + "grad_norm": 0.23464220067362695, + "learning_rate": 0.00027926187446636894, + "loss": 2.9488885402679443, + "step": 7196, + "token_acc": 0.30547350398346457 + }, + { + "epoch": 4.218704192318968, + "grad_norm": 0.2607214459305649, + "learning_rate": 0.00027925449808156215, + "loss": 3.009669065475464, + "step": 7197, + "token_acc": 0.2956722827882392 + }, + { + "epoch": 4.219290530636177, + "grad_norm": 0.23777982058184158, + "learning_rate": 0.00027924712048258354, + "loss": 2.942624092102051, + "step": 7198, + "token_acc": 0.306084282806813 + }, + { + "epoch": 4.219876868953386, + "grad_norm": 0.23116310031305282, + "learning_rate": 0.0002792397416695025, + "loss": 3.0218467712402344, + "step": 7199, + "token_acc": 0.2979693547531944 + }, + { + "epoch": 4.220463207270595, + "grad_norm": 0.22967455758778085, + "learning_rate": 0.0002792323616423881, + "loss": 2.9594767093658447, + "step": 7200, + "token_acc": 0.3034323075563779 + }, + { + "epoch": 4.221049545587804, + "grad_norm": 0.22284893708568565, + "learning_rate": 0.00027922498040130984, + "loss": 2.9530961513519287, + "step": 7201, + "token_acc": 0.3041579244944278 + }, + { + "epoch": 4.221635883905013, + "grad_norm": 0.21244358462659857, + "learning_rate": 0.0002792175979463371, + "loss": 2.9933292865753174, + "step": 7202, + "token_acc": 0.30087169315981566 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 0.23414613721805977, + "learning_rate": 0.0002792102142775391, + "loss": 3.0051686763763428, + "step": 7203, + "token_acc": 0.29778418437094506 + }, + { + "epoch": 4.222808560539431, + "grad_norm": 0.22866065510903766, + "learning_rate": 0.00027920282939498524, + "loss": 2.934781074523926, + "step": 7204, + "token_acc": 0.3078724684178865 + }, + { + "epoch": 4.2233948988566405, + "grad_norm": 0.21421220293720977, + "learning_rate": 0.00027919544329874487, + "loss": 2.9708244800567627, + "step": 7205, + "token_acc": 0.3035659154059895 + }, + { + "epoch": 4.22398123717385, + "grad_norm": 0.21253943761023864, + "learning_rate": 0.00027918805598888745, + "loss": 2.967909097671509, + "step": 7206, + "token_acc": 0.30134420280595947 + }, + { + "epoch": 4.224567575491059, + "grad_norm": 0.22766300462921712, + "learning_rate": 0.0002791806674654823, + "loss": 2.9439306259155273, + "step": 7207, + "token_acc": 0.3073599256295135 + }, + { + "epoch": 4.225153913808267, + "grad_norm": 0.25491453230249544, + "learning_rate": 0.00027917327772859887, + "loss": 2.9557056427001953, + "step": 7208, + "token_acc": 0.3049779122725648 + }, + { + "epoch": 4.225740252125476, + "grad_norm": 0.2573004461323963, + "learning_rate": 0.0002791658867783066, + "loss": 2.977468490600586, + "step": 7209, + "token_acc": 0.3020123432297224 + }, + { + "epoch": 4.226326590442685, + "grad_norm": 0.2215120293760408, + "learning_rate": 0.0002791584946146748, + "loss": 3.0005874633789062, + "step": 7210, + "token_acc": 0.29880392038076475 + }, + { + "epoch": 4.226912928759894, + "grad_norm": 0.24115393281733802, + "learning_rate": 0.00027915110123777305, + "loss": 2.941826343536377, + "step": 7211, + "token_acc": 0.3061504545988182 + }, + { + "epoch": 4.227499267077103, + "grad_norm": 0.23730474695811699, + "learning_rate": 0.0002791437066476707, + "loss": 2.9924192428588867, + "step": 7212, + "token_acc": 0.2998862739395954 + }, + { + "epoch": 4.2280856053943126, + "grad_norm": 0.22845052258412402, + "learning_rate": 0.00027913631084443725, + "loss": 3.0223019123077393, + "step": 7213, + "token_acc": 0.29430885224753484 + }, + { + "epoch": 4.228671943711522, + "grad_norm": 0.24559478199437917, + "learning_rate": 0.00027912891382814224, + "loss": 2.9465341567993164, + "step": 7214, + "token_acc": 0.305107394866758 + }, + { + "epoch": 4.229258282028731, + "grad_norm": 0.2209345299661073, + "learning_rate": 0.00027912151559885497, + "loss": 2.99446964263916, + "step": 7215, + "token_acc": 0.2982912379326657 + }, + { + "epoch": 4.22984462034594, + "grad_norm": 0.2436864376958213, + "learning_rate": 0.00027911411615664513, + "loss": 2.934150218963623, + "step": 7216, + "token_acc": 0.30776252456857206 + }, + { + "epoch": 4.230430958663149, + "grad_norm": 0.22593124510100204, + "learning_rate": 0.00027910671550158213, + "loss": 2.99104905128479, + "step": 7217, + "token_acc": 0.29994016169075055 + }, + { + "epoch": 4.231017296980358, + "grad_norm": 0.22568673634368636, + "learning_rate": 0.0002790993136337355, + "loss": 2.9484798908233643, + "step": 7218, + "token_acc": 0.3072376564076428 + }, + { + "epoch": 4.231603635297566, + "grad_norm": 0.2303993207447669, + "learning_rate": 0.0002790919105531748, + "loss": 2.9428210258483887, + "step": 7219, + "token_acc": 0.30737112087790025 + }, + { + "epoch": 4.2321899736147754, + "grad_norm": 0.23764369171518032, + "learning_rate": 0.0002790845062599696, + "loss": 2.9334921836853027, + "step": 7220, + "token_acc": 0.3069261854826334 + }, + { + "epoch": 4.232776311931985, + "grad_norm": 0.24173926010601354, + "learning_rate": 0.0002790771007541893, + "loss": 2.939941883087158, + "step": 7221, + "token_acc": 0.3068294808926423 + }, + { + "epoch": 4.233362650249194, + "grad_norm": 0.2181224970387667, + "learning_rate": 0.0002790696940359037, + "loss": 3.00750732421875, + "step": 7222, + "token_acc": 0.2968870361140061 + }, + { + "epoch": 4.233948988566403, + "grad_norm": 0.22004373046188294, + "learning_rate": 0.00027906228610518214, + "loss": 2.9417672157287598, + "step": 7223, + "token_acc": 0.30669800545696824 + }, + { + "epoch": 4.234535326883612, + "grad_norm": 0.2273454545502435, + "learning_rate": 0.0002790548769620944, + "loss": 2.986320972442627, + "step": 7224, + "token_acc": 0.3006915035351275 + }, + { + "epoch": 4.235121665200821, + "grad_norm": 0.223823389661935, + "learning_rate": 0.0002790474666067099, + "loss": 3.0056748390197754, + "step": 7225, + "token_acc": 0.2976590734859271 + }, + { + "epoch": 4.23570800351803, + "grad_norm": 0.24702666535264076, + "learning_rate": 0.0002790400550390984, + "loss": 2.9845900535583496, + "step": 7226, + "token_acc": 0.2994404467947563 + }, + { + "epoch": 4.236294341835239, + "grad_norm": 0.2642517644378435, + "learning_rate": 0.0002790326422593295, + "loss": 2.981661796569824, + "step": 7227, + "token_acc": 0.3007371138135661 + }, + { + "epoch": 4.236880680152448, + "grad_norm": 0.23498001984855038, + "learning_rate": 0.0002790252282674727, + "loss": 2.9488978385925293, + "step": 7228, + "token_acc": 0.3051641416870178 + }, + { + "epoch": 4.237467018469657, + "grad_norm": 0.2124376175250313, + "learning_rate": 0.00027901781306359784, + "loss": 2.955808639526367, + "step": 7229, + "token_acc": 0.3039293625494933 + }, + { + "epoch": 4.238053356786866, + "grad_norm": 0.2247504235429204, + "learning_rate": 0.00027901039664777447, + "loss": 2.962716817855835, + "step": 7230, + "token_acc": 0.30221665947802523 + }, + { + "epoch": 4.238639695104075, + "grad_norm": 0.22354858818037412, + "learning_rate": 0.00027900297902007224, + "loss": 2.996952533721924, + "step": 7231, + "token_acc": 0.30008518362245346 + }, + { + "epoch": 4.239226033421284, + "grad_norm": 0.23492827246659842, + "learning_rate": 0.0002789955601805609, + "loss": 2.987431526184082, + "step": 7232, + "token_acc": 0.3002526992878475 + }, + { + "epoch": 4.239812371738493, + "grad_norm": 0.23520890514516835, + "learning_rate": 0.00027898814012931, + "loss": 3.0119194984436035, + "step": 7233, + "token_acc": 0.29662859165010363 + }, + { + "epoch": 4.240398710055702, + "grad_norm": 0.23379002750282327, + "learning_rate": 0.0002789807188663894, + "loss": 2.95747447013855, + "step": 7234, + "token_acc": 0.30488665361578415 + }, + { + "epoch": 4.240985048372911, + "grad_norm": 0.23088098275481278, + "learning_rate": 0.00027897329639186874, + "loss": 3.0047826766967773, + "step": 7235, + "token_acc": 0.2974579828001486 + }, + { + "epoch": 4.24157138669012, + "grad_norm": 0.22960480116811235, + "learning_rate": 0.00027896587270581776, + "loss": 2.9464821815490723, + "step": 7236, + "token_acc": 0.30496864716104405 + }, + { + "epoch": 4.2421577250073295, + "grad_norm": 0.21414973851746352, + "learning_rate": 0.00027895844780830616, + "loss": 2.9867663383483887, + "step": 7237, + "token_acc": 0.29981873587978775 + }, + { + "epoch": 4.242744063324539, + "grad_norm": 0.2164449220096112, + "learning_rate": 0.00027895102169940377, + "loss": 2.9728453159332275, + "step": 7238, + "token_acc": 0.30284858364737266 + }, + { + "epoch": 4.243330401641748, + "grad_norm": 0.2225356668681225, + "learning_rate": 0.00027894359437918024, + "loss": 2.9414124488830566, + "step": 7239, + "token_acc": 0.30824104479213615 + }, + { + "epoch": 4.243916739958956, + "grad_norm": 0.2284371079919919, + "learning_rate": 0.00027893616584770544, + "loss": 3.0008625984191895, + "step": 7240, + "token_acc": 0.29724459425151384 + }, + { + "epoch": 4.244503078276165, + "grad_norm": 0.22869310209195204, + "learning_rate": 0.00027892873610504905, + "loss": 2.9359726905822754, + "step": 7241, + "token_acc": 0.30582687348291393 + }, + { + "epoch": 4.245089416593374, + "grad_norm": 0.23148401445140004, + "learning_rate": 0.000278921305151281, + "loss": 2.964212656021118, + "step": 7242, + "token_acc": 0.3033599232791892 + }, + { + "epoch": 4.245675754910583, + "grad_norm": 0.22815844596200105, + "learning_rate": 0.00027891387298647097, + "loss": 2.9286434650421143, + "step": 7243, + "token_acc": 0.3087061597720895 + }, + { + "epoch": 4.246262093227792, + "grad_norm": 0.22207631598123453, + "learning_rate": 0.00027890643961068877, + "loss": 2.952810287475586, + "step": 7244, + "token_acc": 0.3050997177411047 + }, + { + "epoch": 4.2468484315450015, + "grad_norm": 0.2661146352574905, + "learning_rate": 0.00027889900502400437, + "loss": 2.986868381500244, + "step": 7245, + "token_acc": 0.3014544516234521 + }, + { + "epoch": 4.247434769862211, + "grad_norm": 0.23686821287018786, + "learning_rate": 0.0002788915692264875, + "loss": 2.932485342025757, + "step": 7246, + "token_acc": 0.30744662968566144 + }, + { + "epoch": 4.24802110817942, + "grad_norm": 0.20720182121010983, + "learning_rate": 0.0002788841322182079, + "loss": 2.9421558380126953, + "step": 7247, + "token_acc": 0.30634722113655904 + }, + { + "epoch": 4.248607446496629, + "grad_norm": 0.23844848148599096, + "learning_rate": 0.00027887669399923563, + "loss": 2.964144706726074, + "step": 7248, + "token_acc": 0.30308403152306806 + }, + { + "epoch": 4.249193784813838, + "grad_norm": 0.23823319275182203, + "learning_rate": 0.0002788692545696405, + "loss": 2.984557628631592, + "step": 7249, + "token_acc": 0.30038957020330115 + }, + { + "epoch": 4.249780123131047, + "grad_norm": 0.21820367909968633, + "learning_rate": 0.00027886181392949237, + "loss": 2.9552369117736816, + "step": 7250, + "token_acc": 0.30631165800360666 + }, + { + "epoch": 4.250366461448255, + "grad_norm": 0.23392373874309083, + "learning_rate": 0.00027885437207886114, + "loss": 2.981729030609131, + "step": 7251, + "token_acc": 0.30055999457228 + }, + { + "epoch": 4.250952799765464, + "grad_norm": 0.24395941239362548, + "learning_rate": 0.0002788469290178167, + "loss": 2.9653823375701904, + "step": 7252, + "token_acc": 0.3043243330430014 + }, + { + "epoch": 4.2515391380826735, + "grad_norm": 0.20863186148278146, + "learning_rate": 0.00027883948474642894, + "loss": 2.9317467212677, + "step": 7253, + "token_acc": 0.30933163464800184 + }, + { + "epoch": 4.252125476399883, + "grad_norm": 0.2286143433917573, + "learning_rate": 0.00027883203926476794, + "loss": 2.953239917755127, + "step": 7254, + "token_acc": 0.30279217697509353 + }, + { + "epoch": 4.252711814717092, + "grad_norm": 0.22551397624086614, + "learning_rate": 0.0002788245925729035, + "loss": 2.9629886150360107, + "step": 7255, + "token_acc": 0.30401657474623883 + }, + { + "epoch": 4.253298153034301, + "grad_norm": 0.22584383182463144, + "learning_rate": 0.00027881714467090557, + "loss": 2.9899630546569824, + "step": 7256, + "token_acc": 0.29995232162095953 + }, + { + "epoch": 4.25388449135151, + "grad_norm": 0.2083256029583409, + "learning_rate": 0.00027880969555884417, + "loss": 2.937809467315674, + "step": 7257, + "token_acc": 0.3077331902040645 + }, + { + "epoch": 4.254470829668719, + "grad_norm": 0.20224647510583466, + "learning_rate": 0.00027880224523678924, + "loss": 2.9497933387756348, + "step": 7258, + "token_acc": 0.30434684458713757 + }, + { + "epoch": 4.255057167985928, + "grad_norm": 0.2148677563268644, + "learning_rate": 0.0002787947937048108, + "loss": 2.922703981399536, + "step": 7259, + "token_acc": 0.3098799733838566 + }, + { + "epoch": 4.255643506303137, + "grad_norm": 0.23026476398375495, + "learning_rate": 0.00027878734096297884, + "loss": 2.993525505065918, + "step": 7260, + "token_acc": 0.3003125417530108 + }, + { + "epoch": 4.256229844620346, + "grad_norm": 0.20487611822643104, + "learning_rate": 0.00027877988701136333, + "loss": 2.955353260040283, + "step": 7261, + "token_acc": 0.30489917629579855 + }, + { + "epoch": 4.256816182937555, + "grad_norm": 0.22828713198687642, + "learning_rate": 0.0002787724318500343, + "loss": 2.949578285217285, + "step": 7262, + "token_acc": 0.3047173797921336 + }, + { + "epoch": 4.257402521254764, + "grad_norm": 0.22663398999582224, + "learning_rate": 0.0002787649754790618, + "loss": 2.917224645614624, + "step": 7263, + "token_acc": 0.3088675227138123 + }, + { + "epoch": 4.257988859571973, + "grad_norm": 0.19943245199961812, + "learning_rate": 0.0002787575178985159, + "loss": 2.9855685234069824, + "step": 7264, + "token_acc": 0.29906549291957973 + }, + { + "epoch": 4.258575197889182, + "grad_norm": 0.22908655246498183, + "learning_rate": 0.0002787500591084666, + "loss": 3.0319266319274902, + "step": 7265, + "token_acc": 0.2923925356383243 + }, + { + "epoch": 4.259161536206391, + "grad_norm": 0.21915104013630266, + "learning_rate": 0.000278742599108984, + "loss": 2.9652533531188965, + "step": 7266, + "token_acc": 0.3042721274397338 + }, + { + "epoch": 4.2597478745236, + "grad_norm": 0.2412256988000664, + "learning_rate": 0.00027873513790013815, + "loss": 2.963923215866089, + "step": 7267, + "token_acc": 0.30165588410601957 + }, + { + "epoch": 4.260334212840809, + "grad_norm": 0.22216080646828737, + "learning_rate": 0.00027872767548199915, + "loss": 2.931894302368164, + "step": 7268, + "token_acc": 0.30872874136568446 + }, + { + "epoch": 4.260920551158018, + "grad_norm": 0.2477531655146537, + "learning_rate": 0.0002787202118546371, + "loss": 2.9958064556121826, + "step": 7269, + "token_acc": 0.29990436943764437 + }, + { + "epoch": 4.2615068894752275, + "grad_norm": 0.24460556786318674, + "learning_rate": 0.0002787127470181222, + "loss": 2.9284067153930664, + "step": 7270, + "token_acc": 0.3098318566918806 + }, + { + "epoch": 4.262093227792437, + "grad_norm": 0.2547147244752354, + "learning_rate": 0.00027870528097252435, + "loss": 2.9740731716156006, + "step": 7271, + "token_acc": 0.30190828301659534 + }, + { + "epoch": 4.262679566109645, + "grad_norm": 0.25850772824056945, + "learning_rate": 0.00027869781371791386, + "loss": 2.943971872329712, + "step": 7272, + "token_acc": 0.30662736551637637 + }, + { + "epoch": 4.263265904426854, + "grad_norm": 0.23606850652969488, + "learning_rate": 0.00027869034525436086, + "loss": 2.993342399597168, + "step": 7273, + "token_acc": 0.2990371072691021 + }, + { + "epoch": 4.263852242744063, + "grad_norm": 0.22186647149390218, + "learning_rate": 0.00027868287558193545, + "loss": 2.9822707176208496, + "step": 7274, + "token_acc": 0.30125332358581736 + }, + { + "epoch": 4.264438581061272, + "grad_norm": 0.23008549422646365, + "learning_rate": 0.0002786754047007078, + "loss": 2.9711735248565674, + "step": 7275, + "token_acc": 0.30232240041876335 + }, + { + "epoch": 4.265024919378481, + "grad_norm": 0.22419943437499343, + "learning_rate": 0.0002786679326107482, + "loss": 2.964339256286621, + "step": 7276, + "token_acc": 0.3022512960770707 + }, + { + "epoch": 4.26561125769569, + "grad_norm": 0.21518140203649397, + "learning_rate": 0.0002786604593121267, + "loss": 2.9791159629821777, + "step": 7277, + "token_acc": 0.30121050688327444 + }, + { + "epoch": 4.2661975960128995, + "grad_norm": 0.225399138157019, + "learning_rate": 0.0002786529848049136, + "loss": 2.9751439094543457, + "step": 7278, + "token_acc": 0.3029537244875206 + }, + { + "epoch": 4.266783934330109, + "grad_norm": 0.22864444401647632, + "learning_rate": 0.000278645509089179, + "loss": 2.9641313552856445, + "step": 7279, + "token_acc": 0.3045027526513189 + }, + { + "epoch": 4.267370272647318, + "grad_norm": 0.23008937451762368, + "learning_rate": 0.00027863803216499327, + "loss": 2.917933464050293, + "step": 7280, + "token_acc": 0.30946260614241233 + }, + { + "epoch": 4.267956610964527, + "grad_norm": 0.2318931699117132, + "learning_rate": 0.0002786305540324265, + "loss": 2.963479995727539, + "step": 7281, + "token_acc": 0.3040939977967562 + }, + { + "epoch": 4.268542949281736, + "grad_norm": 0.24804974801904991, + "learning_rate": 0.00027862307469154904, + "loss": 2.97214412689209, + "step": 7282, + "token_acc": 0.30295866653417086 + }, + { + "epoch": 4.269129287598945, + "grad_norm": 0.237380116889433, + "learning_rate": 0.0002786155941424312, + "loss": 2.9963736534118652, + "step": 7283, + "token_acc": 0.2986338783573182 + }, + { + "epoch": 4.269715625916153, + "grad_norm": 0.23486382594631494, + "learning_rate": 0.00027860811238514303, + "loss": 2.971968650817871, + "step": 7284, + "token_acc": 0.30230279928783255 + }, + { + "epoch": 4.270301964233362, + "grad_norm": 0.1976003320369681, + "learning_rate": 0.00027860062941975497, + "loss": 2.9394516944885254, + "step": 7285, + "token_acc": 0.30747536572174805 + }, + { + "epoch": 4.2708883025505715, + "grad_norm": 0.21471758595695353, + "learning_rate": 0.0002785931452463373, + "loss": 2.930746555328369, + "step": 7286, + "token_acc": 0.30949319097047884 + }, + { + "epoch": 4.271474640867781, + "grad_norm": 0.22125778815854652, + "learning_rate": 0.00027858565986496034, + "loss": 3.0177621841430664, + "step": 7287, + "token_acc": 0.2956276291303736 + }, + { + "epoch": 4.27206097918499, + "grad_norm": 0.22883807450084737, + "learning_rate": 0.0002785781732756944, + "loss": 2.9535980224609375, + "step": 7288, + "token_acc": 0.3053907066555652 + }, + { + "epoch": 4.272647317502199, + "grad_norm": 0.20984395787708307, + "learning_rate": 0.0002785706854786097, + "loss": 2.9449849128723145, + "step": 7289, + "token_acc": 0.3065855836787792 + }, + { + "epoch": 4.273233655819408, + "grad_norm": 0.19782005553295082, + "learning_rate": 0.00027856319647377676, + "loss": 2.9426798820495605, + "step": 7290, + "token_acc": 0.30734986011134674 + }, + { + "epoch": 4.273819994136617, + "grad_norm": 0.2227097223946743, + "learning_rate": 0.0002785557062612657, + "loss": 3.006328582763672, + "step": 7291, + "token_acc": 0.29785886160856884 + }, + { + "epoch": 4.274406332453826, + "grad_norm": 0.21645721491781605, + "learning_rate": 0.00027854821484114714, + "loss": 2.983450412750244, + "step": 7292, + "token_acc": 0.3009982144465579 + }, + { + "epoch": 4.274992670771035, + "grad_norm": 0.21341011490581083, + "learning_rate": 0.00027854072221349124, + "loss": 2.946321964263916, + "step": 7293, + "token_acc": 0.30640563941460425 + }, + { + "epoch": 4.2755790090882435, + "grad_norm": 0.2124940734635888, + "learning_rate": 0.0002785332283783685, + "loss": 2.9877591133117676, + "step": 7294, + "token_acc": 0.30071421637997536 + }, + { + "epoch": 4.276165347405453, + "grad_norm": 0.21017025760151842, + "learning_rate": 0.0002785257333358493, + "loss": 2.9772019386291504, + "step": 7295, + "token_acc": 0.3010421784892291 + }, + { + "epoch": 4.276751685722662, + "grad_norm": 0.2039433832680745, + "learning_rate": 0.000278518237086004, + "loss": 2.966921806335449, + "step": 7296, + "token_acc": 0.30404714548053496 + }, + { + "epoch": 4.277338024039871, + "grad_norm": 0.21323318688308443, + "learning_rate": 0.000278510739628903, + "loss": 2.979121685028076, + "step": 7297, + "token_acc": 0.3011762854357891 + }, + { + "epoch": 4.27792436235708, + "grad_norm": 0.22044668323003674, + "learning_rate": 0.00027850324096461684, + "loss": 2.981025457382202, + "step": 7298, + "token_acc": 0.3011268545545785 + }, + { + "epoch": 4.278510700674289, + "grad_norm": 0.22172839523627158, + "learning_rate": 0.00027849574109321584, + "loss": 2.9934585094451904, + "step": 7299, + "token_acc": 0.29860844590730595 + }, + { + "epoch": 4.279097038991498, + "grad_norm": 0.23812662664903442, + "learning_rate": 0.00027848824001477056, + "loss": 3.001340866088867, + "step": 7300, + "token_acc": 0.29800167548681067 + }, + { + "epoch": 4.279683377308707, + "grad_norm": 0.22277668044175938, + "learning_rate": 0.00027848073772935135, + "loss": 2.976656913757324, + "step": 7301, + "token_acc": 0.3005656102889608 + }, + { + "epoch": 4.280269715625916, + "grad_norm": 0.2705714537224366, + "learning_rate": 0.0002784732342370288, + "loss": 2.995959758758545, + "step": 7302, + "token_acc": 0.29987252276659626 + }, + { + "epoch": 4.2808560539431255, + "grad_norm": 0.2821550847432431, + "learning_rate": 0.0002784657295378733, + "loss": 2.9847006797790527, + "step": 7303, + "token_acc": 0.30008076464327166 + }, + { + "epoch": 4.281442392260335, + "grad_norm": 0.24712605674202043, + "learning_rate": 0.0002784582236319554, + "loss": 2.948005199432373, + "step": 7304, + "token_acc": 0.3051224773156948 + }, + { + "epoch": 4.282028730577543, + "grad_norm": 0.22568535406650034, + "learning_rate": 0.00027845071651934556, + "loss": 2.987281322479248, + "step": 7305, + "token_acc": 0.30122061608207706 + }, + { + "epoch": 4.282615068894752, + "grad_norm": 0.2239776251962893, + "learning_rate": 0.00027844320820011434, + "loss": 2.969425916671753, + "step": 7306, + "token_acc": 0.3028026140059335 + }, + { + "epoch": 4.283201407211961, + "grad_norm": 0.22080399525976876, + "learning_rate": 0.00027843569867433226, + "loss": 2.896232843399048, + "step": 7307, + "token_acc": 0.31288075926123193 + }, + { + "epoch": 4.28378774552917, + "grad_norm": 0.2411283814479587, + "learning_rate": 0.00027842818794206984, + "loss": 2.945919990539551, + "step": 7308, + "token_acc": 0.3070842161364597 + }, + { + "epoch": 4.284374083846379, + "grad_norm": 0.23859573016978153, + "learning_rate": 0.0002784206760033976, + "loss": 2.9758903980255127, + "step": 7309, + "token_acc": 0.30247424835760406 + }, + { + "epoch": 4.284960422163588, + "grad_norm": 0.2311983869189385, + "learning_rate": 0.00027841316285838626, + "loss": 2.987905979156494, + "step": 7310, + "token_acc": 0.29981592937180945 + }, + { + "epoch": 4.2855467604807975, + "grad_norm": 0.23898733621874915, + "learning_rate": 0.00027840564850710627, + "loss": 2.987511157989502, + "step": 7311, + "token_acc": 0.3014416090824508 + }, + { + "epoch": 4.286133098798007, + "grad_norm": 0.23849464929604225, + "learning_rate": 0.00027839813294962826, + "loss": 2.957730770111084, + "step": 7312, + "token_acc": 0.30425664510890504 + }, + { + "epoch": 4.286719437115216, + "grad_norm": 0.2387539661875173, + "learning_rate": 0.0002783906161860228, + "loss": 2.977391481399536, + "step": 7313, + "token_acc": 0.30073084904809166 + }, + { + "epoch": 4.287305775432425, + "grad_norm": 0.24129985214027794, + "learning_rate": 0.00027838309821636044, + "loss": 2.915719985961914, + "step": 7314, + "token_acc": 0.30988447809512765 + }, + { + "epoch": 4.287892113749633, + "grad_norm": 0.22217302071811537, + "learning_rate": 0.0002783755790407119, + "loss": 2.9591007232666016, + "step": 7315, + "token_acc": 0.30442186212578964 + }, + { + "epoch": 4.288478452066842, + "grad_norm": 0.22380145471505034, + "learning_rate": 0.0002783680586591478, + "loss": 2.9735379219055176, + "step": 7316, + "token_acc": 0.30108086560653646 + }, + { + "epoch": 4.289064790384051, + "grad_norm": 0.2046230388161708, + "learning_rate": 0.00027836053707173876, + "loss": 2.9954564571380615, + "step": 7317, + "token_acc": 0.30006228589224543 + }, + { + "epoch": 4.28965112870126, + "grad_norm": 0.21409488910118052, + "learning_rate": 0.0002783530142785555, + "loss": 2.9871108531951904, + "step": 7318, + "token_acc": 0.2998447123689215 + }, + { + "epoch": 4.2902374670184695, + "grad_norm": 0.20436121060337242, + "learning_rate": 0.0002783454902796686, + "loss": 2.946488380432129, + "step": 7319, + "token_acc": 0.3067118254532259 + }, + { + "epoch": 4.290823805335679, + "grad_norm": 0.23169151078990502, + "learning_rate": 0.00027833796507514873, + "loss": 2.9630203247070312, + "step": 7320, + "token_acc": 0.30242631383032725 + }, + { + "epoch": 4.291410143652888, + "grad_norm": 0.21334231936600517, + "learning_rate": 0.0002783304386650666, + "loss": 2.9706084728240967, + "step": 7321, + "token_acc": 0.3030002367417945 + }, + { + "epoch": 4.291996481970097, + "grad_norm": 0.2316141018543359, + "learning_rate": 0.00027832291104949296, + "loss": 2.9752144813537598, + "step": 7322, + "token_acc": 0.3008110250625038 + }, + { + "epoch": 4.292582820287306, + "grad_norm": 0.2615834246296336, + "learning_rate": 0.0002783153822284985, + "loss": 2.9859681129455566, + "step": 7323, + "token_acc": 0.2986965553450362 + }, + { + "epoch": 4.293169158604515, + "grad_norm": 0.22188157660718458, + "learning_rate": 0.00027830785220215394, + "loss": 2.9728899002075195, + "step": 7324, + "token_acc": 0.3009495345044804 + }, + { + "epoch": 4.293755496921724, + "grad_norm": 0.2416084010837058, + "learning_rate": 0.00027830032097053, + "loss": 2.9865312576293945, + "step": 7325, + "token_acc": 0.30182591342298154 + }, + { + "epoch": 4.294341835238933, + "grad_norm": 0.21036539713942493, + "learning_rate": 0.0002782927885336974, + "loss": 2.9353537559509277, + "step": 7326, + "token_acc": 0.3073467122813448 + }, + { + "epoch": 4.2949281735561415, + "grad_norm": 0.25259210961743245, + "learning_rate": 0.00027828525489172694, + "loss": 2.955595016479492, + "step": 7327, + "token_acc": 0.3042935244937878 + }, + { + "epoch": 4.295514511873351, + "grad_norm": 0.24078945864452275, + "learning_rate": 0.00027827772004468935, + "loss": 2.945195198059082, + "step": 7328, + "token_acc": 0.30498479728889133 + }, + { + "epoch": 4.29610085019056, + "grad_norm": 0.23694222141086502, + "learning_rate": 0.0002782701839926555, + "loss": 2.9610705375671387, + "step": 7329, + "token_acc": 0.3028518011038404 + }, + { + "epoch": 4.296687188507769, + "grad_norm": 0.2232397444615466, + "learning_rate": 0.00027826264673569607, + "loss": 2.9668021202087402, + "step": 7330, + "token_acc": 0.3037526501578789 + }, + { + "epoch": 4.297273526824978, + "grad_norm": 0.250861493532945, + "learning_rate": 0.0002782551082738819, + "loss": 2.9781816005706787, + "step": 7331, + "token_acc": 0.302098428924941 + }, + { + "epoch": 4.297859865142187, + "grad_norm": 0.2131331077271251, + "learning_rate": 0.0002782475686072839, + "loss": 3.0011372566223145, + "step": 7332, + "token_acc": 0.29978922929605795 + }, + { + "epoch": 4.298446203459396, + "grad_norm": 0.2251627245377774, + "learning_rate": 0.0002782400277359727, + "loss": 2.992305278778076, + "step": 7333, + "token_acc": 0.29960192859626633 + }, + { + "epoch": 4.299032541776605, + "grad_norm": 0.2121407170936576, + "learning_rate": 0.0002782324856600193, + "loss": 2.9771840572357178, + "step": 7334, + "token_acc": 0.3032398922691995 + }, + { + "epoch": 4.299618880093814, + "grad_norm": 0.23481351611464565, + "learning_rate": 0.00027822494237949447, + "loss": 2.947333812713623, + "step": 7335, + "token_acc": 0.3057150922575513 + }, + { + "epoch": 4.3002052184110235, + "grad_norm": 0.2100285194998613, + "learning_rate": 0.00027821739789446915, + "loss": 2.983891010284424, + "step": 7336, + "token_acc": 0.3008895345873477 + }, + { + "epoch": 4.300791556728232, + "grad_norm": 0.24065648264378237, + "learning_rate": 0.00027820985220501404, + "loss": 2.9675650596618652, + "step": 7337, + "token_acc": 0.30212896676995404 + }, + { + "epoch": 4.301377895045441, + "grad_norm": 0.21521295110547, + "learning_rate": 0.0002782023053112002, + "loss": 2.9489402770996094, + "step": 7338, + "token_acc": 0.3059314989663036 + }, + { + "epoch": 4.30196423336265, + "grad_norm": 0.22734593287269336, + "learning_rate": 0.0002781947572130985, + "loss": 2.9934773445129395, + "step": 7339, + "token_acc": 0.29964422708596794 + }, + { + "epoch": 4.302550571679859, + "grad_norm": 0.2348058353396603, + "learning_rate": 0.00027818720791077974, + "loss": 2.966704845428467, + "step": 7340, + "token_acc": 0.3029773835619652 + }, + { + "epoch": 4.303136909997068, + "grad_norm": 0.23283173819188377, + "learning_rate": 0.00027817965740431494, + "loss": 2.9984846115112305, + "step": 7341, + "token_acc": 0.300457831009196 + }, + { + "epoch": 4.303723248314277, + "grad_norm": 0.21888877581309213, + "learning_rate": 0.00027817210569377495, + "loss": 2.937060832977295, + "step": 7342, + "token_acc": 0.3052343242343718 + }, + { + "epoch": 4.304309586631486, + "grad_norm": 0.2348822215302708, + "learning_rate": 0.0002781645527792307, + "loss": 3.0079164505004883, + "step": 7343, + "token_acc": 0.2971665954844854 + }, + { + "epoch": 4.3048959249486956, + "grad_norm": 0.22196561702713719, + "learning_rate": 0.00027815699866075327, + "loss": 2.96844482421875, + "step": 7344, + "token_acc": 0.3014047989565411 + }, + { + "epoch": 4.305482263265905, + "grad_norm": 0.2188686153926745, + "learning_rate": 0.0002781494433384135, + "loss": 2.9802191257476807, + "step": 7345, + "token_acc": 0.3011067794897433 + }, + { + "epoch": 4.306068601583114, + "grad_norm": 0.22423033456171357, + "learning_rate": 0.0002781418868122824, + "loss": 2.942945718765259, + "step": 7346, + "token_acc": 0.3080456650084304 + }, + { + "epoch": 4.306654939900323, + "grad_norm": 0.22505982800605304, + "learning_rate": 0.00027813432908243093, + "loss": 2.987175464630127, + "step": 7347, + "token_acc": 0.2987289315968344 + }, + { + "epoch": 4.307241278217531, + "grad_norm": 0.23204469749502288, + "learning_rate": 0.00027812677014893005, + "loss": 2.9192981719970703, + "step": 7348, + "token_acc": 0.30930834370852067 + }, + { + "epoch": 4.30782761653474, + "grad_norm": 0.2403395445204015, + "learning_rate": 0.0002781192100118509, + "loss": 2.964359760284424, + "step": 7349, + "token_acc": 0.30255949195863746 + }, + { + "epoch": 4.308413954851949, + "grad_norm": 0.22107797698098264, + "learning_rate": 0.0002781116486712644, + "loss": 2.9808547496795654, + "step": 7350, + "token_acc": 0.30066191512033996 + }, + { + "epoch": 4.3090002931691584, + "grad_norm": 0.20901261991707945, + "learning_rate": 0.00027810408612724156, + "loss": 3.00819730758667, + "step": 7351, + "token_acc": 0.2967535104634807 + }, + { + "epoch": 4.309586631486368, + "grad_norm": 0.22095505895555012, + "learning_rate": 0.0002780965223798535, + "loss": 2.961474657058716, + "step": 7352, + "token_acc": 0.3033051107831632 + }, + { + "epoch": 4.310172969803577, + "grad_norm": 0.24821213232415576, + "learning_rate": 0.00027808895742917117, + "loss": 2.9418067932128906, + "step": 7353, + "token_acc": 0.30790142163198003 + }, + { + "epoch": 4.310759308120786, + "grad_norm": 0.2200656540801713, + "learning_rate": 0.0002780813912752657, + "loss": 2.9450721740722656, + "step": 7354, + "token_acc": 0.30597286341179813 + }, + { + "epoch": 4.311345646437995, + "grad_norm": 0.2693803743353312, + "learning_rate": 0.0002780738239182082, + "loss": 2.9657819271087646, + "step": 7355, + "token_acc": 0.30279379532613665 + }, + { + "epoch": 4.311931984755204, + "grad_norm": 0.25786773304675537, + "learning_rate": 0.00027806625535806964, + "loss": 2.9411869049072266, + "step": 7356, + "token_acc": 0.3075166060434362 + }, + { + "epoch": 4.312518323072413, + "grad_norm": 0.27531643325792154, + "learning_rate": 0.0002780586855949212, + "loss": 2.9410433769226074, + "step": 7357, + "token_acc": 0.3067419137556724 + }, + { + "epoch": 4.313104661389621, + "grad_norm": 0.2343223844376504, + "learning_rate": 0.000278051114628834, + "loss": 2.980503559112549, + "step": 7358, + "token_acc": 0.30229400685627567 + }, + { + "epoch": 4.3136909997068305, + "grad_norm": 0.2348211656729805, + "learning_rate": 0.0002780435424598791, + "loss": 2.992443561553955, + "step": 7359, + "token_acc": 0.2980131593921023 + }, + { + "epoch": 4.31427733802404, + "grad_norm": 0.26110453564782626, + "learning_rate": 0.00027803596908812766, + "loss": 2.976381778717041, + "step": 7360, + "token_acc": 0.2999567120887324 + }, + { + "epoch": 4.314863676341249, + "grad_norm": 0.2117826384776589, + "learning_rate": 0.00027802839451365085, + "loss": 2.993807554244995, + "step": 7361, + "token_acc": 0.29906718416100125 + }, + { + "epoch": 4.315450014658458, + "grad_norm": 0.23518032746324566, + "learning_rate": 0.0002780208187365198, + "loss": 2.9519076347351074, + "step": 7362, + "token_acc": 0.30601590033603804 + }, + { + "epoch": 4.316036352975667, + "grad_norm": 0.25441996874619666, + "learning_rate": 0.00027801324175680557, + "loss": 2.9919750690460205, + "step": 7363, + "token_acc": 0.29906050253471456 + }, + { + "epoch": 4.316622691292876, + "grad_norm": 0.24687620481151804, + "learning_rate": 0.00027800566357457957, + "loss": 2.943307876586914, + "step": 7364, + "token_acc": 0.3066570917981384 + }, + { + "epoch": 4.317209029610085, + "grad_norm": 0.2273454330674791, + "learning_rate": 0.00027799808418991275, + "loss": 2.946281671524048, + "step": 7365, + "token_acc": 0.3052848502009643 + }, + { + "epoch": 4.317795367927294, + "grad_norm": 0.23556441476814813, + "learning_rate": 0.00027799050360287645, + "loss": 2.9657628536224365, + "step": 7366, + "token_acc": 0.305032662660047 + }, + { + "epoch": 4.318381706244503, + "grad_norm": 0.22643915356520844, + "learning_rate": 0.00027798292181354186, + "loss": 2.999650239944458, + "step": 7367, + "token_acc": 0.2992400622026035 + }, + { + "epoch": 4.3189680445617125, + "grad_norm": 0.2338470667330291, + "learning_rate": 0.00027797533882198015, + "loss": 3.0167508125305176, + "step": 7368, + "token_acc": 0.29723444644694236 + }, + { + "epoch": 4.319554382878922, + "grad_norm": 0.21375324234429222, + "learning_rate": 0.0002779677546282626, + "loss": 2.9877071380615234, + "step": 7369, + "token_acc": 0.30012600839189907 + }, + { + "epoch": 4.32014072119613, + "grad_norm": 0.22903849293370163, + "learning_rate": 0.0002779601692324604, + "loss": 2.977485418319702, + "step": 7370, + "token_acc": 0.30106006889061915 + }, + { + "epoch": 4.320727059513339, + "grad_norm": 0.20910147767347473, + "learning_rate": 0.00027795258263464483, + "loss": 2.9834868907928467, + "step": 7371, + "token_acc": 0.30050895702484454 + }, + { + "epoch": 4.321313397830548, + "grad_norm": 0.23453350049545163, + "learning_rate": 0.0002779449948348872, + "loss": 2.9703733921051025, + "step": 7372, + "token_acc": 0.30355822390462284 + }, + { + "epoch": 4.321899736147757, + "grad_norm": 0.2193544860091229, + "learning_rate": 0.00027793740583325873, + "loss": 2.9763267040252686, + "step": 7373, + "token_acc": 0.303390925409172 + }, + { + "epoch": 4.322486074464966, + "grad_norm": 0.23228489021779758, + "learning_rate": 0.00027792981562983077, + "loss": 2.9279422760009766, + "step": 7374, + "token_acc": 0.30881881871538397 + }, + { + "epoch": 4.323072412782175, + "grad_norm": 0.20267274970124177, + "learning_rate": 0.0002779222242246745, + "loss": 2.954620361328125, + "step": 7375, + "token_acc": 0.30572977680098695 + }, + { + "epoch": 4.3236587510993845, + "grad_norm": 0.2321035668928613, + "learning_rate": 0.0002779146316178614, + "loss": 2.9720616340637207, + "step": 7376, + "token_acc": 0.30252231316845696 + }, + { + "epoch": 4.324245089416594, + "grad_norm": 0.22835241338947843, + "learning_rate": 0.0002779070378094627, + "loss": 2.9318339824676514, + "step": 7377, + "token_acc": 0.3092940731938383 + }, + { + "epoch": 4.324831427733803, + "grad_norm": 0.23255490398917625, + "learning_rate": 0.0002778994427995497, + "loss": 2.93733811378479, + "step": 7378, + "token_acc": 0.307819070500428 + }, + { + "epoch": 4.325417766051012, + "grad_norm": 0.24139845512536073, + "learning_rate": 0.00027789184658819375, + "loss": 2.964897632598877, + "step": 7379, + "token_acc": 0.30301182246915176 + }, + { + "epoch": 4.32600410436822, + "grad_norm": 0.21745012146193285, + "learning_rate": 0.0002778842491754663, + "loss": 2.9501442909240723, + "step": 7380, + "token_acc": 0.3059146553665377 + }, + { + "epoch": 4.326590442685429, + "grad_norm": 0.26590685634628747, + "learning_rate": 0.00027787665056143863, + "loss": 2.98331880569458, + "step": 7381, + "token_acc": 0.3008528977747945 + }, + { + "epoch": 4.327176781002638, + "grad_norm": 0.22630447677430207, + "learning_rate": 0.00027786905074618214, + "loss": 2.940601348876953, + "step": 7382, + "token_acc": 0.30701408181848666 + }, + { + "epoch": 4.327763119319847, + "grad_norm": 0.24180030021853766, + "learning_rate": 0.0002778614497297682, + "loss": 2.971580743789673, + "step": 7383, + "token_acc": 0.3031017198863976 + }, + { + "epoch": 4.3283494576370565, + "grad_norm": 0.256198705061953, + "learning_rate": 0.00027785384751226826, + "loss": 2.9886693954467773, + "step": 7384, + "token_acc": 0.30011776176667976 + }, + { + "epoch": 4.328935795954266, + "grad_norm": 0.23084185164371396, + "learning_rate": 0.0002778462440937537, + "loss": 2.943281650543213, + "step": 7385, + "token_acc": 0.3075881461805051 + }, + { + "epoch": 4.329522134271475, + "grad_norm": 0.259705052041424, + "learning_rate": 0.000277838639474296, + "loss": 2.9910545349121094, + "step": 7386, + "token_acc": 0.29857971492758956 + }, + { + "epoch": 4.330108472588684, + "grad_norm": 0.22980670248144583, + "learning_rate": 0.00027783103365396647, + "loss": 2.96281099319458, + "step": 7387, + "token_acc": 0.30309761370357646 + }, + { + "epoch": 4.330694810905893, + "grad_norm": 0.2538171105014315, + "learning_rate": 0.0002778234266328367, + "loss": 2.9270219802856445, + "step": 7388, + "token_acc": 0.30912486174856674 + }, + { + "epoch": 4.331281149223102, + "grad_norm": 0.22321323030159743, + "learning_rate": 0.00027781581841097803, + "loss": 2.990363359451294, + "step": 7389, + "token_acc": 0.30050385889853615 + }, + { + "epoch": 4.331867487540311, + "grad_norm": 0.2301215860981448, + "learning_rate": 0.000277808208988462, + "loss": 3.007373332977295, + "step": 7390, + "token_acc": 0.2982451211507469 + }, + { + "epoch": 4.33245382585752, + "grad_norm": 0.2540197235546708, + "learning_rate": 0.00027780059836536006, + "loss": 3.001340627670288, + "step": 7391, + "token_acc": 0.29750205859408857 + }, + { + "epoch": 4.3330401641747285, + "grad_norm": 0.25451663879962383, + "learning_rate": 0.0002777929865417437, + "loss": 2.9913687705993652, + "step": 7392, + "token_acc": 0.2992755859470171 + }, + { + "epoch": 4.333626502491938, + "grad_norm": 0.22696660805685323, + "learning_rate": 0.0002777853735176845, + "loss": 2.940615177154541, + "step": 7393, + "token_acc": 0.30760172046212164 + }, + { + "epoch": 4.334212840809147, + "grad_norm": 0.22745082429384703, + "learning_rate": 0.00027777775929325386, + "loss": 2.9625144004821777, + "step": 7394, + "token_acc": 0.30414949515763445 + }, + { + "epoch": 4.334799179126356, + "grad_norm": 0.24545190095920638, + "learning_rate": 0.00027777014386852337, + "loss": 2.9754865169525146, + "step": 7395, + "token_acc": 0.3020578035857749 + }, + { + "epoch": 4.335385517443565, + "grad_norm": 0.21689825822132533, + "learning_rate": 0.00027776252724356454, + "loss": 2.9564085006713867, + "step": 7396, + "token_acc": 0.3051753447249806 + }, + { + "epoch": 4.335971855760774, + "grad_norm": 0.2303311610623671, + "learning_rate": 0.0002777549094184489, + "loss": 2.9842045307159424, + "step": 7397, + "token_acc": 0.30146313197160657 + }, + { + "epoch": 4.336558194077983, + "grad_norm": 0.23134851953879887, + "learning_rate": 0.0002777472903932481, + "loss": 3.0256097316741943, + "step": 7398, + "token_acc": 0.29535930344914507 + }, + { + "epoch": 4.337144532395192, + "grad_norm": 0.21713772891746352, + "learning_rate": 0.00027773967016803363, + "loss": 3.0134687423706055, + "step": 7399, + "token_acc": 0.296860127986746 + }, + { + "epoch": 4.337730870712401, + "grad_norm": 0.23462640799459325, + "learning_rate": 0.000277732048742877, + "loss": 3.0174005031585693, + "step": 7400, + "token_acc": 0.2958578439338295 + }, + { + "epoch": 4.3383172090296105, + "grad_norm": 0.21605905590825009, + "learning_rate": 0.00027772442611785, + "loss": 2.9687488079071045, + "step": 7401, + "token_acc": 0.30237625037564936 + }, + { + "epoch": 4.338903547346819, + "grad_norm": 0.22110227960811124, + "learning_rate": 0.0002777168022930241, + "loss": 2.958282709121704, + "step": 7402, + "token_acc": 0.30418938930641826 + }, + { + "epoch": 4.339489885664028, + "grad_norm": 0.21255136898849908, + "learning_rate": 0.00027770917726847096, + "loss": 2.9841015338897705, + "step": 7403, + "token_acc": 0.3003835183824779 + }, + { + "epoch": 4.340076223981237, + "grad_norm": 0.2482624736248976, + "learning_rate": 0.0002777015510442622, + "loss": 2.9673478603363037, + "step": 7404, + "token_acc": 0.302219011249871 + }, + { + "epoch": 4.340662562298446, + "grad_norm": 0.2170379292591644, + "learning_rate": 0.00027769392362046936, + "loss": 2.954977035522461, + "step": 7405, + "token_acc": 0.3050576835777907 + }, + { + "epoch": 4.341248900615655, + "grad_norm": 0.2545965200666956, + "learning_rate": 0.00027768629499716425, + "loss": 3.034607410430908, + "step": 7406, + "token_acc": 0.29509697289497205 + }, + { + "epoch": 4.341835238932864, + "grad_norm": 0.23248874755300977, + "learning_rate": 0.00027767866517441843, + "loss": 2.96799898147583, + "step": 7407, + "token_acc": 0.30185934239637946 + }, + { + "epoch": 4.342421577250073, + "grad_norm": 0.24833632176455733, + "learning_rate": 0.0002776710341523036, + "loss": 2.943375587463379, + "step": 7408, + "token_acc": 0.3076517940450697 + }, + { + "epoch": 4.3430079155672825, + "grad_norm": 0.22263404272535434, + "learning_rate": 0.0002776634019308915, + "loss": 2.9831347465515137, + "step": 7409, + "token_acc": 0.29982119028556176 + }, + { + "epoch": 4.343594253884492, + "grad_norm": 0.2412797646195928, + "learning_rate": 0.0002776557685102537, + "loss": 2.980590343475342, + "step": 7410, + "token_acc": 0.30060337695208295 + }, + { + "epoch": 4.344180592201701, + "grad_norm": 0.22855144588357856, + "learning_rate": 0.000277648133890462, + "loss": 2.926447868347168, + "step": 7411, + "token_acc": 0.309918158514036 + }, + { + "epoch": 4.34476693051891, + "grad_norm": 0.23063405206132012, + "learning_rate": 0.0002776404980715881, + "loss": 2.9773662090301514, + "step": 7412, + "token_acc": 0.30156770481492695 + }, + { + "epoch": 4.345353268836118, + "grad_norm": 0.2194283262671779, + "learning_rate": 0.00027763286105370375, + "loss": 2.9674577713012695, + "step": 7413, + "token_acc": 0.3016774078096767 + }, + { + "epoch": 4.345939607153327, + "grad_norm": 0.22787634998482156, + "learning_rate": 0.00027762522283688055, + "loss": 2.9720420837402344, + "step": 7414, + "token_acc": 0.30222894625037566 + }, + { + "epoch": 4.346525945470536, + "grad_norm": 0.23791883417456025, + "learning_rate": 0.00027761758342119045, + "loss": 2.940985679626465, + "step": 7415, + "token_acc": 0.30664660824390766 + }, + { + "epoch": 4.347112283787745, + "grad_norm": 0.21748616358476272, + "learning_rate": 0.00027760994280670505, + "loss": 2.9615931510925293, + "step": 7416, + "token_acc": 0.3037087302102096 + }, + { + "epoch": 4.3476986221049545, + "grad_norm": 0.201574760546558, + "learning_rate": 0.00027760230099349624, + "loss": 2.9271907806396484, + "step": 7417, + "token_acc": 0.30948560568295114 + }, + { + "epoch": 4.348284960422164, + "grad_norm": 0.2287733560312696, + "learning_rate": 0.0002775946579816358, + "loss": 2.934257984161377, + "step": 7418, + "token_acc": 0.30816977468440215 + }, + { + "epoch": 4.348871298739373, + "grad_norm": 0.2256953707389003, + "learning_rate": 0.00027758701377119543, + "loss": 2.965078115463257, + "step": 7419, + "token_acc": 0.3042641385648486 + }, + { + "epoch": 4.349457637056582, + "grad_norm": 0.22809543565280077, + "learning_rate": 0.000277579368362247, + "loss": 2.9597039222717285, + "step": 7420, + "token_acc": 0.30339300280616355 + }, + { + "epoch": 4.350043975373791, + "grad_norm": 0.2382039064282838, + "learning_rate": 0.0002775717217548623, + "loss": 2.9811854362487793, + "step": 7421, + "token_acc": 0.30339154109121474 + }, + { + "epoch": 4.350630313691, + "grad_norm": 0.2111265142200861, + "learning_rate": 0.0002775640739491132, + "loss": 2.952059745788574, + "step": 7422, + "token_acc": 0.3043125769560128 + }, + { + "epoch": 4.351216652008208, + "grad_norm": 0.20072847261091104, + "learning_rate": 0.0002775564249450715, + "loss": 2.959197521209717, + "step": 7423, + "token_acc": 0.3046208074021039 + }, + { + "epoch": 4.351802990325417, + "grad_norm": 0.21593366982878148, + "learning_rate": 0.0002775487747428091, + "loss": 3.0046238899230957, + "step": 7424, + "token_acc": 0.2968203871823361 + }, + { + "epoch": 4.3523893286426265, + "grad_norm": 0.22169574168172437, + "learning_rate": 0.00027754112334239784, + "loss": 2.9432876110076904, + "step": 7425, + "token_acc": 0.30547081187589065 + }, + { + "epoch": 4.352975666959836, + "grad_norm": 0.234638050073075, + "learning_rate": 0.00027753347074390957, + "loss": 2.998220443725586, + "step": 7426, + "token_acc": 0.29888128875535386 + }, + { + "epoch": 4.353562005277045, + "grad_norm": 0.22459040539790787, + "learning_rate": 0.0002775258169474162, + "loss": 2.96234393119812, + "step": 7427, + "token_acc": 0.30433893761111297 + }, + { + "epoch": 4.354148343594254, + "grad_norm": 0.214610552416326, + "learning_rate": 0.0002775181619529897, + "loss": 2.9591174125671387, + "step": 7428, + "token_acc": 0.3064812540985155 + }, + { + "epoch": 4.354734681911463, + "grad_norm": 0.24878504339242488, + "learning_rate": 0.0002775105057607018, + "loss": 2.9945285320281982, + "step": 7429, + "token_acc": 0.298331882642663 + }, + { + "epoch": 4.355321020228672, + "grad_norm": 0.224617709020674, + "learning_rate": 0.0002775028483706245, + "loss": 2.933229923248291, + "step": 7430, + "token_acc": 0.30825603714236904 + }, + { + "epoch": 4.355907358545881, + "grad_norm": 0.22527142763040917, + "learning_rate": 0.0002774951897828298, + "loss": 2.9705281257629395, + "step": 7431, + "token_acc": 0.3038445549418809 + }, + { + "epoch": 4.35649369686309, + "grad_norm": 0.2318575799599823, + "learning_rate": 0.0002774875299973896, + "loss": 2.9466748237609863, + "step": 7432, + "token_acc": 0.306039786318581 + }, + { + "epoch": 4.357080035180299, + "grad_norm": 0.22725220371951266, + "learning_rate": 0.00027747986901437583, + "loss": 2.9579403400421143, + "step": 7433, + "token_acc": 0.3048014810671142 + }, + { + "epoch": 4.3576663734975085, + "grad_norm": 0.2214610566303656, + "learning_rate": 0.0002774722068338605, + "loss": 2.927346706390381, + "step": 7434, + "token_acc": 0.3083878973838054 + }, + { + "epoch": 4.358252711814717, + "grad_norm": 0.2190755845403344, + "learning_rate": 0.0002774645434559156, + "loss": 2.982912063598633, + "step": 7435, + "token_acc": 0.30004665039883527 + }, + { + "epoch": 4.358839050131926, + "grad_norm": 0.23995108663628126, + "learning_rate": 0.0002774568788806129, + "loss": 2.996185779571533, + "step": 7436, + "token_acc": 0.29866878092373417 + }, + { + "epoch": 4.359425388449135, + "grad_norm": 0.23559409635806414, + "learning_rate": 0.00027744921310802475, + "loss": 2.957252025604248, + "step": 7437, + "token_acc": 0.30422326692813173 + }, + { + "epoch": 4.360011726766344, + "grad_norm": 0.204677043305365, + "learning_rate": 0.00027744154613822293, + "loss": 2.9237399101257324, + "step": 7438, + "token_acc": 0.30876550498994726 + }, + { + "epoch": 4.360598065083553, + "grad_norm": 0.2344583095496871, + "learning_rate": 0.0002774338779712795, + "loss": 3.0163698196411133, + "step": 7439, + "token_acc": 0.2965866504076151 + }, + { + "epoch": 4.361184403400762, + "grad_norm": 0.24222907083088832, + "learning_rate": 0.0002774262086072665, + "loss": 2.9362940788269043, + "step": 7440, + "token_acc": 0.30632696390658176 + }, + { + "epoch": 4.361770741717971, + "grad_norm": 0.23261026021506248, + "learning_rate": 0.00027741853804625597, + "loss": 2.9780633449554443, + "step": 7441, + "token_acc": 0.3022428185707544 + }, + { + "epoch": 4.3623570800351805, + "grad_norm": 0.22953967202096529, + "learning_rate": 0.00027741086628832, + "loss": 2.964160442352295, + "step": 7442, + "token_acc": 0.3060982892103967 + }, + { + "epoch": 4.36294341835239, + "grad_norm": 0.2281486530179143, + "learning_rate": 0.0002774031933335306, + "loss": 2.9509661197662354, + "step": 7443, + "token_acc": 0.3050236734011309 + }, + { + "epoch": 4.363529756669599, + "grad_norm": 0.2318249755996522, + "learning_rate": 0.0002773955191819599, + "loss": 3.018233299255371, + "step": 7444, + "token_acc": 0.2970305431364126 + }, + { + "epoch": 4.364116094986807, + "grad_norm": 0.21770199107073115, + "learning_rate": 0.00027738784383368, + "loss": 2.954418897628784, + "step": 7445, + "token_acc": 0.30453840145245736 + }, + { + "epoch": 4.364702433304016, + "grad_norm": 0.2195995871176992, + "learning_rate": 0.0002773801672887629, + "loss": 2.9593656063079834, + "step": 7446, + "token_acc": 0.303345385512749 + }, + { + "epoch": 4.365288771621225, + "grad_norm": 0.22544094044324695, + "learning_rate": 0.0002773724895472808, + "loss": 2.960944175720215, + "step": 7447, + "token_acc": 0.3034861996316368 + }, + { + "epoch": 4.365875109938434, + "grad_norm": 0.24358085726752657, + "learning_rate": 0.0002773648106093058, + "loss": 2.972194194793701, + "step": 7448, + "token_acc": 0.3016396603430441 + }, + { + "epoch": 4.366461448255643, + "grad_norm": 0.2288713130325665, + "learning_rate": 0.00027735713047491006, + "loss": 2.95180606842041, + "step": 7449, + "token_acc": 0.30419559876081614 + }, + { + "epoch": 4.3670477865728525, + "grad_norm": 0.2423433430253848, + "learning_rate": 0.0002773494491441657, + "loss": 3.0050132274627686, + "step": 7450, + "token_acc": 0.29752068220070327 + }, + { + "epoch": 4.367634124890062, + "grad_norm": 0.23619046077048764, + "learning_rate": 0.0002773417666171448, + "loss": 2.94372296333313, + "step": 7451, + "token_acc": 0.30577219476171974 + }, + { + "epoch": 4.368220463207271, + "grad_norm": 0.2162855826483245, + "learning_rate": 0.0002773340828939196, + "loss": 2.990633964538574, + "step": 7452, + "token_acc": 0.2999063442099663 + }, + { + "epoch": 4.36880680152448, + "grad_norm": 0.22679060865010944, + "learning_rate": 0.00027732639797456237, + "loss": 2.9583752155303955, + "step": 7453, + "token_acc": 0.3064472633494351 + }, + { + "epoch": 4.369393139841689, + "grad_norm": 0.22268278117379317, + "learning_rate": 0.00027731871185914507, + "loss": 2.8930678367614746, + "step": 7454, + "token_acc": 0.3128569183213706 + }, + { + "epoch": 4.369979478158898, + "grad_norm": 0.20627290022283126, + "learning_rate": 0.0002773110245477401, + "loss": 2.9686548709869385, + "step": 7455, + "token_acc": 0.3019881277678319 + }, + { + "epoch": 4.370565816476106, + "grad_norm": 0.23534461078160235, + "learning_rate": 0.0002773033360404197, + "loss": 2.995039224624634, + "step": 7456, + "token_acc": 0.2989590179017376 + }, + { + "epoch": 4.371152154793315, + "grad_norm": 0.2142829635963942, + "learning_rate": 0.0002772956463372559, + "loss": 2.9501309394836426, + "step": 7457, + "token_acc": 0.3040427353609053 + }, + { + "epoch": 4.3717384931105245, + "grad_norm": 0.2281564945966076, + "learning_rate": 0.00027728795543832105, + "loss": 2.9776997566223145, + "step": 7458, + "token_acc": 0.3026790452350195 + }, + { + "epoch": 4.372324831427734, + "grad_norm": 0.204093352012456, + "learning_rate": 0.0002772802633436874, + "loss": 2.994493246078491, + "step": 7459, + "token_acc": 0.2988305146205238 + }, + { + "epoch": 4.372911169744943, + "grad_norm": 0.23269081759733368, + "learning_rate": 0.00027727257005342716, + "loss": 2.996680736541748, + "step": 7460, + "token_acc": 0.2991457458456698 + }, + { + "epoch": 4.373497508062152, + "grad_norm": 0.23568474628698902, + "learning_rate": 0.00027726487556761266, + "loss": 2.9784648418426514, + "step": 7461, + "token_acc": 0.3031310895048818 + }, + { + "epoch": 4.374083846379361, + "grad_norm": 0.2150090003628934, + "learning_rate": 0.0002772571798863161, + "loss": 2.940512180328369, + "step": 7462, + "token_acc": 0.3062648920551638 + }, + { + "epoch": 4.37467018469657, + "grad_norm": 0.26972538459129053, + "learning_rate": 0.0002772494830096099, + "loss": 2.985027551651001, + "step": 7463, + "token_acc": 0.300904702939021 + }, + { + "epoch": 4.375256523013779, + "grad_norm": 0.2663370815412667, + "learning_rate": 0.0002772417849375662, + "loss": 2.9784159660339355, + "step": 7464, + "token_acc": 0.30264356361376327 + }, + { + "epoch": 4.375842861330988, + "grad_norm": 0.2435779929801738, + "learning_rate": 0.0002772340856702574, + "loss": 3.0186824798583984, + "step": 7465, + "token_acc": 0.29527416789536015 + }, + { + "epoch": 4.3764291996481965, + "grad_norm": 0.26859645515761915, + "learning_rate": 0.0002772263852077558, + "loss": 2.932619571685791, + "step": 7466, + "token_acc": 0.3071234581724347 + }, + { + "epoch": 4.377015537965406, + "grad_norm": 0.28069880953968096, + "learning_rate": 0.00027721868355013384, + "loss": 2.9932851791381836, + "step": 7467, + "token_acc": 0.2988944842948627 + }, + { + "epoch": 4.377601876282615, + "grad_norm": 0.23215340541729915, + "learning_rate": 0.0002772109806974637, + "loss": 2.9450325965881348, + "step": 7468, + "token_acc": 0.30598153291034036 + }, + { + "epoch": 4.378188214599824, + "grad_norm": 0.25374207991843667, + "learning_rate": 0.0002772032766498178, + "loss": 2.953277826309204, + "step": 7469, + "token_acc": 0.3047969143655994 + }, + { + "epoch": 4.378774552917033, + "grad_norm": 0.24068130144488842, + "learning_rate": 0.00027719557140726855, + "loss": 2.99070405960083, + "step": 7470, + "token_acc": 0.3008341777838137 + }, + { + "epoch": 4.379360891234242, + "grad_norm": 0.232580553209548, + "learning_rate": 0.00027718786496988833, + "loss": 2.997485876083374, + "step": 7471, + "token_acc": 0.29904054520900725 + }, + { + "epoch": 4.379947229551451, + "grad_norm": 0.2544280132668076, + "learning_rate": 0.0002771801573377495, + "loss": 3.005141496658325, + "step": 7472, + "token_acc": 0.29849009195101084 + }, + { + "epoch": 4.38053356786866, + "grad_norm": 0.2537585247026658, + "learning_rate": 0.0002771724485109244, + "loss": 2.9789953231811523, + "step": 7473, + "token_acc": 0.3014262373295165 + }, + { + "epoch": 4.381119906185869, + "grad_norm": 0.22469715953060296, + "learning_rate": 0.0002771647384894856, + "loss": 2.956080198287964, + "step": 7474, + "token_acc": 0.30495235642544427 + }, + { + "epoch": 4.3817062445030786, + "grad_norm": 0.25561779959492037, + "learning_rate": 0.00027715702727350544, + "loss": 2.9696555137634277, + "step": 7475, + "token_acc": 0.30262395431653916 + }, + { + "epoch": 4.382292582820288, + "grad_norm": 0.22094482548707126, + "learning_rate": 0.00027714931486305634, + "loss": 2.9638609886169434, + "step": 7476, + "token_acc": 0.3045287308109564 + }, + { + "epoch": 4.382878921137497, + "grad_norm": 0.2533397314274439, + "learning_rate": 0.0002771416012582107, + "loss": 2.99871826171875, + "step": 7477, + "token_acc": 0.2983464296108157 + }, + { + "epoch": 4.383465259454705, + "grad_norm": 0.24903799324657266, + "learning_rate": 0.00027713388645904115, + "loss": 2.9663758277893066, + "step": 7478, + "token_acc": 0.30292466765140325 + }, + { + "epoch": 4.384051597771914, + "grad_norm": 0.23093484256519903, + "learning_rate": 0.00027712617046561996, + "loss": 2.9815616607666016, + "step": 7479, + "token_acc": 0.3024605479756684 + }, + { + "epoch": 4.384637936089123, + "grad_norm": 0.2445199063815977, + "learning_rate": 0.00027711845327801975, + "loss": 2.9700050354003906, + "step": 7480, + "token_acc": 0.3027628589951036 + }, + { + "epoch": 4.385224274406332, + "grad_norm": 0.24360959569548238, + "learning_rate": 0.0002771107348963129, + "loss": 2.9615426063537598, + "step": 7481, + "token_acc": 0.3042609888247472 + }, + { + "epoch": 4.3858106127235414, + "grad_norm": 0.20756497406590646, + "learning_rate": 0.0002771030153205721, + "loss": 2.9506173133850098, + "step": 7482, + "token_acc": 0.304636542340389 + }, + { + "epoch": 4.386396951040751, + "grad_norm": 0.26152720128034224, + "learning_rate": 0.00027709529455086963, + "loss": 3.003720760345459, + "step": 7483, + "token_acc": 0.29753391982993926 + }, + { + "epoch": 4.38698328935796, + "grad_norm": 0.2301045457204367, + "learning_rate": 0.0002770875725872782, + "loss": 2.9745092391967773, + "step": 7484, + "token_acc": 0.3021710119792505 + }, + { + "epoch": 4.387569627675169, + "grad_norm": 0.22519656736733146, + "learning_rate": 0.00027707984942987025, + "loss": 2.9724268913269043, + "step": 7485, + "token_acc": 0.30140923225110716 + }, + { + "epoch": 4.388155965992378, + "grad_norm": 0.21959180863653102, + "learning_rate": 0.0002770721250787184, + "loss": 2.972655773162842, + "step": 7486, + "token_acc": 0.3010668328258583 + }, + { + "epoch": 4.388742304309587, + "grad_norm": 0.2297185154631925, + "learning_rate": 0.00027706439953389505, + "loss": 2.936591625213623, + "step": 7487, + "token_acc": 0.30795136874307716 + }, + { + "epoch": 4.389328642626795, + "grad_norm": 0.21061868400224334, + "learning_rate": 0.000277056672795473, + "loss": 2.9652774333953857, + "step": 7488, + "token_acc": 0.3029329078727312 + }, + { + "epoch": 4.389914980944004, + "grad_norm": 0.23281434234550993, + "learning_rate": 0.00027704894486352467, + "loss": 2.997745990753174, + "step": 7489, + "token_acc": 0.29790875569524516 + }, + { + "epoch": 4.3905013192612135, + "grad_norm": 0.21758261932722223, + "learning_rate": 0.00027704121573812274, + "loss": 2.9508378505706787, + "step": 7490, + "token_acc": 0.30605061692262664 + }, + { + "epoch": 4.391087657578423, + "grad_norm": 0.21031241634625122, + "learning_rate": 0.0002770334854193397, + "loss": 2.9566893577575684, + "step": 7491, + "token_acc": 0.30436006861205256 + }, + { + "epoch": 4.391673995895632, + "grad_norm": 0.20961391120017078, + "learning_rate": 0.0002770257539072483, + "loss": 2.9746947288513184, + "step": 7492, + "token_acc": 0.3017601489854248 + }, + { + "epoch": 4.392260334212841, + "grad_norm": 0.21282074640042992, + "learning_rate": 0.00027701802120192116, + "loss": 3.0275769233703613, + "step": 7493, + "token_acc": 0.2958204629120298 + }, + { + "epoch": 4.39284667253005, + "grad_norm": 0.20269921259214754, + "learning_rate": 0.00027701028730343083, + "loss": 2.9803223609924316, + "step": 7494, + "token_acc": 0.3027306737346366 + }, + { + "epoch": 4.393433010847259, + "grad_norm": 0.22014904650462777, + "learning_rate": 0.00027700255221184997, + "loss": 2.987946033477783, + "step": 7495, + "token_acc": 0.2998152685157284 + }, + { + "epoch": 4.394019349164468, + "grad_norm": 0.21237112595335825, + "learning_rate": 0.0002769948159272513, + "loss": 2.949683666229248, + "step": 7496, + "token_acc": 0.3058696576139103 + }, + { + "epoch": 4.394605687481677, + "grad_norm": 0.22197503913255287, + "learning_rate": 0.0002769870784497074, + "loss": 3.0350022315979004, + "step": 7497, + "token_acc": 0.29369391797547273 + }, + { + "epoch": 4.395192025798886, + "grad_norm": 0.21981970771336845, + "learning_rate": 0.00027697933977929113, + "loss": 2.9578042030334473, + "step": 7498, + "token_acc": 0.304411668239319 + }, + { + "epoch": 4.395778364116095, + "grad_norm": 0.2187988767314944, + "learning_rate": 0.00027697159991607503, + "loss": 2.958164691925049, + "step": 7499, + "token_acc": 0.3048302804243249 + }, + { + "epoch": 4.396364702433304, + "grad_norm": 0.24020584678510976, + "learning_rate": 0.00027696385886013175, + "loss": 2.970421552658081, + "step": 7500, + "token_acc": 0.30253240723930264 + }, + { + "epoch": 4.396951040750513, + "grad_norm": 0.22652073223603636, + "learning_rate": 0.0002769561166115342, + "loss": 2.9632935523986816, + "step": 7501, + "token_acc": 0.3035878540114575 + }, + { + "epoch": 4.397537379067722, + "grad_norm": 0.23870226651248763, + "learning_rate": 0.000276948373170355, + "loss": 2.9731171131134033, + "step": 7502, + "token_acc": 0.30197230253370316 + }, + { + "epoch": 4.398123717384931, + "grad_norm": 0.22826463125813395, + "learning_rate": 0.0002769406285366669, + "loss": 3.00234317779541, + "step": 7503, + "token_acc": 0.29734018917031674 + }, + { + "epoch": 4.39871005570214, + "grad_norm": 0.25205782074025596, + "learning_rate": 0.0002769328827105426, + "loss": 2.9797816276550293, + "step": 7504, + "token_acc": 0.30147632564710763 + }, + { + "epoch": 4.399296394019349, + "grad_norm": 0.22199846075781993, + "learning_rate": 0.00027692513569205495, + "loss": 3.005668878555298, + "step": 7505, + "token_acc": 0.29716366875843364 + }, + { + "epoch": 4.399882732336558, + "grad_norm": 0.2178636745220201, + "learning_rate": 0.00027691738748127667, + "loss": 2.934154510498047, + "step": 7506, + "token_acc": 0.30812018335557584 + }, + { + "epoch": 4.4004690706537675, + "grad_norm": 0.2371901276913139, + "learning_rate": 0.0002769096380782806, + "loss": 2.987424373626709, + "step": 7507, + "token_acc": 0.30062316377481785 + }, + { + "epoch": 4.401055408970977, + "grad_norm": 0.20285393410668617, + "learning_rate": 0.00027690188748313947, + "loss": 2.9692342281341553, + "step": 7508, + "token_acc": 0.3020862783589295 + }, + { + "epoch": 4.401641747288186, + "grad_norm": 0.24882034699699654, + "learning_rate": 0.00027689413569592604, + "loss": 2.9444217681884766, + "step": 7509, + "token_acc": 0.3073822888057929 + }, + { + "epoch": 4.402228085605394, + "grad_norm": 0.21646145312417683, + "learning_rate": 0.0002768863827167133, + "loss": 3.0028581619262695, + "step": 7510, + "token_acc": 0.29734004889714233 + }, + { + "epoch": 4.402814423922603, + "grad_norm": 0.24082268430187542, + "learning_rate": 0.0002768786285455739, + "loss": 3.0078725814819336, + "step": 7511, + "token_acc": 0.2973279083916388 + }, + { + "epoch": 4.403400762239812, + "grad_norm": 0.24430083717649023, + "learning_rate": 0.0002768708731825808, + "loss": 3.0146965980529785, + "step": 7512, + "token_acc": 0.2969611118535347 + }, + { + "epoch": 4.403987100557021, + "grad_norm": 0.23385516543001894, + "learning_rate": 0.00027686311662780677, + "loss": 2.9984161853790283, + "step": 7513, + "token_acc": 0.29922065550775157 + }, + { + "epoch": 4.40457343887423, + "grad_norm": 0.2579148687184216, + "learning_rate": 0.00027685535888132473, + "loss": 2.9890799522399902, + "step": 7514, + "token_acc": 0.3000434157693625 + }, + { + "epoch": 4.4051597771914395, + "grad_norm": 0.23298779472306833, + "learning_rate": 0.00027684759994320757, + "loss": 2.9461140632629395, + "step": 7515, + "token_acc": 0.3055497946824837 + }, + { + "epoch": 4.405746115508649, + "grad_norm": 0.22719467724029815, + "learning_rate": 0.0002768398398135281, + "loss": 2.909564733505249, + "step": 7516, + "token_acc": 0.3131046474865634 + }, + { + "epoch": 4.406332453825858, + "grad_norm": 0.24622822454034474, + "learning_rate": 0.0002768320784923593, + "loss": 2.9649710655212402, + "step": 7517, + "token_acc": 0.303135324305619 + }, + { + "epoch": 4.406918792143067, + "grad_norm": 0.25064053504721323, + "learning_rate": 0.0002768243159797739, + "loss": 2.9926137924194336, + "step": 7518, + "token_acc": 0.3000078571917383 + }, + { + "epoch": 4.407505130460276, + "grad_norm": 0.23487473205149595, + "learning_rate": 0.0002768165522758451, + "loss": 2.984139919281006, + "step": 7519, + "token_acc": 0.2998250008400994 + }, + { + "epoch": 4.408091468777485, + "grad_norm": 0.22706378778188618, + "learning_rate": 0.0002768087873806456, + "loss": 2.9717254638671875, + "step": 7520, + "token_acc": 0.30067460601170476 + }, + { + "epoch": 4.408677807094693, + "grad_norm": 0.2519205363206708, + "learning_rate": 0.00027680102129424845, + "loss": 2.9655544757843018, + "step": 7521, + "token_acc": 0.30559878244979316 + }, + { + "epoch": 4.409264145411902, + "grad_norm": 0.23705315451401202, + "learning_rate": 0.00027679325401672655, + "loss": 3.027557849884033, + "step": 7522, + "token_acc": 0.294660130202156 + }, + { + "epoch": 4.4098504837291115, + "grad_norm": 0.23066611938084383, + "learning_rate": 0.0002767854855481529, + "loss": 2.989354133605957, + "step": 7523, + "token_acc": 0.2996040429570662 + }, + { + "epoch": 4.410436822046321, + "grad_norm": 0.21936123463566679, + "learning_rate": 0.00027677771588860043, + "loss": 2.992600202560425, + "step": 7524, + "token_acc": 0.2995475739808873 + }, + { + "epoch": 4.41102316036353, + "grad_norm": 0.23133323323445437, + "learning_rate": 0.0002767699450381422, + "loss": 2.973036766052246, + "step": 7525, + "token_acc": 0.30246245164979757 + }, + { + "epoch": 4.411609498680739, + "grad_norm": 0.228026485996696, + "learning_rate": 0.0002767621729968511, + "loss": 2.9853219985961914, + "step": 7526, + "token_acc": 0.3000444804939951 + }, + { + "epoch": 4.412195836997948, + "grad_norm": 0.22741291083716406, + "learning_rate": 0.00027675439976480024, + "loss": 2.975165367126465, + "step": 7527, + "token_acc": 0.3012296766562978 + }, + { + "epoch": 4.412782175315157, + "grad_norm": 0.22552353828133287, + "learning_rate": 0.0002767466253420626, + "loss": 2.9211790561676025, + "step": 7528, + "token_acc": 0.3113293590472288 + }, + { + "epoch": 4.413368513632366, + "grad_norm": 0.22311316953967944, + "learning_rate": 0.00027673884972871123, + "loss": 2.9724016189575195, + "step": 7529, + "token_acc": 0.3032933108295768 + }, + { + "epoch": 4.413954851949575, + "grad_norm": 0.22808560315325163, + "learning_rate": 0.00027673107292481913, + "loss": 3.0021939277648926, + "step": 7530, + "token_acc": 0.2990817141336172 + }, + { + "epoch": 4.4145411902667835, + "grad_norm": 0.22394169180736145, + "learning_rate": 0.0002767232949304593, + "loss": 2.9701952934265137, + "step": 7531, + "token_acc": 0.3030814013986238 + }, + { + "epoch": 4.415127528583993, + "grad_norm": 0.2254937808709969, + "learning_rate": 0.000276715515745705, + "loss": 2.953622341156006, + "step": 7532, + "token_acc": 0.30599432593302905 + }, + { + "epoch": 4.415713866901202, + "grad_norm": 0.22246490998683355, + "learning_rate": 0.0002767077353706291, + "loss": 2.941204071044922, + "step": 7533, + "token_acc": 0.3061899249508994 + }, + { + "epoch": 4.416300205218411, + "grad_norm": 0.2141833292792169, + "learning_rate": 0.0002766999538053048, + "loss": 2.9821739196777344, + "step": 7534, + "token_acc": 0.2998423056436737 + }, + { + "epoch": 4.41688654353562, + "grad_norm": 0.22482858779923007, + "learning_rate": 0.00027669217104980517, + "loss": 2.9544153213500977, + "step": 7535, + "token_acc": 0.30288021905603607 + }, + { + "epoch": 4.417472881852829, + "grad_norm": 0.2220578998000814, + "learning_rate": 0.00027668438710420326, + "loss": 2.988034248352051, + "step": 7536, + "token_acc": 0.3013090381983783 + }, + { + "epoch": 4.418059220170038, + "grad_norm": 0.22951222042752878, + "learning_rate": 0.0002766766019685723, + "loss": 3.009068250656128, + "step": 7537, + "token_acc": 0.29712393689541 + }, + { + "epoch": 4.418645558487247, + "grad_norm": 0.262990064336994, + "learning_rate": 0.0002766688156429854, + "loss": 2.993960380554199, + "step": 7538, + "token_acc": 0.30064779019956117 + }, + { + "epoch": 4.419231896804456, + "grad_norm": 0.2206217469011878, + "learning_rate": 0.00027666102812751555, + "loss": 2.9506053924560547, + "step": 7539, + "token_acc": 0.3073134092346616 + }, + { + "epoch": 4.4198182351216655, + "grad_norm": 0.2394109278379797, + "learning_rate": 0.0002766532394222361, + "loss": 2.9905591011047363, + "step": 7540, + "token_acc": 0.29900388404575523 + }, + { + "epoch": 4.420404573438875, + "grad_norm": 0.23826348684458346, + "learning_rate": 0.0002766454495272201, + "loss": 3.011991500854492, + "step": 7541, + "token_acc": 0.2964753075979874 + }, + { + "epoch": 4.420990911756084, + "grad_norm": 0.2478856679968787, + "learning_rate": 0.00027663765844254077, + "loss": 2.9614710807800293, + "step": 7542, + "token_acc": 0.30504336027814427 + }, + { + "epoch": 4.421577250073292, + "grad_norm": 0.2318547149069468, + "learning_rate": 0.00027662986616827125, + "loss": 3.008951187133789, + "step": 7543, + "token_acc": 0.2977561518429126 + }, + { + "epoch": 4.422163588390501, + "grad_norm": 0.2508149531151586, + "learning_rate": 0.0002766220727044848, + "loss": 2.9602606296539307, + "step": 7544, + "token_acc": 0.30259982320476364 + }, + { + "epoch": 4.42274992670771, + "grad_norm": 0.2074290607644501, + "learning_rate": 0.0002766142780512546, + "loss": 2.9678430557250977, + "step": 7545, + "token_acc": 0.30278704812371027 + }, + { + "epoch": 4.423336265024919, + "grad_norm": 0.2278039023036722, + "learning_rate": 0.0002766064822086539, + "loss": 2.977325916290283, + "step": 7546, + "token_acc": 0.3022880178308545 + }, + { + "epoch": 4.423922603342128, + "grad_norm": 0.2206481237285753, + "learning_rate": 0.00027659868517675585, + "loss": 3.0214197635650635, + "step": 7547, + "token_acc": 0.29503669887821066 + }, + { + "epoch": 4.4245089416593375, + "grad_norm": 0.2182695857307359, + "learning_rate": 0.00027659088695563384, + "loss": 2.9423303604125977, + "step": 7548, + "token_acc": 0.3084122828887227 + }, + { + "epoch": 4.425095279976547, + "grad_norm": 0.24116400950160918, + "learning_rate": 0.00027658308754536094, + "loss": 3.0361709594726562, + "step": 7549, + "token_acc": 0.29410926263587905 + }, + { + "epoch": 4.425681618293756, + "grad_norm": 0.2152699195496761, + "learning_rate": 0.00027657528694601056, + "loss": 2.9664125442504883, + "step": 7550, + "token_acc": 0.30368169683441665 + }, + { + "epoch": 4.426267956610965, + "grad_norm": 0.20811772389048305, + "learning_rate": 0.0002765674851576559, + "loss": 2.980156421661377, + "step": 7551, + "token_acc": 0.3004433433634576 + }, + { + "epoch": 4.426854294928174, + "grad_norm": 0.22490433518785619, + "learning_rate": 0.00027655968218037025, + "loss": 3.0214476585388184, + "step": 7552, + "token_acc": 0.2945921221150579 + }, + { + "epoch": 4.427440633245382, + "grad_norm": 0.21924294748443476, + "learning_rate": 0.00027655187801422696, + "loss": 2.981346607208252, + "step": 7553, + "token_acc": 0.298751614291864 + }, + { + "epoch": 4.428026971562591, + "grad_norm": 0.23186865860986766, + "learning_rate": 0.00027654407265929925, + "loss": 2.9444243907928467, + "step": 7554, + "token_acc": 0.30537961024667115 + }, + { + "epoch": 4.4286133098798, + "grad_norm": 0.2233740912221921, + "learning_rate": 0.00027653626611566056, + "loss": 2.943833351135254, + "step": 7555, + "token_acc": 0.3066100359143345 + }, + { + "epoch": 4.4291996481970095, + "grad_norm": 0.2226037170606491, + "learning_rate": 0.00027652845838338415, + "loss": 2.9573678970336914, + "step": 7556, + "token_acc": 0.3038769831381344 + }, + { + "epoch": 4.429785986514219, + "grad_norm": 0.21112469477535387, + "learning_rate": 0.00027652064946254336, + "loss": 2.985640048980713, + "step": 7557, + "token_acc": 0.29939305462508375 + }, + { + "epoch": 4.430372324831428, + "grad_norm": 0.21278645889773481, + "learning_rate": 0.0002765128393532115, + "loss": 2.9902169704437256, + "step": 7558, + "token_acc": 0.29927112788392585 + }, + { + "epoch": 4.430958663148637, + "grad_norm": 0.2335452047190607, + "learning_rate": 0.0002765050280554621, + "loss": 2.95001220703125, + "step": 7559, + "token_acc": 0.30507556671719493 + }, + { + "epoch": 4.431545001465846, + "grad_norm": 0.21208011085085432, + "learning_rate": 0.00027649721556936835, + "loss": 2.9829282760620117, + "step": 7560, + "token_acc": 0.29984166928001593 + }, + { + "epoch": 4.432131339783055, + "grad_norm": 0.22861987589734728, + "learning_rate": 0.00027648940189500376, + "loss": 2.9915008544921875, + "step": 7561, + "token_acc": 0.2986692886882913 + }, + { + "epoch": 4.432717678100264, + "grad_norm": 0.22474816828076577, + "learning_rate": 0.0002764815870324417, + "loss": 3.007002592086792, + "step": 7562, + "token_acc": 0.2974789077455117 + }, + { + "epoch": 4.433304016417473, + "grad_norm": 0.20449870048134206, + "learning_rate": 0.00027647377098175555, + "loss": 2.988819122314453, + "step": 7563, + "token_acc": 0.3004359848241271 + }, + { + "epoch": 4.4338903547346815, + "grad_norm": 0.20037998468759713, + "learning_rate": 0.0002764659537430187, + "loss": 2.9879345893859863, + "step": 7564, + "token_acc": 0.2997982794579056 + }, + { + "epoch": 4.434476693051891, + "grad_norm": 0.2136282549022213, + "learning_rate": 0.00027645813531630464, + "loss": 2.984762191772461, + "step": 7565, + "token_acc": 0.29992371386341604 + }, + { + "epoch": 4.4350630313691, + "grad_norm": 0.2163536331990884, + "learning_rate": 0.0002764503157016869, + "loss": 2.966360569000244, + "step": 7566, + "token_acc": 0.30226549716454015 + }, + { + "epoch": 4.435649369686309, + "grad_norm": 0.23056282637307565, + "learning_rate": 0.00027644249489923873, + "loss": 3.0036206245422363, + "step": 7567, + "token_acc": 0.2962743208159605 + }, + { + "epoch": 4.436235708003518, + "grad_norm": 0.2505185272934878, + "learning_rate": 0.0002764346729090337, + "loss": 2.934352397918701, + "step": 7568, + "token_acc": 0.30735287238631814 + }, + { + "epoch": 4.436822046320727, + "grad_norm": 0.23291100590036753, + "learning_rate": 0.00027642684973114534, + "loss": 2.984907865524292, + "step": 7569, + "token_acc": 0.30181783676069895 + }, + { + "epoch": 4.437408384637936, + "grad_norm": 0.2305082005398384, + "learning_rate": 0.0002764190253656471, + "loss": 2.985118865966797, + "step": 7570, + "token_acc": 0.29943780872997894 + }, + { + "epoch": 4.437994722955145, + "grad_norm": 0.25589584257536085, + "learning_rate": 0.0002764111998126124, + "loss": 2.975830554962158, + "step": 7571, + "token_acc": 0.3005234831988947 + }, + { + "epoch": 4.438581061272354, + "grad_norm": 0.21715621037107885, + "learning_rate": 0.0002764033730721149, + "loss": 3.004791736602783, + "step": 7572, + "token_acc": 0.29819572032236535 + }, + { + "epoch": 4.4391673995895635, + "grad_norm": 0.2332126010050994, + "learning_rate": 0.000276395545144228, + "loss": 2.9431028366088867, + "step": 7573, + "token_acc": 0.3060748083104383 + }, + { + "epoch": 4.439753737906772, + "grad_norm": 0.233698792595349, + "learning_rate": 0.0002763877160290253, + "loss": 3.0251412391662598, + "step": 7574, + "token_acc": 0.29486185888717237 + }, + { + "epoch": 4.440340076223981, + "grad_norm": 0.2292718913688117, + "learning_rate": 0.00027637988572658034, + "loss": 2.9524192810058594, + "step": 7575, + "token_acc": 0.30503595610675993 + }, + { + "epoch": 4.44092641454119, + "grad_norm": 0.22887528645919877, + "learning_rate": 0.0002763720542369666, + "loss": 2.982769012451172, + "step": 7576, + "token_acc": 0.2992277940890712 + }, + { + "epoch": 4.441512752858399, + "grad_norm": 0.2538809280446271, + "learning_rate": 0.0002763642215602577, + "loss": 2.9990055561065674, + "step": 7577, + "token_acc": 0.2971679165291434 + }, + { + "epoch": 4.442099091175608, + "grad_norm": 0.21638494791598495, + "learning_rate": 0.00027635638769652723, + "loss": 2.9935128688812256, + "step": 7578, + "token_acc": 0.3009446981850797 + }, + { + "epoch": 4.442685429492817, + "grad_norm": 0.23801170973998323, + "learning_rate": 0.0002763485526458488, + "loss": 2.9842634201049805, + "step": 7579, + "token_acc": 0.2998494957096176 + }, + { + "epoch": 4.443271767810026, + "grad_norm": 0.2378546425907843, + "learning_rate": 0.0002763407164082959, + "loss": 2.9668431282043457, + "step": 7580, + "token_acc": 0.3026356730338699 + }, + { + "epoch": 4.4438581061272355, + "grad_norm": 0.2501416334600253, + "learning_rate": 0.00027633287898394223, + "loss": 2.9581780433654785, + "step": 7581, + "token_acc": 0.30563516479111763 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.25523598480605075, + "learning_rate": 0.0002763250403728614, + "loss": 2.9914441108703613, + "step": 7582, + "token_acc": 0.2994789904594357 + }, + { + "epoch": 4.445030782761654, + "grad_norm": 0.22751298932767122, + "learning_rate": 0.0002763172005751271, + "loss": 2.975606918334961, + "step": 7583, + "token_acc": 0.30274614330572264 + }, + { + "epoch": 4.445617121078863, + "grad_norm": 0.22322231490937136, + "learning_rate": 0.00027630935959081283, + "loss": 2.9687275886535645, + "step": 7584, + "token_acc": 0.3031522115899709 + }, + { + "epoch": 4.446203459396072, + "grad_norm": 0.2276494286734312, + "learning_rate": 0.0002763015174199924, + "loss": 2.9967830181121826, + "step": 7585, + "token_acc": 0.3000399446101406 + }, + { + "epoch": 4.44678979771328, + "grad_norm": 0.22290993602576076, + "learning_rate": 0.0002762936740627394, + "loss": 3.0188190937042236, + "step": 7586, + "token_acc": 0.29637081193722803 + }, + { + "epoch": 4.447376136030489, + "grad_norm": 0.22810839375775024, + "learning_rate": 0.00027628582951912746, + "loss": 3.0038933753967285, + "step": 7587, + "token_acc": 0.298422424753988 + }, + { + "epoch": 4.447962474347698, + "grad_norm": 0.215049185161377, + "learning_rate": 0.0002762779837892304, + "loss": 2.998764753341675, + "step": 7588, + "token_acc": 0.2984180236637074 + }, + { + "epoch": 4.4485488126649075, + "grad_norm": 0.21708986877967146, + "learning_rate": 0.0002762701368731218, + "loss": 3.014242172241211, + "step": 7589, + "token_acc": 0.29517731331969155 + }, + { + "epoch": 4.449135150982117, + "grad_norm": 0.21439655778186809, + "learning_rate": 0.0002762622887708754, + "loss": 2.9730865955352783, + "step": 7590, + "token_acc": 0.30331937315280105 + }, + { + "epoch": 4.449721489299326, + "grad_norm": 0.2241885067307578, + "learning_rate": 0.00027625443948256495, + "loss": 3.035583257675171, + "step": 7591, + "token_acc": 0.2959085578794088 + }, + { + "epoch": 4.450307827616535, + "grad_norm": 0.2387387559963078, + "learning_rate": 0.0002762465890082642, + "loss": 2.988215446472168, + "step": 7592, + "token_acc": 0.3015569834157638 + }, + { + "epoch": 4.450894165933744, + "grad_norm": 0.21776724465372296, + "learning_rate": 0.00027623873734804687, + "loss": 2.9739420413970947, + "step": 7593, + "token_acc": 0.30295207709801575 + }, + { + "epoch": 4.451480504250953, + "grad_norm": 0.2294945072124841, + "learning_rate": 0.00027623088450198666, + "loss": 2.960691213607788, + "step": 7594, + "token_acc": 0.30552533110471347 + }, + { + "epoch": 4.452066842568162, + "grad_norm": 0.20253151984582082, + "learning_rate": 0.0002762230304701574, + "loss": 2.9611666202545166, + "step": 7595, + "token_acc": 0.304358322131108 + }, + { + "epoch": 4.45265318088537, + "grad_norm": 0.23490758667523984, + "learning_rate": 0.0002762151752526329, + "loss": 2.9545183181762695, + "step": 7596, + "token_acc": 0.30677157113559 + }, + { + "epoch": 4.4532395192025795, + "grad_norm": 0.22097915643256888, + "learning_rate": 0.00027620731884948685, + "loss": 2.9702696800231934, + "step": 7597, + "token_acc": 0.3028869800099432 + }, + { + "epoch": 4.453825857519789, + "grad_norm": 0.23308072974715568, + "learning_rate": 0.00027619946126079315, + "loss": 2.972086191177368, + "step": 7598, + "token_acc": 0.30219612947925906 + }, + { + "epoch": 4.454412195836998, + "grad_norm": 0.21480250335527765, + "learning_rate": 0.00027619160248662554, + "loss": 2.979414463043213, + "step": 7599, + "token_acc": 0.30156305354482255 + }, + { + "epoch": 4.454998534154207, + "grad_norm": 0.22156480564248046, + "learning_rate": 0.0002761837425270579, + "loss": 2.991581916809082, + "step": 7600, + "token_acc": 0.29964662631128575 + }, + { + "epoch": 4.455584872471416, + "grad_norm": 0.22047502943256203, + "learning_rate": 0.000276175881382164, + "loss": 2.9772255420684814, + "step": 7601, + "token_acc": 0.30140528117593546 + }, + { + "epoch": 4.456171210788625, + "grad_norm": 0.23801751997385368, + "learning_rate": 0.00027616801905201775, + "loss": 2.9637534618377686, + "step": 7602, + "token_acc": 0.30447306452391554 + }, + { + "epoch": 4.456757549105834, + "grad_norm": 0.22740141214600645, + "learning_rate": 0.00027616015553669297, + "loss": 2.997546434402466, + "step": 7603, + "token_acc": 0.29920801364463157 + }, + { + "epoch": 4.457343887423043, + "grad_norm": 0.22241688038826007, + "learning_rate": 0.0002761522908362635, + "loss": 2.939868450164795, + "step": 7604, + "token_acc": 0.307261854455585 + }, + { + "epoch": 4.457930225740252, + "grad_norm": 0.21651883624015725, + "learning_rate": 0.00027614442495080326, + "loss": 2.940763235092163, + "step": 7605, + "token_acc": 0.3072985639794736 + }, + { + "epoch": 4.4585165640574616, + "grad_norm": 0.2217748665330733, + "learning_rate": 0.0002761365578803861, + "loss": 3.006091833114624, + "step": 7606, + "token_acc": 0.2982227583852922 + }, + { + "epoch": 4.45910290237467, + "grad_norm": 0.20924903109591278, + "learning_rate": 0.00027612868962508604, + "loss": 2.9682326316833496, + "step": 7607, + "token_acc": 0.30271689844786104 + }, + { + "epoch": 4.459689240691879, + "grad_norm": 0.20612388970463666, + "learning_rate": 0.0002761208201849769, + "loss": 3.0121848583221436, + "step": 7608, + "token_acc": 0.2959412571613007 + }, + { + "epoch": 4.460275579009088, + "grad_norm": 0.23106248329013215, + "learning_rate": 0.0002761129495601325, + "loss": 2.9680256843566895, + "step": 7609, + "token_acc": 0.30297332174554265 + }, + { + "epoch": 4.460861917326297, + "grad_norm": 0.23072071015130413, + "learning_rate": 0.00027610507775062697, + "loss": 2.9940145015716553, + "step": 7610, + "token_acc": 0.2994108369831639 + }, + { + "epoch": 4.461448255643506, + "grad_norm": 0.21866554705368085, + "learning_rate": 0.0002760972047565341, + "loss": 2.9687657356262207, + "step": 7611, + "token_acc": 0.3029277242993139 + }, + { + "epoch": 4.462034593960715, + "grad_norm": 0.2467556929032799, + "learning_rate": 0.0002760893305779279, + "loss": 2.961193084716797, + "step": 7612, + "token_acc": 0.30329074758464486 + }, + { + "epoch": 4.4626209322779244, + "grad_norm": 0.23338321289013325, + "learning_rate": 0.0002760814552148824, + "loss": 2.971200942993164, + "step": 7613, + "token_acc": 0.30271906988766595 + }, + { + "epoch": 4.463207270595134, + "grad_norm": 0.20954108118233988, + "learning_rate": 0.0002760735786674715, + "loss": 2.9147324562072754, + "step": 7614, + "token_acc": 0.3104312423212753 + }, + { + "epoch": 4.463793608912343, + "grad_norm": 0.2546827217856047, + "learning_rate": 0.00027606570093576926, + "loss": 2.9942164421081543, + "step": 7615, + "token_acc": 0.30038036578207705 + }, + { + "epoch": 4.464379947229552, + "grad_norm": 0.21659174618311733, + "learning_rate": 0.00027605782201984956, + "loss": 2.9776782989501953, + "step": 7616, + "token_acc": 0.301635572268506 + }, + { + "epoch": 4.464966285546761, + "grad_norm": 0.2528289550684048, + "learning_rate": 0.0002760499419197865, + "loss": 2.987570285797119, + "step": 7617, + "token_acc": 0.30017876677606126 + }, + { + "epoch": 4.465552623863969, + "grad_norm": 0.24352480342355978, + "learning_rate": 0.00027604206063565416, + "loss": 2.9570417404174805, + "step": 7618, + "token_acc": 0.3043757994075698 + }, + { + "epoch": 4.466138962181178, + "grad_norm": 0.240583112577343, + "learning_rate": 0.00027603417816752645, + "loss": 2.965134620666504, + "step": 7619, + "token_acc": 0.3039687178129726 + }, + { + "epoch": 4.466725300498387, + "grad_norm": 0.23073529259513478, + "learning_rate": 0.00027602629451547745, + "loss": 2.9953105449676514, + "step": 7620, + "token_acc": 0.29807001807138594 + }, + { + "epoch": 4.4673116388155965, + "grad_norm": 0.22379422980522054, + "learning_rate": 0.0002760184096795813, + "loss": 2.991844654083252, + "step": 7621, + "token_acc": 0.29971518256096835 + }, + { + "epoch": 4.467897977132806, + "grad_norm": 0.22404903190261807, + "learning_rate": 0.00027601052365991196, + "loss": 2.962212085723877, + "step": 7622, + "token_acc": 0.3051771411591502 + }, + { + "epoch": 4.468484315450015, + "grad_norm": 0.2173017192481348, + "learning_rate": 0.0002760026364565436, + "loss": 2.9794421195983887, + "step": 7623, + "token_acc": 0.30051131615999266 + }, + { + "epoch": 4.469070653767224, + "grad_norm": 0.2071021383158694, + "learning_rate": 0.0002759947480695502, + "loss": 2.946147918701172, + "step": 7624, + "token_acc": 0.3074731541966795 + }, + { + "epoch": 4.469656992084433, + "grad_norm": 0.22466662970887313, + "learning_rate": 0.0002759868584990059, + "loss": 2.9522435665130615, + "step": 7625, + "token_acc": 0.3064219800953277 + }, + { + "epoch": 4.470243330401642, + "grad_norm": 0.23232520385435237, + "learning_rate": 0.0002759789677449849, + "loss": 3.005612850189209, + "step": 7626, + "token_acc": 0.2979912718301832 + }, + { + "epoch": 4.470829668718851, + "grad_norm": 0.2173149435748064, + "learning_rate": 0.0002759710758075612, + "loss": 2.956317663192749, + "step": 7627, + "token_acc": 0.30477849506438504 + }, + { + "epoch": 4.47141600703606, + "grad_norm": 0.24921664699797585, + "learning_rate": 0.00027596318268680904, + "loss": 2.9827961921691895, + "step": 7628, + "token_acc": 0.3016147869801452 + }, + { + "epoch": 4.4720023453532685, + "grad_norm": 0.24722305291233757, + "learning_rate": 0.0002759552883828025, + "loss": 3.016123056411743, + "step": 7629, + "token_acc": 0.294645429581828 + }, + { + "epoch": 4.472588683670478, + "grad_norm": 0.21689633675608133, + "learning_rate": 0.0002759473928956158, + "loss": 2.9752273559570312, + "step": 7630, + "token_acc": 0.30037186025476165 + }, + { + "epoch": 4.473175021987687, + "grad_norm": 0.2434991820689063, + "learning_rate": 0.000275939496225323, + "loss": 2.9939610958099365, + "step": 7631, + "token_acc": 0.2992342728203414 + }, + { + "epoch": 4.473761360304896, + "grad_norm": 0.21344866813763846, + "learning_rate": 0.0002759315983719983, + "loss": 2.933187961578369, + "step": 7632, + "token_acc": 0.3084968984774344 + }, + { + "epoch": 4.474347698622105, + "grad_norm": 0.24320203118311193, + "learning_rate": 0.000275923699335716, + "loss": 2.9964942932128906, + "step": 7633, + "token_acc": 0.2981918010305851 + }, + { + "epoch": 4.474934036939314, + "grad_norm": 0.2495752518385743, + "learning_rate": 0.00027591579911655017, + "loss": 2.9665846824645996, + "step": 7634, + "token_acc": 0.3037158191184166 + }, + { + "epoch": 4.475520375256523, + "grad_norm": 0.2392050221832817, + "learning_rate": 0.00027590789771457513, + "loss": 2.974921703338623, + "step": 7635, + "token_acc": 0.30296626629581863 + }, + { + "epoch": 4.476106713573732, + "grad_norm": 0.22611602268956996, + "learning_rate": 0.0002758999951298651, + "loss": 2.9386470317840576, + "step": 7636, + "token_acc": 0.3061558644700659 + }, + { + "epoch": 4.476693051890941, + "grad_norm": 0.23762525569628992, + "learning_rate": 0.0002758920913624942, + "loss": 2.964472770690918, + "step": 7637, + "token_acc": 0.3049408625101289 + }, + { + "epoch": 4.4772793902081505, + "grad_norm": 0.22898732954784032, + "learning_rate": 0.0002758841864125367, + "loss": 2.943307399749756, + "step": 7638, + "token_acc": 0.3069276919539269 + }, + { + "epoch": 4.477865728525359, + "grad_norm": 0.21032174140116416, + "learning_rate": 0.000275876280280067, + "loss": 3.0144128799438477, + "step": 7639, + "token_acc": 0.2956134909711444 + }, + { + "epoch": 4.478452066842568, + "grad_norm": 0.23463843962194295, + "learning_rate": 0.0002758683729651592, + "loss": 3.0245981216430664, + "step": 7640, + "token_acc": 0.2951299411516772 + }, + { + "epoch": 4.479038405159777, + "grad_norm": 0.2007427514217131, + "learning_rate": 0.00027586046446788766, + "loss": 2.999563217163086, + "step": 7641, + "token_acc": 0.2982976484354753 + }, + { + "epoch": 4.479624743476986, + "grad_norm": 0.22053355903278282, + "learning_rate": 0.00027585255478832665, + "loss": 2.9731335639953613, + "step": 7642, + "token_acc": 0.30036141434005637 + }, + { + "epoch": 4.480211081794195, + "grad_norm": 0.22287474598000823, + "learning_rate": 0.0002758446439265505, + "loss": 3.020007610321045, + "step": 7643, + "token_acc": 0.2959017362735672 + }, + { + "epoch": 4.480797420111404, + "grad_norm": 0.20509875454697335, + "learning_rate": 0.0002758367318826335, + "loss": 2.965298891067505, + "step": 7644, + "token_acc": 0.3032176782105965 + }, + { + "epoch": 4.481383758428613, + "grad_norm": 0.23438821667069404, + "learning_rate": 0.00027582881865664995, + "loss": 3.009063720703125, + "step": 7645, + "token_acc": 0.2970216794775101 + }, + { + "epoch": 4.4819700967458225, + "grad_norm": 0.21664325536563464, + "learning_rate": 0.00027582090424867423, + "loss": 3.0108723640441895, + "step": 7646, + "token_acc": 0.2974431073391147 + }, + { + "epoch": 4.482556435063032, + "grad_norm": 0.21720614368903043, + "learning_rate": 0.0002758129886587806, + "loss": 3.022472858428955, + "step": 7647, + "token_acc": 0.29574099097276724 + }, + { + "epoch": 4.483142773380241, + "grad_norm": 0.23169805294547768, + "learning_rate": 0.0002758050718870435, + "loss": 2.9337656497955322, + "step": 7648, + "token_acc": 0.3070524087497594 + }, + { + "epoch": 4.48372911169745, + "grad_norm": 0.2052382307719247, + "learning_rate": 0.00027579715393353735, + "loss": 2.983374834060669, + "step": 7649, + "token_acc": 0.30114648653012893 + }, + { + "epoch": 4.484315450014659, + "grad_norm": 0.2451042547369214, + "learning_rate": 0.0002757892347983364, + "loss": 2.9674196243286133, + "step": 7650, + "token_acc": 0.30323449063463 + }, + { + "epoch": 4.484901788331867, + "grad_norm": 0.2552569946102098, + "learning_rate": 0.00027578131448151506, + "loss": 2.9782662391662598, + "step": 7651, + "token_acc": 0.30116845816839277 + }, + { + "epoch": 4.485488126649076, + "grad_norm": 0.21230519858773003, + "learning_rate": 0.0002757733929831478, + "loss": 2.987525463104248, + "step": 7652, + "token_acc": 0.29979934399947233 + }, + { + "epoch": 4.486074464966285, + "grad_norm": 0.22426306371953253, + "learning_rate": 0.00027576547030330897, + "loss": 3.003708600997925, + "step": 7653, + "token_acc": 0.2987595238281423 + }, + { + "epoch": 4.4866608032834945, + "grad_norm": 0.21456576904585917, + "learning_rate": 0.0002757575464420731, + "loss": 2.9577674865722656, + "step": 7654, + "token_acc": 0.3050982047460131 + }, + { + "epoch": 4.487247141600704, + "grad_norm": 0.2524770797821413, + "learning_rate": 0.0002757496213995144, + "loss": 3.0155861377716064, + "step": 7655, + "token_acc": 0.29672841489835905 + }, + { + "epoch": 4.487833479917913, + "grad_norm": 0.21861104063613618, + "learning_rate": 0.00027574169517570756, + "loss": 2.9387197494506836, + "step": 7656, + "token_acc": 0.308054603127056 + }, + { + "epoch": 4.488419818235122, + "grad_norm": 0.21333302968651946, + "learning_rate": 0.00027573376777072694, + "loss": 2.954826831817627, + "step": 7657, + "token_acc": 0.3049119414008665 + }, + { + "epoch": 4.489006156552331, + "grad_norm": 0.21572278987173338, + "learning_rate": 0.00027572583918464695, + "loss": 2.997603416442871, + "step": 7658, + "token_acc": 0.29893922756432145 + }, + { + "epoch": 4.48959249486954, + "grad_norm": 0.20767869334144137, + "learning_rate": 0.0002757179094175421, + "loss": 2.9891881942749023, + "step": 7659, + "token_acc": 0.2999237400959065 + }, + { + "epoch": 4.490178833186749, + "grad_norm": 0.23555214487189335, + "learning_rate": 0.0002757099784694869, + "loss": 3.017289161682129, + "step": 7660, + "token_acc": 0.29596309832463896 + }, + { + "epoch": 4.490765171503957, + "grad_norm": 0.23314860967716342, + "learning_rate": 0.0002757020463405559, + "loss": 3.021169900894165, + "step": 7661, + "token_acc": 0.294855836336743 + }, + { + "epoch": 4.4913515098211665, + "grad_norm": 0.2442421838244778, + "learning_rate": 0.00027569411303082357, + "loss": 3.0114598274230957, + "step": 7662, + "token_acc": 0.2957653841971681 + }, + { + "epoch": 4.491937848138376, + "grad_norm": 0.21306465471849811, + "learning_rate": 0.0002756861785403644, + "loss": 2.990102767944336, + "step": 7663, + "token_acc": 0.3031278088318982 + }, + { + "epoch": 4.492524186455585, + "grad_norm": 0.23942117873238872, + "learning_rate": 0.00027567824286925293, + "loss": 2.9808952808380127, + "step": 7664, + "token_acc": 0.3017831156679328 + }, + { + "epoch": 4.493110524772794, + "grad_norm": 0.2303095524770526, + "learning_rate": 0.00027567030601756377, + "loss": 2.938162088394165, + "step": 7665, + "token_acc": 0.30749313520957194 + }, + { + "epoch": 4.493696863090003, + "grad_norm": 0.21820544362503638, + "learning_rate": 0.00027566236798537137, + "loss": 2.992374897003174, + "step": 7666, + "token_acc": 0.29957593916848674 + }, + { + "epoch": 4.494283201407212, + "grad_norm": 0.21833541524736316, + "learning_rate": 0.00027565442877275034, + "loss": 2.9773597717285156, + "step": 7667, + "token_acc": 0.3020970801445104 + }, + { + "epoch": 4.494869539724421, + "grad_norm": 0.2340539262108697, + "learning_rate": 0.0002756464883797753, + "loss": 2.944711685180664, + "step": 7668, + "token_acc": 0.306485216195657 + }, + { + "epoch": 4.49545587804163, + "grad_norm": 0.20238945688202734, + "learning_rate": 0.00027563854680652083, + "loss": 2.931864023208618, + "step": 7669, + "token_acc": 0.308472712911868 + }, + { + "epoch": 4.496042216358839, + "grad_norm": 0.22481747787216352, + "learning_rate": 0.0002756306040530615, + "loss": 2.987959861755371, + "step": 7670, + "token_acc": 0.29988672114214526 + }, + { + "epoch": 4.4966285546760485, + "grad_norm": 0.21965780034339297, + "learning_rate": 0.00027562266011947194, + "loss": 2.9668350219726562, + "step": 7671, + "token_acc": 0.3038297122442386 + }, + { + "epoch": 4.497214892993257, + "grad_norm": 0.20342045825504645, + "learning_rate": 0.00027561471500582677, + "loss": 2.9384617805480957, + "step": 7672, + "token_acc": 0.30848261327713383 + }, + { + "epoch": 4.497801231310466, + "grad_norm": 0.21870435534201574, + "learning_rate": 0.0002756067687122006, + "loss": 3.0244853496551514, + "step": 7673, + "token_acc": 0.29324283395590484 + }, + { + "epoch": 4.498387569627675, + "grad_norm": 0.19906313659082162, + "learning_rate": 0.0002755988212386681, + "loss": 2.982006788253784, + "step": 7674, + "token_acc": 0.30186280920978337 + }, + { + "epoch": 4.498973907944884, + "grad_norm": 0.2339817020090608, + "learning_rate": 0.00027559087258530396, + "loss": 2.9881882667541504, + "step": 7675, + "token_acc": 0.2997226122034944 + }, + { + "epoch": 4.499560246262093, + "grad_norm": 0.2123836960063692, + "learning_rate": 0.0002755829227521828, + "loss": 2.975555896759033, + "step": 7676, + "token_acc": 0.3004407814089356 + }, + { + "epoch": 4.500146584579302, + "grad_norm": 0.2379519986023324, + "learning_rate": 0.00027557497173937923, + "loss": 2.9358553886413574, + "step": 7677, + "token_acc": 0.3073589110464751 + }, + { + "epoch": 4.500732922896511, + "grad_norm": 0.22020692528363245, + "learning_rate": 0.0002755670195469681, + "loss": 2.9977307319641113, + "step": 7678, + "token_acc": 0.2997275633041668 + }, + { + "epoch": 4.5013192612137205, + "grad_norm": 0.2464986715638401, + "learning_rate": 0.000275559066175024, + "loss": 2.9740078449249268, + "step": 7679, + "token_acc": 0.30189967458066946 + }, + { + "epoch": 4.50190559953093, + "grad_norm": 0.25501870207533445, + "learning_rate": 0.00027555111162362166, + "loss": 2.9926223754882812, + "step": 7680, + "token_acc": 0.29984496456141574 + }, + { + "epoch": 4.502491937848139, + "grad_norm": 0.21750999537434965, + "learning_rate": 0.0002755431558928358, + "loss": 2.9678614139556885, + "step": 7681, + "token_acc": 0.30328653227704133 + }, + { + "epoch": 4.503078276165347, + "grad_norm": 0.24988439546725022, + "learning_rate": 0.0002755351989827412, + "loss": 2.999833106994629, + "step": 7682, + "token_acc": 0.2983530942779406 + }, + { + "epoch": 4.503664614482556, + "grad_norm": 0.23753908934771287, + "learning_rate": 0.00027552724089341255, + "loss": 2.922658920288086, + "step": 7683, + "token_acc": 0.3106094747929972 + }, + { + "epoch": 4.504250952799765, + "grad_norm": 0.25837274385830045, + "learning_rate": 0.00027551928162492456, + "loss": 2.9310784339904785, + "step": 7684, + "token_acc": 0.30704154291866703 + }, + { + "epoch": 4.504837291116974, + "grad_norm": 0.2548669538027812, + "learning_rate": 0.0002755113211773521, + "loss": 2.9529385566711426, + "step": 7685, + "token_acc": 0.3055461170587728 + }, + { + "epoch": 4.505423629434183, + "grad_norm": 0.20457345674230723, + "learning_rate": 0.00027550335955076993, + "loss": 3.0096793174743652, + "step": 7686, + "token_acc": 0.2965724193118165 + }, + { + "epoch": 4.5060099677513925, + "grad_norm": 0.24858476435106425, + "learning_rate": 0.0002754953967452528, + "loss": 2.9415903091430664, + "step": 7687, + "token_acc": 0.30795046112203306 + }, + { + "epoch": 4.506596306068602, + "grad_norm": 0.20864641060189953, + "learning_rate": 0.0002754874327608755, + "loss": 2.950563430786133, + "step": 7688, + "token_acc": 0.30422527713825637 + }, + { + "epoch": 4.507182644385811, + "grad_norm": 0.2432139772116789, + "learning_rate": 0.0002754794675977129, + "loss": 2.960073709487915, + "step": 7689, + "token_acc": 0.30343683774071684 + }, + { + "epoch": 4.50776898270302, + "grad_norm": 0.21575817635543373, + "learning_rate": 0.0002754715012558398, + "loss": 2.9802823066711426, + "step": 7690, + "token_acc": 0.3013164863433457 + }, + { + "epoch": 4.508355321020229, + "grad_norm": 0.23980278291918009, + "learning_rate": 0.000275463533735331, + "loss": 2.940427303314209, + "step": 7691, + "token_acc": 0.30673909898905477 + }, + { + "epoch": 4.508941659337438, + "grad_norm": 0.2095006398751181, + "learning_rate": 0.00027545556503626135, + "loss": 2.9724385738372803, + "step": 7692, + "token_acc": 0.3037464255548658 + }, + { + "epoch": 4.509527997654647, + "grad_norm": 0.21680007942009089, + "learning_rate": 0.00027544759515870575, + "loss": 2.9706430435180664, + "step": 7693, + "token_acc": 0.3041073752256223 + }, + { + "epoch": 4.510114335971855, + "grad_norm": 0.2221932434942384, + "learning_rate": 0.00027543962410273904, + "loss": 3.0166704654693604, + "step": 7694, + "token_acc": 0.29656931427800437 + }, + { + "epoch": 4.5107006742890645, + "grad_norm": 0.2094927413246809, + "learning_rate": 0.00027543165186843605, + "loss": 2.997117519378662, + "step": 7695, + "token_acc": 0.29886809874409154 + }, + { + "epoch": 4.511287012606274, + "grad_norm": 0.2282586027664007, + "learning_rate": 0.00027542367845587173, + "loss": 2.9649057388305664, + "step": 7696, + "token_acc": 0.3039147865733826 + }, + { + "epoch": 4.511873350923483, + "grad_norm": 0.2300564114423717, + "learning_rate": 0.00027541570386512096, + "loss": 2.933500051498413, + "step": 7697, + "token_acc": 0.3071886994108573 + }, + { + "epoch": 4.512459689240692, + "grad_norm": 0.22107905856903817, + "learning_rate": 0.00027540772809625866, + "loss": 2.989394426345825, + "step": 7698, + "token_acc": 0.2995021024936432 + }, + { + "epoch": 4.513046027557901, + "grad_norm": 0.22665328582718158, + "learning_rate": 0.00027539975114935974, + "loss": 2.9659981727600098, + "step": 7699, + "token_acc": 0.30360185410559515 + }, + { + "epoch": 4.51363236587511, + "grad_norm": 0.23836709246069981, + "learning_rate": 0.0002753917730244991, + "loss": 2.978651762008667, + "step": 7700, + "token_acc": 0.30111827901384597 + }, + { + "epoch": 4.514218704192319, + "grad_norm": 0.210970200033658, + "learning_rate": 0.0002753837937217518, + "loss": 3.0234947204589844, + "step": 7701, + "token_acc": 0.2941148534913541 + }, + { + "epoch": 4.514805042509528, + "grad_norm": 0.23511376538809522, + "learning_rate": 0.00027537581324119266, + "loss": 3.001394748687744, + "step": 7702, + "token_acc": 0.29827057364745807 + }, + { + "epoch": 4.515391380826737, + "grad_norm": 0.21999748361684662, + "learning_rate": 0.0002753678315828967, + "loss": 2.961374282836914, + "step": 7703, + "token_acc": 0.30465367552657335 + }, + { + "epoch": 4.515977719143946, + "grad_norm": 0.21964445423063148, + "learning_rate": 0.00027535984874693897, + "loss": 2.9507014751434326, + "step": 7704, + "token_acc": 0.30641595859437687 + }, + { + "epoch": 4.516564057461155, + "grad_norm": 0.21770301530560826, + "learning_rate": 0.0002753518647333943, + "loss": 2.9924402236938477, + "step": 7705, + "token_acc": 0.29946383244424535 + }, + { + "epoch": 4.517150395778364, + "grad_norm": 0.23696318971832303, + "learning_rate": 0.00027534387954233783, + "loss": 2.9958348274230957, + "step": 7706, + "token_acc": 0.30000416170296884 + }, + { + "epoch": 4.517736734095573, + "grad_norm": 0.21288610660964188, + "learning_rate": 0.00027533589317384443, + "loss": 2.988255739212036, + "step": 7707, + "token_acc": 0.2978780316152773 + }, + { + "epoch": 4.518323072412782, + "grad_norm": 0.22393528496370987, + "learning_rate": 0.0002753279056279893, + "loss": 2.963256597518921, + "step": 7708, + "token_acc": 0.3038441866860628 + }, + { + "epoch": 4.518909410729991, + "grad_norm": 0.23258115272441177, + "learning_rate": 0.0002753199169048473, + "loss": 2.9800620079040527, + "step": 7709, + "token_acc": 0.30160334637920044 + }, + { + "epoch": 4.5194957490472, + "grad_norm": 0.21981515569993137, + "learning_rate": 0.0002753119270044936, + "loss": 2.9860894680023193, + "step": 7710, + "token_acc": 0.29900013591381525 + }, + { + "epoch": 4.520082087364409, + "grad_norm": 0.22771136776689815, + "learning_rate": 0.00027530393592700323, + "loss": 2.9410014152526855, + "step": 7711, + "token_acc": 0.30775343887222706 + }, + { + "epoch": 4.5206684256816185, + "grad_norm": 0.22074486147046318, + "learning_rate": 0.00027529594367245116, + "loss": 2.989107131958008, + "step": 7712, + "token_acc": 0.29935150414302625 + }, + { + "epoch": 4.521254763998828, + "grad_norm": 0.23017907428339843, + "learning_rate": 0.0002752879502409126, + "loss": 2.968398094177246, + "step": 7713, + "token_acc": 0.3029256875365711 + }, + { + "epoch": 4.521841102316037, + "grad_norm": 0.22432293327956712, + "learning_rate": 0.0002752799556324625, + "loss": 2.9508323669433594, + "step": 7714, + "token_acc": 0.3072731705765745 + }, + { + "epoch": 4.522427440633246, + "grad_norm": 0.20317947861364058, + "learning_rate": 0.00027527195984717613, + "loss": 2.995086908340454, + "step": 7715, + "token_acc": 0.2996233433844177 + }, + { + "epoch": 4.523013778950454, + "grad_norm": 0.22928105947962923, + "learning_rate": 0.00027526396288512844, + "loss": 2.993931770324707, + "step": 7716, + "token_acc": 0.2999773578149765 + }, + { + "epoch": 4.523600117267663, + "grad_norm": 0.21895660656556226, + "learning_rate": 0.00027525596474639466, + "loss": 2.9658145904541016, + "step": 7717, + "token_acc": 0.30359224048445205 + }, + { + "epoch": 4.524186455584872, + "grad_norm": 0.2282841703315982, + "learning_rate": 0.00027524796543104983, + "loss": 2.9695048332214355, + "step": 7718, + "token_acc": 0.30254743656998095 + }, + { + "epoch": 4.524772793902081, + "grad_norm": 0.23207873565315892, + "learning_rate": 0.00027523996493916913, + "loss": 2.966928482055664, + "step": 7719, + "token_acc": 0.30415452178306324 + }, + { + "epoch": 4.5253591322192905, + "grad_norm": 0.24644392226382714, + "learning_rate": 0.00027523196327082776, + "loss": 2.9880852699279785, + "step": 7720, + "token_acc": 0.3005302570329893 + }, + { + "epoch": 4.5259454705365, + "grad_norm": 0.2444212399368073, + "learning_rate": 0.00027522396042610085, + "loss": 2.9890787601470947, + "step": 7721, + "token_acc": 0.30163186945044396 + }, + { + "epoch": 4.526531808853709, + "grad_norm": 0.21986239219486692, + "learning_rate": 0.00027521595640506353, + "loss": 2.9860782623291016, + "step": 7722, + "token_acc": 0.3014009075559308 + }, + { + "epoch": 4.527118147170918, + "grad_norm": 0.23600814441317555, + "learning_rate": 0.0002752079512077911, + "loss": 3.024751663208008, + "step": 7723, + "token_acc": 0.2959117734596419 + }, + { + "epoch": 4.527704485488127, + "grad_norm": 0.24199749840692936, + "learning_rate": 0.0002751999448343586, + "loss": 2.9594221115112305, + "step": 7724, + "token_acc": 0.3044213102734409 + }, + { + "epoch": 4.528290823805335, + "grad_norm": 0.2267127731836551, + "learning_rate": 0.0002751919372848414, + "loss": 2.9607176780700684, + "step": 7725, + "token_acc": 0.3044495428697518 + }, + { + "epoch": 4.528877162122544, + "grad_norm": 0.22790637518306908, + "learning_rate": 0.0002751839285593146, + "loss": 2.963693380355835, + "step": 7726, + "token_acc": 0.3012129344594417 + }, + { + "epoch": 4.529463500439753, + "grad_norm": 0.2380676948524783, + "learning_rate": 0.0002751759186578535, + "loss": 2.9461140632629395, + "step": 7727, + "token_acc": 0.3066855497674997 + }, + { + "epoch": 4.5300498387569625, + "grad_norm": 0.22676762926695837, + "learning_rate": 0.00027516790758053334, + "loss": 3.0013465881347656, + "step": 7728, + "token_acc": 0.298534249900579 + }, + { + "epoch": 4.530636177074172, + "grad_norm": 0.21562627933791229, + "learning_rate": 0.0002751598953274293, + "loss": 3.014930248260498, + "step": 7729, + "token_acc": 0.2971793523724674 + }, + { + "epoch": 4.531222515391381, + "grad_norm": 0.20665000134286518, + "learning_rate": 0.00027515188189861675, + "loss": 2.9630062580108643, + "step": 7730, + "token_acc": 0.30416323217976843 + }, + { + "epoch": 4.53180885370859, + "grad_norm": 0.21451991595145756, + "learning_rate": 0.0002751438672941709, + "loss": 2.946467399597168, + "step": 7731, + "token_acc": 0.30592124959323136 + }, + { + "epoch": 4.532395192025799, + "grad_norm": 0.22673265597390274, + "learning_rate": 0.00027513585151416704, + "loss": 3.014478921890259, + "step": 7732, + "token_acc": 0.2957839856160133 + }, + { + "epoch": 4.532981530343008, + "grad_norm": 0.22382510372097542, + "learning_rate": 0.0002751278345586805, + "loss": 2.970081329345703, + "step": 7733, + "token_acc": 0.3036868260813577 + }, + { + "epoch": 4.533567868660217, + "grad_norm": 0.2098061907932322, + "learning_rate": 0.00027511981642778653, + "loss": 2.9808945655822754, + "step": 7734, + "token_acc": 0.30107383949748484 + }, + { + "epoch": 4.534154206977426, + "grad_norm": 0.22543071239948761, + "learning_rate": 0.0002751117971215606, + "loss": 2.9669575691223145, + "step": 7735, + "token_acc": 0.30284533809295716 + }, + { + "epoch": 4.534740545294635, + "grad_norm": 0.2211636838214423, + "learning_rate": 0.0002751037766400778, + "loss": 2.990981340408325, + "step": 7736, + "token_acc": 0.2997858211445115 + }, + { + "epoch": 4.535326883611844, + "grad_norm": 0.23426176027712597, + "learning_rate": 0.0002750957549834136, + "loss": 3.0035042762756348, + "step": 7737, + "token_acc": 0.29749086529733487 + }, + { + "epoch": 4.535913221929053, + "grad_norm": 0.22318727840354854, + "learning_rate": 0.0002750877321516434, + "loss": 2.9733495712280273, + "step": 7738, + "token_acc": 0.30123730049250746 + }, + { + "epoch": 4.536499560246262, + "grad_norm": 0.23396894850914174, + "learning_rate": 0.0002750797081448425, + "loss": 2.9857406616210938, + "step": 7739, + "token_acc": 0.3015606120550465 + }, + { + "epoch": 4.537085898563471, + "grad_norm": 0.20553700772355055, + "learning_rate": 0.00027507168296308625, + "loss": 2.931966781616211, + "step": 7740, + "token_acc": 0.3084519700423315 + }, + { + "epoch": 4.53767223688068, + "grad_norm": 0.24434709846180402, + "learning_rate": 0.00027506365660645007, + "loss": 2.9528136253356934, + "step": 7741, + "token_acc": 0.3074063909037444 + }, + { + "epoch": 4.538258575197889, + "grad_norm": 0.2422797190611248, + "learning_rate": 0.00027505562907500944, + "loss": 3.0136547088623047, + "step": 7742, + "token_acc": 0.2971237185491383 + }, + { + "epoch": 4.538844913515098, + "grad_norm": 0.22278820537780417, + "learning_rate": 0.00027504760036883966, + "loss": 2.9406051635742188, + "step": 7743, + "token_acc": 0.3065464739185401 + }, + { + "epoch": 4.5394312518323074, + "grad_norm": 0.2305967651309063, + "learning_rate": 0.00027503957048801613, + "loss": 3.0179290771484375, + "step": 7744, + "token_acc": 0.2971057833645902 + }, + { + "epoch": 4.540017590149517, + "grad_norm": 0.24872656815490563, + "learning_rate": 0.0002750315394326144, + "loss": 3.0021023750305176, + "step": 7745, + "token_acc": 0.2972013891068085 + }, + { + "epoch": 4.540603928466726, + "grad_norm": 0.20310344137934316, + "learning_rate": 0.0002750235072027098, + "loss": 2.9927072525024414, + "step": 7746, + "token_acc": 0.2999105996154992 + }, + { + "epoch": 4.541190266783934, + "grad_norm": 0.25291785059097727, + "learning_rate": 0.00027501547379837785, + "loss": 3.0082972049713135, + "step": 7747, + "token_acc": 0.29722700295590837 + }, + { + "epoch": 4.541776605101143, + "grad_norm": 0.24245550744996894, + "learning_rate": 0.00027500743921969395, + "loss": 3.0299525260925293, + "step": 7748, + "token_acc": 0.2947659151085473 + }, + { + "epoch": 4.542362943418352, + "grad_norm": 0.22409731597927587, + "learning_rate": 0.0002749994034667336, + "loss": 2.998142957687378, + "step": 7749, + "token_acc": 0.29922359307530383 + }, + { + "epoch": 4.542949281735561, + "grad_norm": 0.23226915357677802, + "learning_rate": 0.00027499136653957233, + "loss": 2.968794345855713, + "step": 7750, + "token_acc": 0.3026495576437181 + }, + { + "epoch": 4.54353562005277, + "grad_norm": 0.2436078436062602, + "learning_rate": 0.0002749833284382856, + "loss": 2.9623961448669434, + "step": 7751, + "token_acc": 0.30542321649136533 + }, + { + "epoch": 4.5441219583699795, + "grad_norm": 0.2295361359441344, + "learning_rate": 0.0002749752891629489, + "loss": 2.9902424812316895, + "step": 7752, + "token_acc": 0.30026955167368513 + }, + { + "epoch": 4.544708296687189, + "grad_norm": 0.20533540148340074, + "learning_rate": 0.0002749672487136377, + "loss": 2.9729013442993164, + "step": 7753, + "token_acc": 0.3029422121466814 + }, + { + "epoch": 4.545294635004398, + "grad_norm": 0.22786810106419103, + "learning_rate": 0.00027495920709042773, + "loss": 2.9648854732513428, + "step": 7754, + "token_acc": 0.3034954152235937 + }, + { + "epoch": 4.545880973321607, + "grad_norm": 0.21769783050452754, + "learning_rate": 0.0002749511642933943, + "loss": 2.9705238342285156, + "step": 7755, + "token_acc": 0.30420145080772854 + }, + { + "epoch": 4.546467311638816, + "grad_norm": 0.2210949243093908, + "learning_rate": 0.0002749431203226131, + "loss": 2.9676127433776855, + "step": 7756, + "token_acc": 0.302537440831846 + }, + { + "epoch": 4.547053649956025, + "grad_norm": 0.20198459361942897, + "learning_rate": 0.0002749350751781596, + "loss": 2.946559429168701, + "step": 7757, + "token_acc": 0.30534744592728946 + }, + { + "epoch": 4.547639988273234, + "grad_norm": 0.2163074124529526, + "learning_rate": 0.0002749270288601094, + "loss": 2.9771487712860107, + "step": 7758, + "token_acc": 0.30121539087406574 + }, + { + "epoch": 4.548226326590442, + "grad_norm": 0.20295629311021735, + "learning_rate": 0.00027491898136853816, + "loss": 2.948281764984131, + "step": 7759, + "token_acc": 0.30665099941879564 + }, + { + "epoch": 4.5488126649076515, + "grad_norm": 0.22402314450241137, + "learning_rate": 0.0002749109327035214, + "loss": 2.942823886871338, + "step": 7760, + "token_acc": 0.30771558807366484 + }, + { + "epoch": 4.549399003224861, + "grad_norm": 0.20186083429482596, + "learning_rate": 0.0002749028828651348, + "loss": 3.0131897926330566, + "step": 7761, + "token_acc": 0.29507023819911005 + }, + { + "epoch": 4.54998534154207, + "grad_norm": 0.21582200660148892, + "learning_rate": 0.0002748948318534539, + "loss": 2.982340097427368, + "step": 7762, + "token_acc": 0.30070976636420343 + }, + { + "epoch": 4.550571679859279, + "grad_norm": 0.21652509836385608, + "learning_rate": 0.0002748867796685543, + "loss": 2.9687228202819824, + "step": 7763, + "token_acc": 0.3027524906683853 + }, + { + "epoch": 4.551158018176488, + "grad_norm": 0.2352780849569731, + "learning_rate": 0.0002748787263105117, + "loss": 2.9721882343292236, + "step": 7764, + "token_acc": 0.3029697628516561 + }, + { + "epoch": 4.551744356493697, + "grad_norm": 0.24564396910040026, + "learning_rate": 0.00027487067177940183, + "loss": 2.981130361557007, + "step": 7765, + "token_acc": 0.30254945409758804 + }, + { + "epoch": 4.552330694810906, + "grad_norm": 0.2199831596729623, + "learning_rate": 0.0002748626160753002, + "loss": 3.0031003952026367, + "step": 7766, + "token_acc": 0.2989423363576164 + }, + { + "epoch": 4.552917033128115, + "grad_norm": 0.22875975250252137, + "learning_rate": 0.0002748545591982825, + "loss": 2.9655323028564453, + "step": 7767, + "token_acc": 0.3040906316793439 + }, + { + "epoch": 4.5535033714453235, + "grad_norm": 0.2474565807113134, + "learning_rate": 0.00027484650114842455, + "loss": 2.9587340354919434, + "step": 7768, + "token_acc": 0.30253816163004865 + }, + { + "epoch": 4.554089709762533, + "grad_norm": 0.22263978525699793, + "learning_rate": 0.0002748384419258019, + "loss": 2.975879192352295, + "step": 7769, + "token_acc": 0.3028028147055948 + }, + { + "epoch": 4.554676048079742, + "grad_norm": 0.2270013576657166, + "learning_rate": 0.00027483038153049036, + "loss": 3.033445358276367, + "step": 7770, + "token_acc": 0.2938889250602253 + }, + { + "epoch": 4.555262386396951, + "grad_norm": 0.23130207950435072, + "learning_rate": 0.00027482231996256554, + "loss": 2.991196870803833, + "step": 7771, + "token_acc": 0.29969369273385893 + }, + { + "epoch": 4.55584872471416, + "grad_norm": 0.22422879078213484, + "learning_rate": 0.0002748142572221032, + "loss": 2.9923620223999023, + "step": 7772, + "token_acc": 0.29871994164858273 + }, + { + "epoch": 4.556435063031369, + "grad_norm": 0.2043115435315682, + "learning_rate": 0.0002748061933091792, + "loss": 2.993715524673462, + "step": 7773, + "token_acc": 0.2984629132209556 + }, + { + "epoch": 4.557021401348578, + "grad_norm": 0.21793277441441922, + "learning_rate": 0.0002747981282238691, + "loss": 2.9579083919525146, + "step": 7774, + "token_acc": 0.3042524162045227 + }, + { + "epoch": 4.557607739665787, + "grad_norm": 0.20317010800838273, + "learning_rate": 0.0002747900619662488, + "loss": 2.9364981651306152, + "step": 7775, + "token_acc": 0.3080165027970621 + }, + { + "epoch": 4.558194077982996, + "grad_norm": 0.21498713478707232, + "learning_rate": 0.000274781994536394, + "loss": 2.983680486679077, + "step": 7776, + "token_acc": 0.3004606381874646 + }, + { + "epoch": 4.5587804163002055, + "grad_norm": 0.21118297123661633, + "learning_rate": 0.00027477392593438057, + "loss": 2.974483013153076, + "step": 7777, + "token_acc": 0.30197589523699153 + }, + { + "epoch": 4.559366754617415, + "grad_norm": 0.21325495347960452, + "learning_rate": 0.0002747658561602842, + "loss": 2.9670441150665283, + "step": 7778, + "token_acc": 0.304626014215563 + }, + { + "epoch": 4.559953092934624, + "grad_norm": 0.21489192864117126, + "learning_rate": 0.0002747577852141807, + "loss": 2.9906210899353027, + "step": 7779, + "token_acc": 0.29946914930509244 + }, + { + "epoch": 4.560539431251832, + "grad_norm": 0.20991353451988473, + "learning_rate": 0.00027474971309614596, + "loss": 2.950576066970825, + "step": 7780, + "token_acc": 0.3057473860001996 + }, + { + "epoch": 4.561125769569041, + "grad_norm": 0.24948168144597818, + "learning_rate": 0.0002747416398062557, + "loss": 3.002779722213745, + "step": 7781, + "token_acc": 0.2974535010940919 + }, + { + "epoch": 4.56171210788625, + "grad_norm": 0.2192479419685926, + "learning_rate": 0.0002747335653445859, + "loss": 3.002641201019287, + "step": 7782, + "token_acc": 0.2991641638879911 + }, + { + "epoch": 4.562298446203459, + "grad_norm": 0.22773612919754305, + "learning_rate": 0.00027472548971121236, + "loss": 2.986563205718994, + "step": 7783, + "token_acc": 0.3012086989226814 + }, + { + "epoch": 4.562884784520668, + "grad_norm": 0.2258180741571366, + "learning_rate": 0.0002747174129062109, + "loss": 2.981994867324829, + "step": 7784, + "token_acc": 0.3014281811559009 + }, + { + "epoch": 4.5634711228378775, + "grad_norm": 0.23262902462586132, + "learning_rate": 0.00027470933492965735, + "loss": 2.9737510681152344, + "step": 7785, + "token_acc": 0.3018738556030463 + }, + { + "epoch": 4.564057461155087, + "grad_norm": 0.22576386510162016, + "learning_rate": 0.0002747012557816277, + "loss": 2.987853765487671, + "step": 7786, + "token_acc": 0.29935785595959963 + }, + { + "epoch": 4.564643799472296, + "grad_norm": 0.20651870844602707, + "learning_rate": 0.00027469317546219773, + "loss": 3.0065269470214844, + "step": 7787, + "token_acc": 0.2967583873014594 + }, + { + "epoch": 4.565230137789505, + "grad_norm": 0.2298909664835203, + "learning_rate": 0.0002746850939714434, + "loss": 2.9651455879211426, + "step": 7788, + "token_acc": 0.30302198021966886 + }, + { + "epoch": 4.565816476106714, + "grad_norm": 0.20922893672470744, + "learning_rate": 0.0002746770113094407, + "loss": 3.0190906524658203, + "step": 7789, + "token_acc": 0.29530434970330055 + }, + { + "epoch": 4.566402814423922, + "grad_norm": 0.21965201644410878, + "learning_rate": 0.00027466892747626543, + "loss": 3.0048913955688477, + "step": 7790, + "token_acc": 0.29941258308857627 + }, + { + "epoch": 4.566989152741131, + "grad_norm": 0.21991177847115062, + "learning_rate": 0.0002746608424719936, + "loss": 2.977902889251709, + "step": 7791, + "token_acc": 0.3004478754335399 + }, + { + "epoch": 4.56757549105834, + "grad_norm": 0.23398431057206012, + "learning_rate": 0.00027465275629670117, + "loss": 3.042297840118408, + "step": 7792, + "token_acc": 0.2928192147768667 + }, + { + "epoch": 4.5681618293755495, + "grad_norm": 0.2516246091786564, + "learning_rate": 0.00027464466895046403, + "loss": 2.9790143966674805, + "step": 7793, + "token_acc": 0.3018321709716621 + }, + { + "epoch": 4.568748167692759, + "grad_norm": 0.22211053556914961, + "learning_rate": 0.0002746365804333582, + "loss": 2.976900339126587, + "step": 7794, + "token_acc": 0.3006594473799628 + }, + { + "epoch": 4.569334506009968, + "grad_norm": 0.2281844933762865, + "learning_rate": 0.0002746284907454596, + "loss": 2.976524829864502, + "step": 7795, + "token_acc": 0.30310081728370164 + }, + { + "epoch": 4.569920844327177, + "grad_norm": 0.26458863847915837, + "learning_rate": 0.00027462039988684434, + "loss": 2.9792592525482178, + "step": 7796, + "token_acc": 0.30117043148742895 + }, + { + "epoch": 4.570507182644386, + "grad_norm": 0.21930711607914374, + "learning_rate": 0.00027461230785758825, + "loss": 2.977689266204834, + "step": 7797, + "token_acc": 0.3009856958850247 + }, + { + "epoch": 4.571093520961595, + "grad_norm": 0.20544541006345346, + "learning_rate": 0.00027460421465776754, + "loss": 2.944749116897583, + "step": 7798, + "token_acc": 0.3055804315228257 + }, + { + "epoch": 4.571679859278804, + "grad_norm": 0.21353399368099651, + "learning_rate": 0.0002745961202874581, + "loss": 2.968803882598877, + "step": 7799, + "token_acc": 0.3019290236309312 + }, + { + "epoch": 4.572266197596013, + "grad_norm": 0.20554205299875458, + "learning_rate": 0.000274588024746736, + "loss": 2.999549627304077, + "step": 7800, + "token_acc": 0.2986383918062886 + }, + { + "epoch": 4.572852535913222, + "grad_norm": 0.22332571881470661, + "learning_rate": 0.00027457992803567733, + "loss": 2.9907350540161133, + "step": 7801, + "token_acc": 0.3002282982091513 + }, + { + "epoch": 4.573438874230431, + "grad_norm": 0.22779701637785044, + "learning_rate": 0.00027457183015435805, + "loss": 2.999965190887451, + "step": 7802, + "token_acc": 0.3001837693247399 + }, + { + "epoch": 4.57402521254764, + "grad_norm": 0.20393838001588763, + "learning_rate": 0.00027456373110285433, + "loss": 3.0043723583221436, + "step": 7803, + "token_acc": 0.297095678095376 + }, + { + "epoch": 4.574611550864849, + "grad_norm": 0.2302243853954627, + "learning_rate": 0.0002745556308812422, + "loss": 2.962055206298828, + "step": 7804, + "token_acc": 0.3049126164280602 + }, + { + "epoch": 4.575197889182058, + "grad_norm": 0.2170932482851595, + "learning_rate": 0.00027454752948959777, + "loss": 2.9923315048217773, + "step": 7805, + "token_acc": 0.29908891473897053 + }, + { + "epoch": 4.575784227499267, + "grad_norm": 0.23564919019023953, + "learning_rate": 0.00027453942692799715, + "loss": 2.9985883235931396, + "step": 7806, + "token_acc": 0.29944643215231825 + }, + { + "epoch": 4.576370565816476, + "grad_norm": 0.24257634449575938, + "learning_rate": 0.0002745313231965163, + "loss": 3.0147128105163574, + "step": 7807, + "token_acc": 0.2967742421125001 + }, + { + "epoch": 4.576956904133685, + "grad_norm": 0.23273475083922707, + "learning_rate": 0.00027452321829523165, + "loss": 2.9455080032348633, + "step": 7808, + "token_acc": 0.30826714061111493 + }, + { + "epoch": 4.577543242450894, + "grad_norm": 0.24325756814840865, + "learning_rate": 0.0002745151122242191, + "loss": 3.002279758453369, + "step": 7809, + "token_acc": 0.29774522521382396 + }, + { + "epoch": 4.5781295807681035, + "grad_norm": 0.22962941851202368, + "learning_rate": 0.0002745070049835548, + "loss": 2.9595866203308105, + "step": 7810, + "token_acc": 0.30415173015349595 + }, + { + "epoch": 4.578715919085313, + "grad_norm": 0.23537222181844633, + "learning_rate": 0.000274498896573315, + "loss": 2.9786906242370605, + "step": 7811, + "token_acc": 0.30180579990384565 + }, + { + "epoch": 4.579302257402521, + "grad_norm": 0.23127312028453179, + "learning_rate": 0.0002744907869935759, + "loss": 2.956346273422241, + "step": 7812, + "token_acc": 0.30532438847752746 + }, + { + "epoch": 4.57988859571973, + "grad_norm": 0.21239023140471866, + "learning_rate": 0.00027448267624441354, + "loss": 2.9552979469299316, + "step": 7813, + "token_acc": 0.30299301755048125 + }, + { + "epoch": 4.580474934036939, + "grad_norm": 0.20728185143972744, + "learning_rate": 0.0002744745643259042, + "loss": 2.947542667388916, + "step": 7814, + "token_acc": 0.3054927718299235 + }, + { + "epoch": 4.581061272354148, + "grad_norm": 0.20450771141946977, + "learning_rate": 0.000274466451238124, + "loss": 3.003728151321411, + "step": 7815, + "token_acc": 0.298947644891393 + }, + { + "epoch": 4.581647610671357, + "grad_norm": 0.2020427747403104, + "learning_rate": 0.0002744583369811493, + "loss": 2.9895215034484863, + "step": 7816, + "token_acc": 0.30023551039803076 + }, + { + "epoch": 4.582233948988566, + "grad_norm": 0.22090675935548498, + "learning_rate": 0.0002744502215550562, + "loss": 3.0046422481536865, + "step": 7817, + "token_acc": 0.29893823722901336 + }, + { + "epoch": 4.5828202873057755, + "grad_norm": 0.19714614916653708, + "learning_rate": 0.000274442104959921, + "loss": 2.9525949954986572, + "step": 7818, + "token_acc": 0.3044728848375575 + }, + { + "epoch": 4.583406625622985, + "grad_norm": 0.23385437967076847, + "learning_rate": 0.00027443398719581986, + "loss": 2.9863271713256836, + "step": 7819, + "token_acc": 0.30175533194302373 + }, + { + "epoch": 4.583992963940194, + "grad_norm": 0.2028404990582151, + "learning_rate": 0.00027442586826282906, + "loss": 2.977673292160034, + "step": 7820, + "token_acc": 0.3018352425927726 + }, + { + "epoch": 4.584579302257403, + "grad_norm": 0.21577757454060997, + "learning_rate": 0.00027441774816102494, + "loss": 2.9683423042297363, + "step": 7821, + "token_acc": 0.3039007417369687 + }, + { + "epoch": 4.585165640574612, + "grad_norm": 0.19794335142791233, + "learning_rate": 0.00027440962689048373, + "loss": 2.999927520751953, + "step": 7822, + "token_acc": 0.2981070087609512 + }, + { + "epoch": 4.585751978891821, + "grad_norm": 0.22134803873022552, + "learning_rate": 0.0002744015044512817, + "loss": 2.9816393852233887, + "step": 7823, + "token_acc": 0.3028717977337125 + }, + { + "epoch": 4.586338317209029, + "grad_norm": 0.19659378541643652, + "learning_rate": 0.0002743933808434952, + "loss": 2.963651180267334, + "step": 7824, + "token_acc": 0.3026955661810434 + }, + { + "epoch": 4.586924655526238, + "grad_norm": 0.209383267507122, + "learning_rate": 0.00027438525606720047, + "loss": 3.0111918449401855, + "step": 7825, + "token_acc": 0.2971862660990666 + }, + { + "epoch": 4.5875109938434475, + "grad_norm": 0.1950207916267565, + "learning_rate": 0.0002743771301224739, + "loss": 2.9721925258636475, + "step": 7826, + "token_acc": 0.3031967873188099 + }, + { + "epoch": 4.588097332160657, + "grad_norm": 0.20871373709529475, + "learning_rate": 0.0002743690030093918, + "loss": 2.946967840194702, + "step": 7827, + "token_acc": 0.3060509212362024 + }, + { + "epoch": 4.588683670477866, + "grad_norm": 0.2162716324751471, + "learning_rate": 0.0002743608747280305, + "loss": 2.990880012512207, + "step": 7828, + "token_acc": 0.3000012745510394 + }, + { + "epoch": 4.589270008795075, + "grad_norm": 0.19548575533432863, + "learning_rate": 0.00027435274527846633, + "loss": 2.9647111892700195, + "step": 7829, + "token_acc": 0.303836109214473 + }, + { + "epoch": 4.589856347112284, + "grad_norm": 0.2299900436009014, + "learning_rate": 0.0002743446146607757, + "loss": 3.017063617706299, + "step": 7830, + "token_acc": 0.2952265225616874 + }, + { + "epoch": 4.590442685429493, + "grad_norm": 0.23810486831795422, + "learning_rate": 0.000274336482875035, + "loss": 2.9440646171569824, + "step": 7831, + "token_acc": 0.30581723122863697 + }, + { + "epoch": 4.591029023746702, + "grad_norm": 0.22189724423697985, + "learning_rate": 0.0002743283499213206, + "loss": 3.0022687911987305, + "step": 7832, + "token_acc": 0.2969015661678969 + }, + { + "epoch": 4.59161536206391, + "grad_norm": 0.24395286891682708, + "learning_rate": 0.0002743202157997088, + "loss": 2.932993173599243, + "step": 7833, + "token_acc": 0.30905270811391666 + }, + { + "epoch": 4.5922017003811195, + "grad_norm": 0.21818236462779567, + "learning_rate": 0.00027431208051027615, + "loss": 2.9698147773742676, + "step": 7834, + "token_acc": 0.303368093623314 + }, + { + "epoch": 4.592788038698329, + "grad_norm": 0.2403800737000575, + "learning_rate": 0.00027430394405309903, + "loss": 3.0163397789001465, + "step": 7835, + "token_acc": 0.29629178227834974 + }, + { + "epoch": 4.593374377015538, + "grad_norm": 0.22888299323431083, + "learning_rate": 0.0002742958064282539, + "loss": 2.987128257751465, + "step": 7836, + "token_acc": 0.30058310432624075 + }, + { + "epoch": 4.593960715332747, + "grad_norm": 0.22980685300617568, + "learning_rate": 0.00027428766763581703, + "loss": 2.986037254333496, + "step": 7837, + "token_acc": 0.3008650742414461 + }, + { + "epoch": 4.594547053649956, + "grad_norm": 0.21490515447494501, + "learning_rate": 0.00027427952767586513, + "loss": 2.965238571166992, + "step": 7838, + "token_acc": 0.3030383226917098 + }, + { + "epoch": 4.595133391967165, + "grad_norm": 0.22003182675776053, + "learning_rate": 0.00027427138654847447, + "loss": 3.0021140575408936, + "step": 7839, + "token_acc": 0.29780187003151304 + }, + { + "epoch": 4.595719730284374, + "grad_norm": 0.22369193224733933, + "learning_rate": 0.0002742632442537216, + "loss": 2.9564208984375, + "step": 7840, + "token_acc": 0.3048269680773863 + }, + { + "epoch": 4.596306068601583, + "grad_norm": 0.22286984454284345, + "learning_rate": 0.000274255100791683, + "loss": 2.9980783462524414, + "step": 7841, + "token_acc": 0.2984844592858978 + }, + { + "epoch": 4.596892406918792, + "grad_norm": 0.22505341687528505, + "learning_rate": 0.00027424695616243516, + "loss": 2.9832797050476074, + "step": 7842, + "token_acc": 0.30115075667917307 + }, + { + "epoch": 4.5974787452360015, + "grad_norm": 0.22150295354680086, + "learning_rate": 0.0002742388103660546, + "loss": 2.9829509258270264, + "step": 7843, + "token_acc": 0.30089812363125806 + }, + { + "epoch": 4.598065083553211, + "grad_norm": 0.22994982850211107, + "learning_rate": 0.0002742306634026178, + "loss": 2.9777042865753174, + "step": 7844, + "token_acc": 0.30211073330339383 + }, + { + "epoch": 4.598651421870419, + "grad_norm": 0.21963795006056303, + "learning_rate": 0.0002742225152722014, + "loss": 2.991011619567871, + "step": 7845, + "token_acc": 0.2991498148138165 + }, + { + "epoch": 4.599237760187628, + "grad_norm": 0.22300525072524863, + "learning_rate": 0.0002742143659748818, + "loss": 2.991039752960205, + "step": 7846, + "token_acc": 0.2993824318030518 + }, + { + "epoch": 4.599824098504837, + "grad_norm": 0.21211461369094334, + "learning_rate": 0.0002742062155107356, + "loss": 2.9675590991973877, + "step": 7847, + "token_acc": 0.3024256405427743 + }, + { + "epoch": 4.600410436822046, + "grad_norm": 0.22508377535817234, + "learning_rate": 0.0002741980638798394, + "loss": 3.0370826721191406, + "step": 7848, + "token_acc": 0.29364020910853994 + }, + { + "epoch": 4.600996775139255, + "grad_norm": 0.23910878013870746, + "learning_rate": 0.00027418991108226973, + "loss": 2.947202205657959, + "step": 7849, + "token_acc": 0.3061959559815481 + }, + { + "epoch": 4.601583113456464, + "grad_norm": 0.21117266750087316, + "learning_rate": 0.00027418175711810326, + "loss": 2.983290195465088, + "step": 7850, + "token_acc": 0.30187040606663984 + }, + { + "epoch": 4.6021694517736735, + "grad_norm": 0.21699359328477968, + "learning_rate": 0.0002741736019874164, + "loss": 2.9855363368988037, + "step": 7851, + "token_acc": 0.30096359688075935 + }, + { + "epoch": 4.602755790090883, + "grad_norm": 0.2146938981295693, + "learning_rate": 0.000274165445690286, + "loss": 3.001988172531128, + "step": 7852, + "token_acc": 0.2992727075851365 + }, + { + "epoch": 4.603342128408092, + "grad_norm": 0.22883815058911136, + "learning_rate": 0.0002741572882267885, + "loss": 2.9758567810058594, + "step": 7853, + "token_acc": 0.30150825123407066 + }, + { + "epoch": 4.603928466725301, + "grad_norm": 0.24656383987698396, + "learning_rate": 0.00027414912959700056, + "loss": 3.012301445007324, + "step": 7854, + "token_acc": 0.29631137694262843 + }, + { + "epoch": 4.604514805042509, + "grad_norm": 0.23331935213824062, + "learning_rate": 0.00027414096980099887, + "loss": 3.013822078704834, + "step": 7855, + "token_acc": 0.29604402180276046 + }, + { + "epoch": 4.605101143359718, + "grad_norm": 0.22829206542609534, + "learning_rate": 0.00027413280883886, + "loss": 2.953033924102783, + "step": 7856, + "token_acc": 0.30484906140012946 + }, + { + "epoch": 4.605687481676927, + "grad_norm": 0.21345169469453384, + "learning_rate": 0.0002741246467106607, + "loss": 2.9741950035095215, + "step": 7857, + "token_acc": 0.30215075610251646 + }, + { + "epoch": 4.606273819994136, + "grad_norm": 0.2383022616883737, + "learning_rate": 0.00027411648341647767, + "loss": 3.0167148113250732, + "step": 7858, + "token_acc": 0.29593030717084257 + }, + { + "epoch": 4.6068601583113455, + "grad_norm": 0.23026527222383206, + "learning_rate": 0.0002741083189563874, + "loss": 2.9817028045654297, + "step": 7859, + "token_acc": 0.2992523235306916 + }, + { + "epoch": 4.607446496628555, + "grad_norm": 0.23633052001149207, + "learning_rate": 0.0002741001533304668, + "loss": 2.970470666885376, + "step": 7860, + "token_acc": 0.30451676073886613 + }, + { + "epoch": 4.608032834945764, + "grad_norm": 0.21282532546358718, + "learning_rate": 0.0002740919865387924, + "loss": 3.0054333209991455, + "step": 7861, + "token_acc": 0.29650967220439906 + }, + { + "epoch": 4.608619173262973, + "grad_norm": 0.22498247755838438, + "learning_rate": 0.0002740838185814411, + "loss": 2.9955697059631348, + "step": 7862, + "token_acc": 0.29896585852445856 + }, + { + "epoch": 4.609205511580182, + "grad_norm": 0.20499326123338033, + "learning_rate": 0.0002740756494584895, + "loss": 2.973649501800537, + "step": 7863, + "token_acc": 0.3024995540789134 + }, + { + "epoch": 4.609791849897391, + "grad_norm": 0.22900751991262022, + "learning_rate": 0.00027406747917001434, + "loss": 2.984330654144287, + "step": 7864, + "token_acc": 0.300936870240607 + }, + { + "epoch": 4.6103781882146, + "grad_norm": 0.20986312679480085, + "learning_rate": 0.00027405930771609246, + "loss": 2.9691944122314453, + "step": 7865, + "token_acc": 0.3019572376049348 + }, + { + "epoch": 4.610964526531809, + "grad_norm": 0.23222456593172516, + "learning_rate": 0.0002740511350968005, + "loss": 2.9644699096679688, + "step": 7866, + "token_acc": 0.3032042484031838 + }, + { + "epoch": 4.6115508648490176, + "grad_norm": 0.21228348282596368, + "learning_rate": 0.00027404296131221527, + "loss": 3.017974615097046, + "step": 7867, + "token_acc": 0.2977157536624314 + }, + { + "epoch": 4.612137203166227, + "grad_norm": 0.22945964433191268, + "learning_rate": 0.0002740347863624136, + "loss": 3.0059428215026855, + "step": 7868, + "token_acc": 0.2987433574001389 + }, + { + "epoch": 4.612723541483436, + "grad_norm": 0.22122796979398598, + "learning_rate": 0.00027402661024747226, + "loss": 3.040830612182617, + "step": 7869, + "token_acc": 0.29267261449207105 + }, + { + "epoch": 4.613309879800645, + "grad_norm": 0.23375406089898634, + "learning_rate": 0.00027401843296746804, + "loss": 3.0091423988342285, + "step": 7870, + "token_acc": 0.2988432716795047 + }, + { + "epoch": 4.613896218117854, + "grad_norm": 0.22120853512211594, + "learning_rate": 0.00027401025452247773, + "loss": 2.9589972496032715, + "step": 7871, + "token_acc": 0.30538537459814985 + }, + { + "epoch": 4.614482556435063, + "grad_norm": 0.22418732022182805, + "learning_rate": 0.00027400207491257817, + "loss": 3.016993522644043, + "step": 7872, + "token_acc": 0.29486604348370404 + }, + { + "epoch": 4.615068894752272, + "grad_norm": 0.2070957040775095, + "learning_rate": 0.0002739938941378463, + "loss": 2.969886302947998, + "step": 7873, + "token_acc": 0.30299891919544003 + }, + { + "epoch": 4.615655233069481, + "grad_norm": 0.21857763810218245, + "learning_rate": 0.00027398571219835885, + "loss": 2.961629629135132, + "step": 7874, + "token_acc": 0.3047949306854805 + }, + { + "epoch": 4.6162415713866904, + "grad_norm": 0.21318996984907124, + "learning_rate": 0.0002739775290941927, + "loss": 2.974924087524414, + "step": 7875, + "token_acc": 0.30210956168652 + }, + { + "epoch": 4.616827909703899, + "grad_norm": 0.22567384349998743, + "learning_rate": 0.00027396934482542466, + "loss": 3.0118229389190674, + "step": 7876, + "token_acc": 0.29692728099141874 + }, + { + "epoch": 4.617414248021108, + "grad_norm": 0.21527238662508338, + "learning_rate": 0.00027396115939213174, + "loss": 2.9188170433044434, + "step": 7877, + "token_acc": 0.3106964274546804 + }, + { + "epoch": 4.618000586338317, + "grad_norm": 0.23920514899739181, + "learning_rate": 0.0002739529727943907, + "loss": 2.9834771156311035, + "step": 7878, + "token_acc": 0.2996421094580921 + }, + { + "epoch": 4.618586924655526, + "grad_norm": 0.22459202430526826, + "learning_rate": 0.0002739447850322786, + "loss": 2.9495389461517334, + "step": 7879, + "token_acc": 0.3059905696838525 + }, + { + "epoch": 4.619173262972735, + "grad_norm": 0.22470841974248387, + "learning_rate": 0.0002739365961058722, + "loss": 2.9653334617614746, + "step": 7880, + "token_acc": 0.30326915253885955 + }, + { + "epoch": 4.619759601289944, + "grad_norm": 0.22028763771626472, + "learning_rate": 0.00027392840601524855, + "loss": 3.022495985031128, + "step": 7881, + "token_acc": 0.2943040178260567 + }, + { + "epoch": 4.620345939607153, + "grad_norm": 0.21960757515597665, + "learning_rate": 0.00027392021476048444, + "loss": 2.989363193511963, + "step": 7882, + "token_acc": 0.29975820111977447 + }, + { + "epoch": 4.6209322779243625, + "grad_norm": 0.2378688109572106, + "learning_rate": 0.00027391202234165697, + "loss": 3.0106472969055176, + "step": 7883, + "token_acc": 0.2968451130871985 + }, + { + "epoch": 4.621518616241572, + "grad_norm": 0.21493326076900965, + "learning_rate": 0.00027390382875884295, + "loss": 3.0061895847320557, + "step": 7884, + "token_acc": 0.29759498317212 + }, + { + "epoch": 4.622104954558781, + "grad_norm": 0.23176526345190887, + "learning_rate": 0.0002738956340121195, + "loss": 3.021026372909546, + "step": 7885, + "token_acc": 0.29582684932095926 + }, + { + "epoch": 4.62269129287599, + "grad_norm": 0.24239896785230347, + "learning_rate": 0.00027388743810156344, + "loss": 2.9831275939941406, + "step": 7886, + "token_acc": 0.3007713109622071 + }, + { + "epoch": 4.623277631193199, + "grad_norm": 0.21538246335314448, + "learning_rate": 0.00027387924102725196, + "loss": 3.01284122467041, + "step": 7887, + "token_acc": 0.2968251691721971 + }, + { + "epoch": 4.623863969510407, + "grad_norm": 0.24420999795030499, + "learning_rate": 0.0002738710427892618, + "loss": 2.9943602085113525, + "step": 7888, + "token_acc": 0.2996280479404877 + }, + { + "epoch": 4.624450307827616, + "grad_norm": 0.24687625489877257, + "learning_rate": 0.0002738628433876702, + "loss": 3.004110336303711, + "step": 7889, + "token_acc": 0.2974887642556018 + }, + { + "epoch": 4.625036646144825, + "grad_norm": 0.25262657511549774, + "learning_rate": 0.00027385464282255405, + "loss": 2.990384101867676, + "step": 7890, + "token_acc": 0.2998030751574747 + }, + { + "epoch": 4.6256229844620345, + "grad_norm": 0.22527591476755743, + "learning_rate": 0.00027384644109399044, + "loss": 2.985879898071289, + "step": 7891, + "token_acc": 0.3001405297946894 + }, + { + "epoch": 4.626209322779244, + "grad_norm": 0.24057581210767137, + "learning_rate": 0.00027383823820205643, + "loss": 2.9474940299987793, + "step": 7892, + "token_acc": 0.3074963470494558 + }, + { + "epoch": 4.626795661096453, + "grad_norm": 0.21314238849281605, + "learning_rate": 0.000273830034146829, + "loss": 2.9497787952423096, + "step": 7893, + "token_acc": 0.3065112751698627 + }, + { + "epoch": 4.627381999413662, + "grad_norm": 0.23193421693615607, + "learning_rate": 0.0002738218289283853, + "loss": 2.974104404449463, + "step": 7894, + "token_acc": 0.3021748645348882 + }, + { + "epoch": 4.627968337730871, + "grad_norm": 0.226901147187191, + "learning_rate": 0.00027381362254680233, + "loss": 2.9980640411376953, + "step": 7895, + "token_acc": 0.2986349437948254 + }, + { + "epoch": 4.62855467604808, + "grad_norm": 0.20780275887493488, + "learning_rate": 0.00027380541500215727, + "loss": 2.960134983062744, + "step": 7896, + "token_acc": 0.3037701631653457 + }, + { + "epoch": 4.629141014365289, + "grad_norm": 0.2253003167627265, + "learning_rate": 0.00027379720629452714, + "loss": 2.9830193519592285, + "step": 7897, + "token_acc": 0.3016951504606155 + }, + { + "epoch": 4.629727352682497, + "grad_norm": 0.1984753509504225, + "learning_rate": 0.000273788996423989, + "loss": 2.9896974563598633, + "step": 7898, + "token_acc": 0.29836876150619196 + }, + { + "epoch": 4.6303136909997065, + "grad_norm": 0.2248781754928955, + "learning_rate": 0.00027378078539062016, + "loss": 3.002549171447754, + "step": 7899, + "token_acc": 0.29749584886501107 + }, + { + "epoch": 4.630900029316916, + "grad_norm": 0.20558541366780608, + "learning_rate": 0.0002737725731944976, + "loss": 2.9890408515930176, + "step": 7900, + "token_acc": 0.30151504522270983 + }, + { + "epoch": 4.631486367634125, + "grad_norm": 0.22522114441090846, + "learning_rate": 0.00027376435983569847, + "loss": 3.009610652923584, + "step": 7901, + "token_acc": 0.29667160673490023 + }, + { + "epoch": 4.632072705951334, + "grad_norm": 0.2483450278872476, + "learning_rate": 0.00027375614531430003, + "loss": 2.9677791595458984, + "step": 7902, + "token_acc": 0.30406625075212973 + }, + { + "epoch": 4.632659044268543, + "grad_norm": 0.20770817554320636, + "learning_rate": 0.0002737479296303793, + "loss": 2.9650983810424805, + "step": 7903, + "token_acc": 0.30539615647049906 + }, + { + "epoch": 4.633245382585752, + "grad_norm": 0.22437358006782393, + "learning_rate": 0.00027373971278401356, + "loss": 2.9858570098876953, + "step": 7904, + "token_acc": 0.2999925552902639 + }, + { + "epoch": 4.633831720902961, + "grad_norm": 0.23581521024405372, + "learning_rate": 0.0002737314947752799, + "loss": 2.984874963760376, + "step": 7905, + "token_acc": 0.30129264062112826 + }, + { + "epoch": 4.63441805922017, + "grad_norm": 0.22119730797346063, + "learning_rate": 0.00027372327560425564, + "loss": 2.9724478721618652, + "step": 7906, + "token_acc": 0.30100478985846507 + }, + { + "epoch": 4.635004397537379, + "grad_norm": 0.21372731126876, + "learning_rate": 0.0002737150552710179, + "loss": 2.968052387237549, + "step": 7907, + "token_acc": 0.3032832130565398 + }, + { + "epoch": 4.6355907358545885, + "grad_norm": 0.19528028068683326, + "learning_rate": 0.00027370683377564393, + "loss": 2.9111578464508057, + "step": 7908, + "token_acc": 0.309517366039971 + }, + { + "epoch": 4.636177074171798, + "grad_norm": 0.1936590538996269, + "learning_rate": 0.00027369861111821095, + "loss": 3.0161054134368896, + "step": 7909, + "token_acc": 0.2974640957733135 + }, + { + "epoch": 4.636763412489006, + "grad_norm": 0.21097598415478577, + "learning_rate": 0.0002736903872987962, + "loss": 2.970736503601074, + "step": 7910, + "token_acc": 0.3037290270583306 + }, + { + "epoch": 4.637349750806215, + "grad_norm": 0.19020597660761807, + "learning_rate": 0.0002736821623174769, + "loss": 2.9615070819854736, + "step": 7911, + "token_acc": 0.30549114643580577 + }, + { + "epoch": 4.637936089123424, + "grad_norm": 0.2299758337699185, + "learning_rate": 0.00027367393617433043, + "loss": 2.943321704864502, + "step": 7912, + "token_acc": 0.30642324286852424 + }, + { + "epoch": 4.638522427440633, + "grad_norm": 0.2274961769825738, + "learning_rate": 0.00027366570886943394, + "loss": 3.015221118927002, + "step": 7913, + "token_acc": 0.2953714123126533 + }, + { + "epoch": 4.639108765757842, + "grad_norm": 0.23127847267667376, + "learning_rate": 0.0002736574804028648, + "loss": 2.962968111038208, + "step": 7914, + "token_acc": 0.3029647328294563 + }, + { + "epoch": 4.639695104075051, + "grad_norm": 0.2204878669443697, + "learning_rate": 0.0002736492507747002, + "loss": 2.976640224456787, + "step": 7915, + "token_acc": 0.3010581551007061 + }, + { + "epoch": 4.6402814423922605, + "grad_norm": 0.21261445088989553, + "learning_rate": 0.0002736410199850175, + "loss": 2.960146188735962, + "step": 7916, + "token_acc": 0.30429978704190075 + }, + { + "epoch": 4.64086778070947, + "grad_norm": 0.21122696563166804, + "learning_rate": 0.0002736327880338941, + "loss": 2.986783742904663, + "step": 7917, + "token_acc": 0.30071990408447447 + }, + { + "epoch": 4.641454119026679, + "grad_norm": 0.21122985068569677, + "learning_rate": 0.0002736245549214072, + "loss": 2.9786078929901123, + "step": 7918, + "token_acc": 0.3016138576014159 + }, + { + "epoch": 4.642040457343887, + "grad_norm": 0.20958136558663987, + "learning_rate": 0.0002736163206476342, + "loss": 2.9917502403259277, + "step": 7919, + "token_acc": 0.29752297441880543 + }, + { + "epoch": 4.642626795661096, + "grad_norm": 0.20527415382194056, + "learning_rate": 0.0002736080852126524, + "loss": 3.037907123565674, + "step": 7920, + "token_acc": 0.2923290196287523 + }, + { + "epoch": 4.643213133978305, + "grad_norm": 0.20112905435383452, + "learning_rate": 0.0002735998486165393, + "loss": 2.9846224784851074, + "step": 7921, + "token_acc": 0.30134834610106115 + }, + { + "epoch": 4.643799472295514, + "grad_norm": 0.2170908996669939, + "learning_rate": 0.0002735916108593721, + "loss": 3.0106139183044434, + "step": 7922, + "token_acc": 0.29814833785791117 + }, + { + "epoch": 4.644385810612723, + "grad_norm": 0.21355784849917878, + "learning_rate": 0.0002735833719412283, + "loss": 2.976480722427368, + "step": 7923, + "token_acc": 0.30046614314936987 + }, + { + "epoch": 4.6449721489299325, + "grad_norm": 0.2132970589056935, + "learning_rate": 0.0002735751318621852, + "loss": 2.979160785675049, + "step": 7924, + "token_acc": 0.30112525547292834 + }, + { + "epoch": 4.645558487247142, + "grad_norm": 0.21953148166551797, + "learning_rate": 0.00027356689062232035, + "loss": 3.025585651397705, + "step": 7925, + "token_acc": 0.2943515866173485 + }, + { + "epoch": 4.646144825564351, + "grad_norm": 0.21988834165050689, + "learning_rate": 0.000273558648221711, + "loss": 3.0060176849365234, + "step": 7926, + "token_acc": 0.2976501171345198 + }, + { + "epoch": 4.64673116388156, + "grad_norm": 0.2297652147532245, + "learning_rate": 0.00027355040466043467, + "loss": 3.019186019897461, + "step": 7927, + "token_acc": 0.2949126426575435 + }, + { + "epoch": 4.647317502198769, + "grad_norm": 0.20581447729772764, + "learning_rate": 0.00027354215993856873, + "loss": 2.9856696128845215, + "step": 7928, + "token_acc": 0.2995927496247008 + }, + { + "epoch": 4.647903840515978, + "grad_norm": 0.23359021341170738, + "learning_rate": 0.0002735339140561907, + "loss": 2.971479892730713, + "step": 7929, + "token_acc": 0.3018986868395465 + }, + { + "epoch": 4.648490178833187, + "grad_norm": 0.2509271772205493, + "learning_rate": 0.000273525667013378, + "loss": 3.0536880493164062, + "step": 7930, + "token_acc": 0.2915995235759826 + }, + { + "epoch": 4.649076517150396, + "grad_norm": 0.2156042270930878, + "learning_rate": 0.0002735174188102081, + "loss": 3.0490574836730957, + "step": 7931, + "token_acc": 0.2927147027032672 + }, + { + "epoch": 4.6496628554676045, + "grad_norm": 0.25694675450277366, + "learning_rate": 0.00027350916944675856, + "loss": 2.9955108165740967, + "step": 7932, + "token_acc": 0.29926049778633196 + }, + { + "epoch": 4.650249193784814, + "grad_norm": 0.2500522912641065, + "learning_rate": 0.00027350091892310675, + "loss": 2.9811477661132812, + "step": 7933, + "token_acc": 0.30134318351573 + }, + { + "epoch": 4.650835532102023, + "grad_norm": 0.21620605054380906, + "learning_rate": 0.0002734926672393302, + "loss": 2.9665353298187256, + "step": 7934, + "token_acc": 0.30272887023353917 + }, + { + "epoch": 4.651421870419232, + "grad_norm": 0.23371372627505071, + "learning_rate": 0.0002734844143955065, + "loss": 2.954139232635498, + "step": 7935, + "token_acc": 0.30548425493323833 + }, + { + "epoch": 4.652008208736441, + "grad_norm": 0.21608538247003312, + "learning_rate": 0.00027347616039171313, + "loss": 2.9832916259765625, + "step": 7936, + "token_acc": 0.29999949384259517 + }, + { + "epoch": 4.65259454705365, + "grad_norm": 0.2507746314893533, + "learning_rate": 0.00027346790522802763, + "loss": 3.0002052783966064, + "step": 7937, + "token_acc": 0.29810756683781287 + }, + { + "epoch": 4.653180885370859, + "grad_norm": 0.2508412783959582, + "learning_rate": 0.0002734596489045275, + "loss": 2.979111909866333, + "step": 7938, + "token_acc": 0.3020699270699271 + }, + { + "epoch": 4.653767223688068, + "grad_norm": 0.2129941666855129, + "learning_rate": 0.00027345139142129037, + "loss": 2.970345973968506, + "step": 7939, + "token_acc": 0.3032420577998242 + }, + { + "epoch": 4.654353562005277, + "grad_norm": 0.2618629602286676, + "learning_rate": 0.0002734431327783937, + "loss": 3.0187361240386963, + "step": 7940, + "token_acc": 0.2961208763800097 + }, + { + "epoch": 4.654939900322486, + "grad_norm": 0.22971736973230863, + "learning_rate": 0.0002734348729759152, + "loss": 2.982077121734619, + "step": 7941, + "token_acc": 0.3004714113544011 + }, + { + "epoch": 4.655526238639695, + "grad_norm": 0.25889329874658945, + "learning_rate": 0.0002734266120139323, + "loss": 3.020895004272461, + "step": 7942, + "token_acc": 0.2934063645679777 + }, + { + "epoch": 4.656112576956904, + "grad_norm": 0.2701735401434714, + "learning_rate": 0.0002734183498925228, + "loss": 2.9888343811035156, + "step": 7943, + "token_acc": 0.29952741616362744 + }, + { + "epoch": 4.656698915274113, + "grad_norm": 0.21434114132529647, + "learning_rate": 0.00027341008661176423, + "loss": 2.9973511695861816, + "step": 7944, + "token_acc": 0.30127821679312405 + }, + { + "epoch": 4.657285253591322, + "grad_norm": 0.22631337654248745, + "learning_rate": 0.0002734018221717341, + "loss": 3.0231449604034424, + "step": 7945, + "token_acc": 0.2957242603153477 + }, + { + "epoch": 4.657871591908531, + "grad_norm": 0.2161885111566927, + "learning_rate": 0.0002733935565725102, + "loss": 2.9814815521240234, + "step": 7946, + "token_acc": 0.3022498900792502 + }, + { + "epoch": 4.65845793022574, + "grad_norm": 0.22383341434018186, + "learning_rate": 0.0002733852898141701, + "loss": 2.96597957611084, + "step": 7947, + "token_acc": 0.30511503644333776 + }, + { + "epoch": 4.659044268542949, + "grad_norm": 0.2187987145010744, + "learning_rate": 0.00027337702189679156, + "loss": 2.993025302886963, + "step": 7948, + "token_acc": 0.2987186212459504 + }, + { + "epoch": 4.6596306068601585, + "grad_norm": 0.24051137408879936, + "learning_rate": 0.000273368752820452, + "loss": 2.995567798614502, + "step": 7949, + "token_acc": 0.2980003599195658 + }, + { + "epoch": 4.660216945177368, + "grad_norm": 0.20882775280634566, + "learning_rate": 0.0002733604825852293, + "loss": 2.988506555557251, + "step": 7950, + "token_acc": 0.3006173147464966 + }, + { + "epoch": 4.660803283494577, + "grad_norm": 0.22230268418020196, + "learning_rate": 0.0002733522111912011, + "loss": 3.039658546447754, + "step": 7951, + "token_acc": 0.29181438440529106 + }, + { + "epoch": 4.661389621811786, + "grad_norm": 0.2101843486862066, + "learning_rate": 0.00027334393863844513, + "loss": 2.9997262954711914, + "step": 7952, + "token_acc": 0.29881092594584246 + }, + { + "epoch": 4.661975960128994, + "grad_norm": 0.22514878326196197, + "learning_rate": 0.00027333566492703903, + "loss": 2.9887073040008545, + "step": 7953, + "token_acc": 0.30017749455846193 + }, + { + "epoch": 4.662562298446203, + "grad_norm": 0.19654375651760705, + "learning_rate": 0.00027332739005706056, + "loss": 2.9747958183288574, + "step": 7954, + "token_acc": 0.30117640845535326 + }, + { + "epoch": 4.663148636763412, + "grad_norm": 0.21416790770461058, + "learning_rate": 0.0002733191140285874, + "loss": 2.9822826385498047, + "step": 7955, + "token_acc": 0.30093858641569476 + }, + { + "epoch": 4.663734975080621, + "grad_norm": 0.20841534737225642, + "learning_rate": 0.0002733108368416974, + "loss": 2.995788097381592, + "step": 7956, + "token_acc": 0.2987968819256259 + }, + { + "epoch": 4.6643213133978305, + "grad_norm": 0.22714883304825007, + "learning_rate": 0.00027330255849646826, + "loss": 2.9606761932373047, + "step": 7957, + "token_acc": 0.30402009419152276 + }, + { + "epoch": 4.66490765171504, + "grad_norm": 0.21841250031513512, + "learning_rate": 0.0002732942789929777, + "loss": 3.0253326892852783, + "step": 7958, + "token_acc": 0.2954864765380767 + }, + { + "epoch": 4.665493990032249, + "grad_norm": 0.19845677528562855, + "learning_rate": 0.0002732859983313035, + "loss": 2.9499659538269043, + "step": 7959, + "token_acc": 0.3043559981803472 + }, + { + "epoch": 4.666080328349458, + "grad_norm": 0.21658290055779053, + "learning_rate": 0.00027327771651152355, + "loss": 2.961183547973633, + "step": 7960, + "token_acc": 0.305185543145309 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.21055850794113065, + "learning_rate": 0.0002732694335337155, + "loss": 2.967160224914551, + "step": 7961, + "token_acc": 0.3036493274405804 + }, + { + "epoch": 4.667253004983876, + "grad_norm": 0.2172114874525312, + "learning_rate": 0.0002732611493979573, + "loss": 2.9934380054473877, + "step": 7962, + "token_acc": 0.298989679694574 + }, + { + "epoch": 4.667839343301084, + "grad_norm": 0.19295878333119915, + "learning_rate": 0.00027325286410432664, + "loss": 2.993499279022217, + "step": 7963, + "token_acc": 0.29988513911768705 + }, + { + "epoch": 4.668425681618293, + "grad_norm": 0.21053089314481063, + "learning_rate": 0.00027324457765290144, + "loss": 2.9784207344055176, + "step": 7964, + "token_acc": 0.30162871079919235 + }, + { + "epoch": 4.6690120199355025, + "grad_norm": 0.20696284240480398, + "learning_rate": 0.0002732362900437595, + "loss": 2.9707982540130615, + "step": 7965, + "token_acc": 0.3036265709156194 + }, + { + "epoch": 4.669598358252712, + "grad_norm": 0.20626097390912693, + "learning_rate": 0.0002732280012769787, + "loss": 3.0139966011047363, + "step": 7966, + "token_acc": 0.2987944415639413 + }, + { + "epoch": 4.670184696569921, + "grad_norm": 0.20826954749847715, + "learning_rate": 0.00027321971135263684, + "loss": 2.9702110290527344, + "step": 7967, + "token_acc": 0.3028619626548775 + }, + { + "epoch": 4.67077103488713, + "grad_norm": 0.21194127634520923, + "learning_rate": 0.00027321142027081184, + "loss": 2.9737303256988525, + "step": 7968, + "token_acc": 0.30234321639659895 + }, + { + "epoch": 4.671357373204339, + "grad_norm": 0.22638010308060727, + "learning_rate": 0.0002732031280315816, + "loss": 2.948225736618042, + "step": 7969, + "token_acc": 0.3060037892899339 + }, + { + "epoch": 4.671943711521548, + "grad_norm": 0.21506609732053453, + "learning_rate": 0.000273194834635024, + "loss": 2.9383599758148193, + "step": 7970, + "token_acc": 0.30870443487852356 + }, + { + "epoch": 4.672530049838757, + "grad_norm": 0.24901942576062572, + "learning_rate": 0.00027318654008121686, + "loss": 3.038952589035034, + "step": 7971, + "token_acc": 0.29367264326616027 + }, + { + "epoch": 4.673116388155966, + "grad_norm": 0.2702768988279559, + "learning_rate": 0.00027317824437023824, + "loss": 2.9810993671417236, + "step": 7972, + "token_acc": 0.30300101835110255 + }, + { + "epoch": 4.673702726473175, + "grad_norm": 0.21997267762676495, + "learning_rate": 0.000273169947502166, + "loss": 2.969866991043091, + "step": 7973, + "token_acc": 0.302663926675828 + }, + { + "epoch": 4.6742890647903845, + "grad_norm": 0.23652931365495775, + "learning_rate": 0.0002731616494770781, + "loss": 3.029613971710205, + "step": 7974, + "token_acc": 0.2952870394939917 + }, + { + "epoch": 4.674875403107593, + "grad_norm": 0.2628521175986634, + "learning_rate": 0.0002731533502950524, + "loss": 3.0189905166625977, + "step": 7975, + "token_acc": 0.2971794919821153 + }, + { + "epoch": 4.675461741424802, + "grad_norm": 0.2014823818164978, + "learning_rate": 0.0002731450499561669, + "loss": 3.0010294914245605, + "step": 7976, + "token_acc": 0.2973537242340412 + }, + { + "epoch": 4.676048079742011, + "grad_norm": 0.25947850335162304, + "learning_rate": 0.0002731367484604997, + "loss": 2.976067066192627, + "step": 7977, + "token_acc": 0.3007154213036566 + }, + { + "epoch": 4.67663441805922, + "grad_norm": 0.21624111047423913, + "learning_rate": 0.0002731284458081286, + "loss": 2.980353355407715, + "step": 7978, + "token_acc": 0.30337158256296953 + }, + { + "epoch": 4.677220756376429, + "grad_norm": 0.23859783833351592, + "learning_rate": 0.00027312014199913165, + "loss": 3.0396194458007812, + "step": 7979, + "token_acc": 0.2926389664247548 + }, + { + "epoch": 4.677807094693638, + "grad_norm": 0.21789768437574827, + "learning_rate": 0.0002731118370335869, + "loss": 2.959746837615967, + "step": 7980, + "token_acc": 0.30570397872688504 + }, + { + "epoch": 4.678393433010847, + "grad_norm": 0.21465867468670566, + "learning_rate": 0.00027310353091157237, + "loss": 2.9209461212158203, + "step": 7981, + "token_acc": 0.30912338583984533 + }, + { + "epoch": 4.6789797713280565, + "grad_norm": 0.21120978030354096, + "learning_rate": 0.000273095223633166, + "loss": 2.995389461517334, + "step": 7982, + "token_acc": 0.2997760778748432 + }, + { + "epoch": 4.679566109645266, + "grad_norm": 0.22340574434640098, + "learning_rate": 0.00027308691519844597, + "loss": 2.996685028076172, + "step": 7983, + "token_acc": 0.29921592945467157 + }, + { + "epoch": 4.680152447962474, + "grad_norm": 0.20751340054708936, + "learning_rate": 0.00027307860560749016, + "loss": 3.045945644378662, + "step": 7984, + "token_acc": 0.29261429461467564 + }, + { + "epoch": 4.680738786279683, + "grad_norm": 0.23364773205284, + "learning_rate": 0.0002730702948603767, + "loss": 2.9815478324890137, + "step": 7985, + "token_acc": 0.29951092479674796 + }, + { + "epoch": 4.681325124596892, + "grad_norm": 0.22445253211982355, + "learning_rate": 0.00027306198295718365, + "loss": 2.9784202575683594, + "step": 7986, + "token_acc": 0.30116526139156 + }, + { + "epoch": 4.681911462914101, + "grad_norm": 0.2193594392663409, + "learning_rate": 0.0002730536698979891, + "loss": 2.989856719970703, + "step": 7987, + "token_acc": 0.2990195053624067 + }, + { + "epoch": 4.68249780123131, + "grad_norm": 0.24382426202586993, + "learning_rate": 0.0002730453556828712, + "loss": 3.005772829055786, + "step": 7988, + "token_acc": 0.29937581994096424 + }, + { + "epoch": 4.683084139548519, + "grad_norm": 0.2666859932090609, + "learning_rate": 0.00027303704031190795, + "loss": 2.999368190765381, + "step": 7989, + "token_acc": 0.2990274867245834 + }, + { + "epoch": 4.6836704778657285, + "grad_norm": 0.2176027335373523, + "learning_rate": 0.00027302872378517755, + "loss": 2.9437031745910645, + "step": 7990, + "token_acc": 0.30454962348581566 + }, + { + "epoch": 4.684256816182938, + "grad_norm": 0.21290410372181298, + "learning_rate": 0.000273020406102758, + "loss": 2.9472849369049072, + "step": 7991, + "token_acc": 0.30590080376396783 + }, + { + "epoch": 4.684843154500147, + "grad_norm": 0.23209785580193537, + "learning_rate": 0.0002730120872647275, + "loss": 2.9363064765930176, + "step": 7992, + "token_acc": 0.30774810261623214 + }, + { + "epoch": 4.685429492817356, + "grad_norm": 0.24486586133460342, + "learning_rate": 0.00027300376727116426, + "loss": 2.9693081378936768, + "step": 7993, + "token_acc": 0.3037627750196539 + }, + { + "epoch": 4.686015831134565, + "grad_norm": 0.21502005844121452, + "learning_rate": 0.00027299544612214633, + "loss": 2.9695639610290527, + "step": 7994, + "token_acc": 0.30367876136828525 + }, + { + "epoch": 4.686602169451774, + "grad_norm": 0.24629947929931842, + "learning_rate": 0.00027298712381775193, + "loss": 3.0054609775543213, + "step": 7995, + "token_acc": 0.2982848919631728 + }, + { + "epoch": 4.687188507768982, + "grad_norm": 0.2630777632451376, + "learning_rate": 0.00027297880035805923, + "loss": 2.98844051361084, + "step": 7996, + "token_acc": 0.2987310172664864 + }, + { + "epoch": 4.687774846086191, + "grad_norm": 0.23916257726085585, + "learning_rate": 0.0002729704757431464, + "loss": 2.988590717315674, + "step": 7997, + "token_acc": 0.29893994262839385 + }, + { + "epoch": 4.6883611844034006, + "grad_norm": 0.21376240784310124, + "learning_rate": 0.0002729621499730917, + "loss": 2.962796449661255, + "step": 7998, + "token_acc": 0.304072253978698 + }, + { + "epoch": 4.68894752272061, + "grad_norm": 0.2557251449915677, + "learning_rate": 0.0002729538230479733, + "loss": 2.9704651832580566, + "step": 7999, + "token_acc": 0.30254340052057455 + }, + { + "epoch": 4.689533861037819, + "grad_norm": 0.22654198640257087, + "learning_rate": 0.00027294549496786934, + "loss": 3.0095033645629883, + "step": 8000, + "token_acc": 0.29787940396839696 + }, + { + "epoch": 4.690120199355028, + "grad_norm": 0.23369436041267638, + "learning_rate": 0.00027293716573285816, + "loss": 3.0713820457458496, + "step": 8001, + "token_acc": 0.2888325748697421 + }, + { + "epoch": 4.690706537672237, + "grad_norm": 0.2307123278198811, + "learning_rate": 0.000272928835343018, + "loss": 3.028798818588257, + "step": 8002, + "token_acc": 0.29379762018875794 + }, + { + "epoch": 4.691292875989446, + "grad_norm": 0.22054933620563386, + "learning_rate": 0.0002729205037984271, + "loss": 2.9625766277313232, + "step": 8003, + "token_acc": 0.3038897550405623 + }, + { + "epoch": 4.691879214306655, + "grad_norm": 0.23659716880942508, + "learning_rate": 0.00027291217109916364, + "loss": 2.9549880027770996, + "step": 8004, + "token_acc": 0.30579445159251484 + }, + { + "epoch": 4.692465552623864, + "grad_norm": 0.22905372649224817, + "learning_rate": 0.000272903837245306, + "loss": 2.971674680709839, + "step": 8005, + "token_acc": 0.30267043669494187 + }, + { + "epoch": 4.693051890941073, + "grad_norm": 0.23996159428642047, + "learning_rate": 0.0002728955022369324, + "loss": 3.0226359367370605, + "step": 8006, + "token_acc": 0.2969983787093265 + }, + { + "epoch": 4.693638229258282, + "grad_norm": 0.22981323716934784, + "learning_rate": 0.0002728871660741211, + "loss": 3.0161678791046143, + "step": 8007, + "token_acc": 0.2971675250731021 + }, + { + "epoch": 4.694224567575491, + "grad_norm": 0.21499079584913083, + "learning_rate": 0.0002728788287569506, + "loss": 2.9847888946533203, + "step": 8008, + "token_acc": 0.30068877757130164 + }, + { + "epoch": 4.6948109058927, + "grad_norm": 0.22724686993533613, + "learning_rate": 0.00027287049028549903, + "loss": 2.9753470420837402, + "step": 8009, + "token_acc": 0.30223073016369106 + }, + { + "epoch": 4.695397244209909, + "grad_norm": 0.20852963311093395, + "learning_rate": 0.00027286215065984475, + "loss": 2.974867105484009, + "step": 8010, + "token_acc": 0.30158437195959575 + }, + { + "epoch": 4.695983582527118, + "grad_norm": 0.22236619826920662, + "learning_rate": 0.00027285380988006615, + "loss": 3.038928508758545, + "step": 8011, + "token_acc": 0.2940437515908052 + }, + { + "epoch": 4.696569920844327, + "grad_norm": 0.24085254571339978, + "learning_rate": 0.0002728454679462415, + "loss": 3.0386176109313965, + "step": 8012, + "token_acc": 0.2920775564125489 + }, + { + "epoch": 4.697156259161536, + "grad_norm": 0.2330809368477201, + "learning_rate": 0.00027283712485844927, + "loss": 2.9869959354400635, + "step": 8013, + "token_acc": 0.2995997004209602 + }, + { + "epoch": 4.6977425974787455, + "grad_norm": 0.2272800090309307, + "learning_rate": 0.00027282878061676776, + "loss": 3.016906976699829, + "step": 8014, + "token_acc": 0.2957842962278611 + }, + { + "epoch": 4.698328935795955, + "grad_norm": 0.2451533334770203, + "learning_rate": 0.00027282043522127537, + "loss": 2.994156837463379, + "step": 8015, + "token_acc": 0.2986642489233487 + }, + { + "epoch": 4.698915274113164, + "grad_norm": 0.2321531741910977, + "learning_rate": 0.00027281208867205054, + "loss": 2.9755194187164307, + "step": 8016, + "token_acc": 0.30286852137544207 + }, + { + "epoch": 4.699501612430373, + "grad_norm": 0.22742654394833736, + "learning_rate": 0.00027280374096917156, + "loss": 3.000978469848633, + "step": 8017, + "token_acc": 0.2988068183300675 + }, + { + "epoch": 4.700087950747581, + "grad_norm": 0.2635632720363313, + "learning_rate": 0.00027279539211271696, + "loss": 2.9706058502197266, + "step": 8018, + "token_acc": 0.30589335317865973 + }, + { + "epoch": 4.70067428906479, + "grad_norm": 0.2249749411519614, + "learning_rate": 0.0002727870421027651, + "loss": 2.9328503608703613, + "step": 8019, + "token_acc": 0.30786230079681276 + }, + { + "epoch": 4.701260627381999, + "grad_norm": 0.25597774619735186, + "learning_rate": 0.00027277869093939445, + "loss": 2.9894392490386963, + "step": 8020, + "token_acc": 0.30166564177920463 + }, + { + "epoch": 4.701846965699208, + "grad_norm": 0.23737269335836553, + "learning_rate": 0.0002727703386226834, + "loss": 2.968196392059326, + "step": 8021, + "token_acc": 0.3024451524334965 + }, + { + "epoch": 4.7024333040164175, + "grad_norm": 0.24487329800361674, + "learning_rate": 0.0002727619851527105, + "loss": 3.0047922134399414, + "step": 8022, + "token_acc": 0.2982718564017695 + }, + { + "epoch": 4.703019642333627, + "grad_norm": 0.21445966686077036, + "learning_rate": 0.00027275363052955417, + "loss": 2.984370708465576, + "step": 8023, + "token_acc": 0.3003675492497914 + }, + { + "epoch": 4.703605980650836, + "grad_norm": 0.22382354594649123, + "learning_rate": 0.0002727452747532929, + "loss": 2.930237293243408, + "step": 8024, + "token_acc": 0.3080376000556087 + }, + { + "epoch": 4.704192318968045, + "grad_norm": 0.2112900591604549, + "learning_rate": 0.0002727369178240051, + "loss": 2.998049259185791, + "step": 8025, + "token_acc": 0.2992473817715395 + }, + { + "epoch": 4.704778657285254, + "grad_norm": 0.22146258098587585, + "learning_rate": 0.00027272855974176945, + "loss": 3.0344738960266113, + "step": 8026, + "token_acc": 0.2929223000488679 + }, + { + "epoch": 4.705364995602462, + "grad_norm": 0.2232619256832775, + "learning_rate": 0.0002727202005066643, + "loss": 2.9884607791900635, + "step": 8027, + "token_acc": 0.2995787767250887 + }, + { + "epoch": 4.705951333919671, + "grad_norm": 0.2203798126249059, + "learning_rate": 0.0002727118401187682, + "loss": 3.029386043548584, + "step": 8028, + "token_acc": 0.29271670543932404 + }, + { + "epoch": 4.70653767223688, + "grad_norm": 0.2424125213121547, + "learning_rate": 0.0002727034785781598, + "loss": 3.0184242725372314, + "step": 8029, + "token_acc": 0.2956051749221728 + }, + { + "epoch": 4.7071240105540895, + "grad_norm": 0.22413763688248656, + "learning_rate": 0.00027269511588491754, + "loss": 2.966338634490967, + "step": 8030, + "token_acc": 0.3038055087278253 + }, + { + "epoch": 4.707710348871299, + "grad_norm": 0.21755066898771866, + "learning_rate": 0.00027268675203912, + "loss": 2.9550273418426514, + "step": 8031, + "token_acc": 0.30404286719316154 + }, + { + "epoch": 4.708296687188508, + "grad_norm": 0.23536089490789863, + "learning_rate": 0.0002726783870408457, + "loss": 2.9618635177612305, + "step": 8032, + "token_acc": 0.304383621793699 + }, + { + "epoch": 4.708883025505717, + "grad_norm": 0.2319098542068753, + "learning_rate": 0.0002726700208901733, + "loss": 2.9721169471740723, + "step": 8033, + "token_acc": 0.30294178119684595 + }, + { + "epoch": 4.709469363822926, + "grad_norm": 0.2319185965026555, + "learning_rate": 0.0002726616535871814, + "loss": 2.994365692138672, + "step": 8034, + "token_acc": 0.30003728452322087 + }, + { + "epoch": 4.710055702140135, + "grad_norm": 0.24057991847969476, + "learning_rate": 0.0002726532851319485, + "loss": 2.973515272140503, + "step": 8035, + "token_acc": 0.30359241865327635 + }, + { + "epoch": 4.710642040457344, + "grad_norm": 0.2107305775809225, + "learning_rate": 0.00027264491552455325, + "loss": 2.96773624420166, + "step": 8036, + "token_acc": 0.30253070645453795 + }, + { + "epoch": 4.711228378774553, + "grad_norm": 0.24203236171735593, + "learning_rate": 0.00027263654476507434, + "loss": 3.0651187896728516, + "step": 8037, + "token_acc": 0.28943586199313576 + }, + { + "epoch": 4.711814717091762, + "grad_norm": 0.2124827569366108, + "learning_rate": 0.0002726281728535903, + "loss": 3.0008060932159424, + "step": 8038, + "token_acc": 0.2961468012636475 + }, + { + "epoch": 4.7124010554089715, + "grad_norm": 0.2516783809690654, + "learning_rate": 0.0002726197997901798, + "loss": 2.989532709121704, + "step": 8039, + "token_acc": 0.2999526666241198 + }, + { + "epoch": 4.71298739372618, + "grad_norm": 0.24149495261292253, + "learning_rate": 0.00027261142557492157, + "loss": 3.010761260986328, + "step": 8040, + "token_acc": 0.29638043039050116 + }, + { + "epoch": 4.713573732043389, + "grad_norm": 0.2416809913412415, + "learning_rate": 0.0002726030502078942, + "loss": 3.0658507347106934, + "step": 8041, + "token_acc": 0.28928612067984566 + }, + { + "epoch": 4.714160070360598, + "grad_norm": 0.23590198769124812, + "learning_rate": 0.0002725946736891764, + "loss": 2.9290599822998047, + "step": 8042, + "token_acc": 0.30786941238717386 + }, + { + "epoch": 4.714746408677807, + "grad_norm": 0.23480734618753724, + "learning_rate": 0.0002725862960188468, + "loss": 2.9555444717407227, + "step": 8043, + "token_acc": 0.3039568070623813 + }, + { + "epoch": 4.715332746995016, + "grad_norm": 0.2193194110996426, + "learning_rate": 0.00027257791719698414, + "loss": 2.9616951942443848, + "step": 8044, + "token_acc": 0.30490797053558766 + }, + { + "epoch": 4.715919085312225, + "grad_norm": 0.24527133346275176, + "learning_rate": 0.00027256953722366715, + "loss": 2.9864184856414795, + "step": 8045, + "token_acc": 0.30153216536775757 + }, + { + "epoch": 4.716505423629434, + "grad_norm": 0.20224920997425186, + "learning_rate": 0.0002725611560989745, + "loss": 3.042310953140259, + "step": 8046, + "token_acc": 0.2939957373752364 + }, + { + "epoch": 4.7170917619466435, + "grad_norm": 0.2462626820109491, + "learning_rate": 0.000272552773822985, + "loss": 2.96690034866333, + "step": 8047, + "token_acc": 0.3027801645593528 + }, + { + "epoch": 4.717678100263853, + "grad_norm": 0.22352432993628502, + "learning_rate": 0.0002725443903957772, + "loss": 3.0185132026672363, + "step": 8048, + "token_acc": 0.2964226030603625 + }, + { + "epoch": 4.718264438581061, + "grad_norm": 0.24612356666350665, + "learning_rate": 0.0002725360058174301, + "loss": 2.9798197746276855, + "step": 8049, + "token_acc": 0.3011730063199147 + }, + { + "epoch": 4.71885077689827, + "grad_norm": 0.24615218816816709, + "learning_rate": 0.0002725276200880223, + "loss": 3.0015792846679688, + "step": 8050, + "token_acc": 0.29817728738230725 + }, + { + "epoch": 4.719437115215479, + "grad_norm": 0.21594849687878387, + "learning_rate": 0.00027251923320763266, + "loss": 3.0045509338378906, + "step": 8051, + "token_acc": 0.2985994091826333 + }, + { + "epoch": 4.720023453532688, + "grad_norm": 0.23442586372261318, + "learning_rate": 0.00027251084517633983, + "loss": 2.9854891300201416, + "step": 8052, + "token_acc": 0.30055258795358264 + }, + { + "epoch": 4.720609791849897, + "grad_norm": 0.23892802843571193, + "learning_rate": 0.0002725024559942228, + "loss": 3.0064148902893066, + "step": 8053, + "token_acc": 0.2974782111180384 + }, + { + "epoch": 4.721196130167106, + "grad_norm": 0.2220125660499891, + "learning_rate": 0.0002724940656613602, + "loss": 2.9878458976745605, + "step": 8054, + "token_acc": 0.2991960274296524 + }, + { + "epoch": 4.7217824684843155, + "grad_norm": 0.23099055289930986, + "learning_rate": 0.00027248567417783096, + "loss": 2.9560627937316895, + "step": 8055, + "token_acc": 0.3056993457872745 + }, + { + "epoch": 4.722368806801525, + "grad_norm": 0.20849411239300442, + "learning_rate": 0.0002724772815437138, + "loss": 2.967928886413574, + "step": 8056, + "token_acc": 0.3035505121521061 + }, + { + "epoch": 4.722955145118734, + "grad_norm": 0.2180631513114469, + "learning_rate": 0.0002724688877590877, + "loss": 2.952347755432129, + "step": 8057, + "token_acc": 0.3060695761513072 + }, + { + "epoch": 4.723541483435943, + "grad_norm": 0.20721735193610177, + "learning_rate": 0.0002724604928240314, + "loss": 2.97845458984375, + "step": 8058, + "token_acc": 0.30116755527337724 + }, + { + "epoch": 4.724127821753152, + "grad_norm": 0.21686656030062104, + "learning_rate": 0.00027245209673862374, + "loss": 2.9854695796966553, + "step": 8059, + "token_acc": 0.29884981247743464 + }, + { + "epoch": 4.724714160070361, + "grad_norm": 0.24383148612551364, + "learning_rate": 0.0002724436995029437, + "loss": 3.012242317199707, + "step": 8060, + "token_acc": 0.2983414588264802 + }, + { + "epoch": 4.725300498387569, + "grad_norm": 0.21713807779667438, + "learning_rate": 0.00027243530111707004, + "loss": 2.961216926574707, + "step": 8061, + "token_acc": 0.3035239687641599 + }, + { + "epoch": 4.725886836704778, + "grad_norm": 0.21006561308870556, + "learning_rate": 0.0002724269015810818, + "loss": 2.9979164600372314, + "step": 8062, + "token_acc": 0.3003666692488445 + }, + { + "epoch": 4.7264731750219875, + "grad_norm": 0.2416855992976606, + "learning_rate": 0.0002724185008950577, + "loss": 3.023186206817627, + "step": 8063, + "token_acc": 0.2961096834799351 + }, + { + "epoch": 4.727059513339197, + "grad_norm": 0.2094687700245328, + "learning_rate": 0.0002724100990590768, + "loss": 2.9820547103881836, + "step": 8064, + "token_acc": 0.3020883020489891 + }, + { + "epoch": 4.727645851656406, + "grad_norm": 0.2094968697395024, + "learning_rate": 0.00027240169607321797, + "loss": 2.9976696968078613, + "step": 8065, + "token_acc": 0.29833065519247914 + }, + { + "epoch": 4.728232189973615, + "grad_norm": 0.21207830452132448, + "learning_rate": 0.0002723932919375601, + "loss": 3.010378837585449, + "step": 8066, + "token_acc": 0.2959176480723895 + }, + { + "epoch": 4.728818528290824, + "grad_norm": 0.21756340158423706, + "learning_rate": 0.0002723848866521822, + "loss": 3.0251855850219727, + "step": 8067, + "token_acc": 0.2950593200199518 + }, + { + "epoch": 4.729404866608033, + "grad_norm": 0.20257275390881424, + "learning_rate": 0.0002723764802171632, + "loss": 2.972163200378418, + "step": 8068, + "token_acc": 0.3028604556243322 + }, + { + "epoch": 4.729991204925242, + "grad_norm": 0.21146542771001683, + "learning_rate": 0.00027236807263258207, + "loss": 2.994256019592285, + "step": 8069, + "token_acc": 0.2998357675872186 + }, + { + "epoch": 4.730577543242451, + "grad_norm": 0.20894897888440442, + "learning_rate": 0.0002723596638985178, + "loss": 3.014697551727295, + "step": 8070, + "token_acc": 0.2971236561268474 + }, + { + "epoch": 4.7311638815596595, + "grad_norm": 0.2086270048059413, + "learning_rate": 0.0002723512540150494, + "loss": 2.959437847137451, + "step": 8071, + "token_acc": 0.30302420758643633 + }, + { + "epoch": 4.731750219876869, + "grad_norm": 0.2045100579126822, + "learning_rate": 0.0002723428429822558, + "loss": 2.97864031791687, + "step": 8072, + "token_acc": 0.30203557961604055 + }, + { + "epoch": 4.732336558194078, + "grad_norm": 0.21675172364814244, + "learning_rate": 0.00027233443080021603, + "loss": 2.98246431350708, + "step": 8073, + "token_acc": 0.3001004095653323 + }, + { + "epoch": 4.732922896511287, + "grad_norm": 0.23518894106907345, + "learning_rate": 0.0002723260174690092, + "loss": 3.003081798553467, + "step": 8074, + "token_acc": 0.29776394830314656 + }, + { + "epoch": 4.733509234828496, + "grad_norm": 0.19180934980549977, + "learning_rate": 0.00027231760298871425, + "loss": 3.024230480194092, + "step": 8075, + "token_acc": 0.2962506539243551 + }, + { + "epoch": 4.734095573145705, + "grad_norm": 0.24258637739096278, + "learning_rate": 0.0002723091873594102, + "loss": 2.9799065589904785, + "step": 8076, + "token_acc": 0.30042230959662153 + }, + { + "epoch": 4.734681911462914, + "grad_norm": 0.22082645500724632, + "learning_rate": 0.0002723007705811762, + "loss": 2.9562482833862305, + "step": 8077, + "token_acc": 0.3053178775026706 + }, + { + "epoch": 4.735268249780123, + "grad_norm": 0.20991252189127738, + "learning_rate": 0.0002722923526540912, + "loss": 3.039581775665283, + "step": 8078, + "token_acc": 0.2925579346013029 + }, + { + "epoch": 4.735854588097332, + "grad_norm": 0.21437221864604064, + "learning_rate": 0.00027228393357823437, + "loss": 2.9627161026000977, + "step": 8079, + "token_acc": 0.30497531438186326 + }, + { + "epoch": 4.7364409264145415, + "grad_norm": 0.21971334344857002, + "learning_rate": 0.00027227551335368475, + "loss": 3.0081300735473633, + "step": 8080, + "token_acc": 0.2966559967671268 + }, + { + "epoch": 4.737027264731751, + "grad_norm": 0.21075295148079304, + "learning_rate": 0.0002722670919805215, + "loss": 2.977956771850586, + "step": 8081, + "token_acc": 0.30253538848694556 + }, + { + "epoch": 4.73761360304896, + "grad_norm": 0.2097484888854789, + "learning_rate": 0.00027225866945882366, + "loss": 2.9872775077819824, + "step": 8082, + "token_acc": 0.30086087739228773 + }, + { + "epoch": 4.738199941366168, + "grad_norm": 0.2148846848056903, + "learning_rate": 0.0002722502457886703, + "loss": 2.9808614253997803, + "step": 8083, + "token_acc": 0.3015346340050482 + }, + { + "epoch": 4.738786279683377, + "grad_norm": 0.20979022357275806, + "learning_rate": 0.0002722418209701407, + "loss": 2.9707224369049072, + "step": 8084, + "token_acc": 0.30414855014969266 + }, + { + "epoch": 4.739372618000586, + "grad_norm": 0.19132545560111577, + "learning_rate": 0.0002722333950033139, + "loss": 2.989068031311035, + "step": 8085, + "token_acc": 0.3001100855279871 + }, + { + "epoch": 4.739958956317795, + "grad_norm": 0.21385066161289107, + "learning_rate": 0.00027222496788826905, + "loss": 2.998983144760132, + "step": 8086, + "token_acc": 0.29707095092614355 + }, + { + "epoch": 4.740545294635004, + "grad_norm": 0.19289893435702277, + "learning_rate": 0.00027221653962508527, + "loss": 2.979851245880127, + "step": 8087, + "token_acc": 0.30071025722468525 + }, + { + "epoch": 4.7411316329522135, + "grad_norm": 0.22666861844709615, + "learning_rate": 0.00027220811021384187, + "loss": 3.0059680938720703, + "step": 8088, + "token_acc": 0.2973458021739241 + }, + { + "epoch": 4.741717971269423, + "grad_norm": 0.20364376295513698, + "learning_rate": 0.00027219967965461795, + "loss": 3.0043649673461914, + "step": 8089, + "token_acc": 0.2981300163229816 + }, + { + "epoch": 4.742304309586632, + "grad_norm": 0.2035883904394352, + "learning_rate": 0.00027219124794749264, + "loss": 3.006263017654419, + "step": 8090, + "token_acc": 0.2976270484073624 + }, + { + "epoch": 4.742890647903841, + "grad_norm": 0.22064460794510354, + "learning_rate": 0.00027218281509254526, + "loss": 2.989020824432373, + "step": 8091, + "token_acc": 0.3001167811553513 + }, + { + "epoch": 4.743476986221049, + "grad_norm": 0.19365920966745645, + "learning_rate": 0.000272174381089855, + "loss": 2.98691987991333, + "step": 8092, + "token_acc": 0.3019087123782466 + }, + { + "epoch": 4.744063324538258, + "grad_norm": 0.240553898355905, + "learning_rate": 0.000272165945939501, + "loss": 3.01920747756958, + "step": 8093, + "token_acc": 0.2944937928082192 + }, + { + "epoch": 4.744649662855467, + "grad_norm": 0.24385977509853352, + "learning_rate": 0.0002721575096415626, + "loss": 2.976435661315918, + "step": 8094, + "token_acc": 0.302342078810637 + }, + { + "epoch": 4.745236001172676, + "grad_norm": 0.2155606724809469, + "learning_rate": 0.000272149072196119, + "loss": 2.934091567993164, + "step": 8095, + "token_acc": 0.30903334762744256 + }, + { + "epoch": 4.7458223394898855, + "grad_norm": 0.22586209540268945, + "learning_rate": 0.00027214063360324944, + "loss": 3.0193707942962646, + "step": 8096, + "token_acc": 0.2955047144927759 + }, + { + "epoch": 4.746408677807095, + "grad_norm": 0.21828795383280325, + "learning_rate": 0.00027213219386303323, + "loss": 3.018533706665039, + "step": 8097, + "token_acc": 0.29638760157110494 + }, + { + "epoch": 4.746995016124304, + "grad_norm": 0.23068282136102128, + "learning_rate": 0.0002721237529755496, + "loss": 2.9844436645507812, + "step": 8098, + "token_acc": 0.3004166877115006 + }, + { + "epoch": 4.747581354441513, + "grad_norm": 0.23005887375686124, + "learning_rate": 0.000272115310940878, + "loss": 3.012810707092285, + "step": 8099, + "token_acc": 0.29662058652358075 + }, + { + "epoch": 4.748167692758722, + "grad_norm": 0.21160865460004316, + "learning_rate": 0.00027210686775909753, + "loss": 2.9880380630493164, + "step": 8100, + "token_acc": 0.29978516744617134 + }, + { + "epoch": 4.748754031075931, + "grad_norm": 0.20306037280936265, + "learning_rate": 0.00027209842343028755, + "loss": 2.9798736572265625, + "step": 8101, + "token_acc": 0.30227312344384577 + }, + { + "epoch": 4.74934036939314, + "grad_norm": 0.22758907214598345, + "learning_rate": 0.0002720899779545274, + "loss": 3.0060172080993652, + "step": 8102, + "token_acc": 0.29648418197302157 + }, + { + "epoch": 4.749926707710349, + "grad_norm": 0.21858115525045563, + "learning_rate": 0.0002720815313318965, + "loss": 3.0004186630249023, + "step": 8103, + "token_acc": 0.2967534314864163 + }, + { + "epoch": 4.7505130460275575, + "grad_norm": 0.21179645845140155, + "learning_rate": 0.000272073083562474, + "loss": 2.9788644313812256, + "step": 8104, + "token_acc": 0.30281986922422455 + }, + { + "epoch": 4.751099384344767, + "grad_norm": 0.20735659827856265, + "learning_rate": 0.00027206463464633947, + "loss": 3.000202178955078, + "step": 8105, + "token_acc": 0.29807781644620357 + }, + { + "epoch": 4.751685722661976, + "grad_norm": 0.2224979698511867, + "learning_rate": 0.0002720561845835722, + "loss": 2.9806630611419678, + "step": 8106, + "token_acc": 0.3015019594722523 + }, + { + "epoch": 4.752272060979185, + "grad_norm": 0.2259317784876267, + "learning_rate": 0.0002720477333742515, + "loss": 2.9911599159240723, + "step": 8107, + "token_acc": 0.2999364236092343 + }, + { + "epoch": 4.752858399296394, + "grad_norm": 0.22649602706087044, + "learning_rate": 0.00027203928101845684, + "loss": 2.9713850021362305, + "step": 8108, + "token_acc": 0.30414369956794174 + }, + { + "epoch": 4.753444737613603, + "grad_norm": 0.23459639788005016, + "learning_rate": 0.0002720308275162675, + "loss": 2.991389274597168, + "step": 8109, + "token_acc": 0.30044359220866074 + }, + { + "epoch": 4.754031075930812, + "grad_norm": 0.23277513125107085, + "learning_rate": 0.00027202237286776305, + "loss": 2.9978554248809814, + "step": 8110, + "token_acc": 0.2991371579411362 + }, + { + "epoch": 4.754617414248021, + "grad_norm": 0.24178606349768897, + "learning_rate": 0.0002720139170730228, + "loss": 2.996026039123535, + "step": 8111, + "token_acc": 0.2987498283456748 + }, + { + "epoch": 4.75520375256523, + "grad_norm": 0.20406220790662327, + "learning_rate": 0.00027200546013212627, + "loss": 2.970203399658203, + "step": 8112, + "token_acc": 0.3033247812643905 + }, + { + "epoch": 4.7557900908824395, + "grad_norm": 0.25275325940899135, + "learning_rate": 0.0002719970020451528, + "loss": 3.013970136642456, + "step": 8113, + "token_acc": 0.29688934980239196 + }, + { + "epoch": 4.756376429199648, + "grad_norm": 0.2387717051905531, + "learning_rate": 0.0002719885428121819, + "loss": 3.034808874130249, + "step": 8114, + "token_acc": 0.2942142240322391 + }, + { + "epoch": 4.756962767516857, + "grad_norm": 0.22018240752881876, + "learning_rate": 0.000271980082433293, + "loss": 2.968825578689575, + "step": 8115, + "token_acc": 0.3022998481801281 + }, + { + "epoch": 4.757549105834066, + "grad_norm": 0.23807227169986403, + "learning_rate": 0.00027197162090856564, + "loss": 2.9648778438568115, + "step": 8116, + "token_acc": 0.3035104615796421 + }, + { + "epoch": 4.758135444151275, + "grad_norm": 0.21597600089676877, + "learning_rate": 0.0002719631582380792, + "loss": 2.9796981811523438, + "step": 8117, + "token_acc": 0.30361531941943004 + }, + { + "epoch": 4.758721782468484, + "grad_norm": 0.2200850466479177, + "learning_rate": 0.0002719546944219133, + "loss": 3.011141777038574, + "step": 8118, + "token_acc": 0.2981951063191953 + }, + { + "epoch": 4.759308120785693, + "grad_norm": 0.24007912504893117, + "learning_rate": 0.00027194622946014735, + "loss": 2.9703171253204346, + "step": 8119, + "token_acc": 0.3029364174377338 + }, + { + "epoch": 4.759894459102902, + "grad_norm": 0.23452691261476002, + "learning_rate": 0.0002719377633528609, + "loss": 2.9958205223083496, + "step": 8120, + "token_acc": 0.2997577206865629 + }, + { + "epoch": 4.7604807974201115, + "grad_norm": 0.20838469545846613, + "learning_rate": 0.00027192929610013347, + "loss": 3.018148422241211, + "step": 8121, + "token_acc": 0.2959528132565323 + }, + { + "epoch": 4.761067135737321, + "grad_norm": 0.20753609458500305, + "learning_rate": 0.00027192082770204464, + "loss": 2.954198122024536, + "step": 8122, + "token_acc": 0.30477757692193375 + }, + { + "epoch": 4.76165347405453, + "grad_norm": 0.2217914657574539, + "learning_rate": 0.00027191235815867386, + "loss": 2.9740500450134277, + "step": 8123, + "token_acc": 0.3037371185162647 + }, + { + "epoch": 4.762239812371739, + "grad_norm": 0.22738205129208827, + "learning_rate": 0.0002719038874701008, + "loss": 3.016878604888916, + "step": 8124, + "token_acc": 0.29575501824227707 + }, + { + "epoch": 4.762826150688948, + "grad_norm": 0.22558676494656488, + "learning_rate": 0.000271895415636405, + "loss": 2.962388277053833, + "step": 8125, + "token_acc": 0.3044442451468778 + }, + { + "epoch": 4.763412489006156, + "grad_norm": 0.21608433648273898, + "learning_rate": 0.000271886942657666, + "loss": 3.0116310119628906, + "step": 8126, + "token_acc": 0.2969151044751626 + }, + { + "epoch": 4.763998827323365, + "grad_norm": 0.20797664649932088, + "learning_rate": 0.00027187846853396345, + "loss": 2.9748778343200684, + "step": 8127, + "token_acc": 0.3033529368444309 + }, + { + "epoch": 4.764585165640574, + "grad_norm": 0.21519810695211816, + "learning_rate": 0.0002718699932653769, + "loss": 2.9693310260772705, + "step": 8128, + "token_acc": 0.3034183284643719 + }, + { + "epoch": 4.7651715039577835, + "grad_norm": 0.21442111456267965, + "learning_rate": 0.00027186151685198594, + "loss": 3.004716157913208, + "step": 8129, + "token_acc": 0.2998205453670066 + }, + { + "epoch": 4.765757842274993, + "grad_norm": 0.21447691442451744, + "learning_rate": 0.0002718530392938703, + "loss": 2.9433865547180176, + "step": 8130, + "token_acc": 0.30774889057140625 + }, + { + "epoch": 4.766344180592202, + "grad_norm": 0.20804921360823306, + "learning_rate": 0.0002718445605911095, + "loss": 2.9241719245910645, + "step": 8131, + "token_acc": 0.3114894478520894 + }, + { + "epoch": 4.766930518909411, + "grad_norm": 0.22946221575388911, + "learning_rate": 0.00027183608074378326, + "loss": 2.961427688598633, + "step": 8132, + "token_acc": 0.30399505528754234 + }, + { + "epoch": 4.76751685722662, + "grad_norm": 0.23151494826173108, + "learning_rate": 0.00027182759975197127, + "loss": 2.9625134468078613, + "step": 8133, + "token_acc": 0.3039790848026207 + }, + { + "epoch": 4.768103195543829, + "grad_norm": 0.24919134497492132, + "learning_rate": 0.0002718191176157531, + "loss": 3.0103061199188232, + "step": 8134, + "token_acc": 0.29623488863939007 + }, + { + "epoch": 4.768689533861037, + "grad_norm": 0.23899527028826328, + "learning_rate": 0.00027181063433520853, + "loss": 3.006040573120117, + "step": 8135, + "token_acc": 0.2983042388603791 + }, + { + "epoch": 4.7692758721782464, + "grad_norm": 0.23425494784984585, + "learning_rate": 0.0002718021499104171, + "loss": 2.9856362342834473, + "step": 8136, + "token_acc": 0.300987846526025 + }, + { + "epoch": 4.769862210495456, + "grad_norm": 0.23557434089062848, + "learning_rate": 0.0002717936643414586, + "loss": 3.052811622619629, + "step": 8137, + "token_acc": 0.2920363605295112 + }, + { + "epoch": 4.770448548812665, + "grad_norm": 0.21395881904809017, + "learning_rate": 0.0002717851776284128, + "loss": 3.035191535949707, + "step": 8138, + "token_acc": 0.29292435943333506 + }, + { + "epoch": 4.771034887129874, + "grad_norm": 0.22400448431410216, + "learning_rate": 0.0002717766897713594, + "loss": 2.959608554840088, + "step": 8139, + "token_acc": 0.3057954159505161 + }, + { + "epoch": 4.771621225447083, + "grad_norm": 0.21674650572344048, + "learning_rate": 0.00027176820077037806, + "loss": 2.987884998321533, + "step": 8140, + "token_acc": 0.3011820688092451 + }, + { + "epoch": 4.772207563764292, + "grad_norm": 0.21439150488067277, + "learning_rate": 0.00027175971062554853, + "loss": 2.947230339050293, + "step": 8141, + "token_acc": 0.3055041204073147 + }, + { + "epoch": 4.772793902081501, + "grad_norm": 0.2323999661663021, + "learning_rate": 0.00027175121933695055, + "loss": 2.991596221923828, + "step": 8142, + "token_acc": 0.29905055104258293 + }, + { + "epoch": 4.77338024039871, + "grad_norm": 0.219499360898104, + "learning_rate": 0.000271742726904664, + "loss": 2.954399824142456, + "step": 8143, + "token_acc": 0.3053051895300646 + }, + { + "epoch": 4.773966578715919, + "grad_norm": 0.23640631422816677, + "learning_rate": 0.0002717342333287686, + "loss": 2.9846839904785156, + "step": 8144, + "token_acc": 0.3016776098689685 + }, + { + "epoch": 4.7745529170331285, + "grad_norm": 0.23889392392579642, + "learning_rate": 0.0002717257386093441, + "loss": 2.983081102371216, + "step": 8145, + "token_acc": 0.3013150059418402 + }, + { + "epoch": 4.775139255350338, + "grad_norm": 0.21853203963137557, + "learning_rate": 0.00027171724274647026, + "loss": 2.9850759506225586, + "step": 8146, + "token_acc": 0.2997954311250778 + }, + { + "epoch": 4.775725593667546, + "grad_norm": 0.2406812465116604, + "learning_rate": 0.000271708745740227, + "loss": 2.9793460369110107, + "step": 8147, + "token_acc": 0.30380340562490443 + }, + { + "epoch": 4.776311931984755, + "grad_norm": 0.21811188168637588, + "learning_rate": 0.00027170024759069403, + "loss": 2.9855458736419678, + "step": 8148, + "token_acc": 0.30129878772986035 + }, + { + "epoch": 4.776898270301964, + "grad_norm": 0.2400791627311619, + "learning_rate": 0.00027169174829795123, + "loss": 3.007376194000244, + "step": 8149, + "token_acc": 0.2967872609353308 + }, + { + "epoch": 4.777484608619173, + "grad_norm": 0.2209573021018777, + "learning_rate": 0.00027168324786207846, + "loss": 2.989950180053711, + "step": 8150, + "token_acc": 0.2999348861335055 + }, + { + "epoch": 4.778070946936382, + "grad_norm": 0.20833590963821028, + "learning_rate": 0.00027167474628315557, + "loss": 2.946460723876953, + "step": 8151, + "token_acc": 0.3068991065813561 + }, + { + "epoch": 4.778657285253591, + "grad_norm": 0.23050445346755968, + "learning_rate": 0.00027166624356126236, + "loss": 3.026897430419922, + "step": 8152, + "token_acc": 0.2965768677150739 + }, + { + "epoch": 4.7792436235708005, + "grad_norm": 0.19564495143169774, + "learning_rate": 0.00027165773969647873, + "loss": 2.9401674270629883, + "step": 8153, + "token_acc": 0.3072501293306058 + }, + { + "epoch": 4.77982996188801, + "grad_norm": 0.22235335114798596, + "learning_rate": 0.00027164923468888465, + "loss": 3.0000414848327637, + "step": 8154, + "token_acc": 0.2985176426594476 + }, + { + "epoch": 4.780416300205219, + "grad_norm": 0.20555464826724013, + "learning_rate": 0.00027164072853855985, + "loss": 3.021088123321533, + "step": 8155, + "token_acc": 0.2961964268433978 + }, + { + "epoch": 4.781002638522428, + "grad_norm": 0.2155821723764754, + "learning_rate": 0.00027163222124558436, + "loss": 2.9869861602783203, + "step": 8156, + "token_acc": 0.30052691544584453 + }, + { + "epoch": 4.781588976839636, + "grad_norm": 0.23978157219585663, + "learning_rate": 0.00027162371281003796, + "loss": 3.0162949562072754, + "step": 8157, + "token_acc": 0.2951361437019702 + }, + { + "epoch": 4.782175315156845, + "grad_norm": 0.22655952906906807, + "learning_rate": 0.0002716152032320008, + "loss": 2.9829556941986084, + "step": 8158, + "token_acc": 0.30107967218350307 + }, + { + "epoch": 4.782761653474054, + "grad_norm": 0.21192069601356697, + "learning_rate": 0.00027160669251155263, + "loss": 3.0062928199768066, + "step": 8159, + "token_acc": 0.2980509330120168 + }, + { + "epoch": 4.783347991791263, + "grad_norm": 0.21789383608871188, + "learning_rate": 0.00027159818064877346, + "loss": 3.032963752746582, + "step": 8160, + "token_acc": 0.294328594076858 + }, + { + "epoch": 4.7839343301084725, + "grad_norm": 0.2233877327879554, + "learning_rate": 0.00027158966764374317, + "loss": 2.9992804527282715, + "step": 8161, + "token_acc": 0.29945699466516434 + }, + { + "epoch": 4.784520668425682, + "grad_norm": 0.2070123832777583, + "learning_rate": 0.0002715811534965419, + "loss": 2.9553630352020264, + "step": 8162, + "token_acc": 0.30494919486782057 + }, + { + "epoch": 4.785107006742891, + "grad_norm": 0.22302962149082725, + "learning_rate": 0.00027157263820724945, + "loss": 2.994645833969116, + "step": 8163, + "token_acc": 0.300642025521313 + }, + { + "epoch": 4.7856933450601, + "grad_norm": 0.21631335841293176, + "learning_rate": 0.00027156412177594595, + "loss": 2.996206521987915, + "step": 8164, + "token_acc": 0.29681981533412966 + }, + { + "epoch": 4.786279683377309, + "grad_norm": 0.19836453723948855, + "learning_rate": 0.0002715556042027113, + "loss": 2.956622838973999, + "step": 8165, + "token_acc": 0.3028201055037166 + }, + { + "epoch": 4.786866021694518, + "grad_norm": 0.20437559518155743, + "learning_rate": 0.0002715470854876255, + "loss": 2.997732162475586, + "step": 8166, + "token_acc": 0.300637151830686 + }, + { + "epoch": 4.787452360011727, + "grad_norm": 0.2046373876931285, + "learning_rate": 0.0002715385656307687, + "loss": 2.947683334350586, + "step": 8167, + "token_acc": 0.30695501525951907 + }, + { + "epoch": 4.788038698328936, + "grad_norm": 0.19805987984712534, + "learning_rate": 0.00027153004463222085, + "loss": 2.9641036987304688, + "step": 8168, + "token_acc": 0.30408580052662 + }, + { + "epoch": 4.7886250366461445, + "grad_norm": 0.20403526245266768, + "learning_rate": 0.0002715215224920619, + "loss": 3.001873731613159, + "step": 8169, + "token_acc": 0.29683457309184996 + }, + { + "epoch": 4.789211374963354, + "grad_norm": 0.19159637606105104, + "learning_rate": 0.0002715129992103721, + "loss": 2.93953275680542, + "step": 8170, + "token_acc": 0.3075050601182258 + }, + { + "epoch": 4.789797713280563, + "grad_norm": 0.21199502186767552, + "learning_rate": 0.00027150447478723133, + "loss": 2.9722204208374023, + "step": 8171, + "token_acc": 0.30234263440237735 + }, + { + "epoch": 4.790384051597772, + "grad_norm": 0.21512587753543583, + "learning_rate": 0.00027149594922271986, + "loss": 2.9400267601013184, + "step": 8172, + "token_acc": 0.3060803865131579 + }, + { + "epoch": 4.790970389914981, + "grad_norm": 0.21275268964744332, + "learning_rate": 0.00027148742251691756, + "loss": 2.9888229370117188, + "step": 8173, + "token_acc": 0.30022318737289977 + }, + { + "epoch": 4.79155672823219, + "grad_norm": 0.23277952531653753, + "learning_rate": 0.0002714788946699047, + "loss": 2.9840564727783203, + "step": 8174, + "token_acc": 0.29950300193131246 + }, + { + "epoch": 4.792143066549399, + "grad_norm": 0.22161840605118596, + "learning_rate": 0.00027147036568176124, + "loss": 2.9725589752197266, + "step": 8175, + "token_acc": 0.30141967634680245 + }, + { + "epoch": 4.792729404866608, + "grad_norm": 0.21173966062782776, + "learning_rate": 0.0002714618355525675, + "loss": 3.003312110900879, + "step": 8176, + "token_acc": 0.2992656964203674 + }, + { + "epoch": 4.793315743183817, + "grad_norm": 0.21077010005482394, + "learning_rate": 0.00027145330428240337, + "loss": 2.968500852584839, + "step": 8177, + "token_acc": 0.30230920830508873 + }, + { + "epoch": 4.7939020815010265, + "grad_norm": 0.20360883265347024, + "learning_rate": 0.00027144477187134914, + "loss": 2.97875714302063, + "step": 8178, + "token_acc": 0.3012231471328628 + }, + { + "epoch": 4.794488419818235, + "grad_norm": 0.22089973531024246, + "learning_rate": 0.0002714362383194849, + "loss": 2.98596453666687, + "step": 8179, + "token_acc": 0.3017382961770493 + }, + { + "epoch": 4.795074758135444, + "grad_norm": 0.2200155810708185, + "learning_rate": 0.00027142770362689094, + "loss": 2.9603524208068848, + "step": 8180, + "token_acc": 0.3019557346116579 + }, + { + "epoch": 4.795661096452653, + "grad_norm": 0.20658892056225175, + "learning_rate": 0.0002714191677936472, + "loss": 3.043041944503784, + "step": 8181, + "token_acc": 0.2922430665923969 + }, + { + "epoch": 4.796247434769862, + "grad_norm": 0.2267956562028409, + "learning_rate": 0.0002714106308198341, + "loss": 3.0422325134277344, + "step": 8182, + "token_acc": 0.29370239594479836 + }, + { + "epoch": 4.796833773087071, + "grad_norm": 0.21976351975746644, + "learning_rate": 0.0002714020927055317, + "loss": 3.040954351425171, + "step": 8183, + "token_acc": 0.29152869904070355 + }, + { + "epoch": 4.79742011140428, + "grad_norm": 0.21484613930388524, + "learning_rate": 0.0002713935534508202, + "loss": 2.982804298400879, + "step": 8184, + "token_acc": 0.3005432461431086 + }, + { + "epoch": 4.798006449721489, + "grad_norm": 0.24231842612008836, + "learning_rate": 0.0002713850130557799, + "loss": 3.0092010498046875, + "step": 8185, + "token_acc": 0.2977415565504063 + }, + { + "epoch": 4.7985927880386985, + "grad_norm": 0.2318808411487887, + "learning_rate": 0.0002713764715204909, + "loss": 3.0049400329589844, + "step": 8186, + "token_acc": 0.2977827436779922 + }, + { + "epoch": 4.799179126355908, + "grad_norm": 0.21779784137775013, + "learning_rate": 0.00027136792884503355, + "loss": 2.9662132263183594, + "step": 8187, + "token_acc": 0.3056335945586278 + }, + { + "epoch": 4.799765464673117, + "grad_norm": 0.2185379792459844, + "learning_rate": 0.00027135938502948804, + "loss": 2.9820468425750732, + "step": 8188, + "token_acc": 0.2992771548147628 + }, + { + "epoch": 4.800351802990326, + "grad_norm": 0.21227476231973072, + "learning_rate": 0.00027135084007393463, + "loss": 2.9727656841278076, + "step": 8189, + "token_acc": 0.3036980547082062 + }, + { + "epoch": 4.800938141307535, + "grad_norm": 0.21270226004050508, + "learning_rate": 0.0002713422939784536, + "loss": 2.944664478302002, + "step": 8190, + "token_acc": 0.3062899773114737 + }, + { + "epoch": 4.801524479624743, + "grad_norm": 0.201844602461128, + "learning_rate": 0.00027133374674312525, + "loss": 2.932436227798462, + "step": 8191, + "token_acc": 0.3071330845267683 + }, + { + "epoch": 4.802110817941952, + "grad_norm": 0.19659157653655712, + "learning_rate": 0.00027132519836802984, + "loss": 2.970036029815674, + "step": 8192, + "token_acc": 0.3028290301705469 + }, + { + "epoch": 4.802697156259161, + "grad_norm": 0.2085893591505014, + "learning_rate": 0.00027131664885324773, + "loss": 3.0007452964782715, + "step": 8193, + "token_acc": 0.2986780928541506 + }, + { + "epoch": 4.8032834945763705, + "grad_norm": 0.20289781459526415, + "learning_rate": 0.0002713080981988591, + "loss": 3.0071005821228027, + "step": 8194, + "token_acc": 0.2975272533900558 + }, + { + "epoch": 4.80386983289358, + "grad_norm": 0.20539441605345993, + "learning_rate": 0.00027129954640494437, + "loss": 3.0201001167297363, + "step": 8195, + "token_acc": 0.2957570554114547 + }, + { + "epoch": 4.804456171210789, + "grad_norm": 0.19541946898470167, + "learning_rate": 0.00027129099347158385, + "loss": 2.9966511726379395, + "step": 8196, + "token_acc": 0.2998181798386482 + }, + { + "epoch": 4.805042509527998, + "grad_norm": 0.19712611829820337, + "learning_rate": 0.0002712824393988579, + "loss": 2.946601390838623, + "step": 8197, + "token_acc": 0.3061989567850111 + }, + { + "epoch": 4.805628847845207, + "grad_norm": 0.19910101313602874, + "learning_rate": 0.0002712738841868469, + "loss": 3.009077310562134, + "step": 8198, + "token_acc": 0.29552940166722097 + }, + { + "epoch": 4.806215186162416, + "grad_norm": 0.19609388320612112, + "learning_rate": 0.00027126532783563117, + "loss": 2.982117176055908, + "step": 8199, + "token_acc": 0.301542337223793 + }, + { + "epoch": 4.806801524479624, + "grad_norm": 0.19698726744133707, + "learning_rate": 0.000271256770345291, + "loss": 3.031919002532959, + "step": 8200, + "token_acc": 0.295518697737595 + }, + { + "epoch": 4.807387862796833, + "grad_norm": 0.20186081842622225, + "learning_rate": 0.000271248211715907, + "loss": 2.9751083850860596, + "step": 8201, + "token_acc": 0.30165295763410266 + }, + { + "epoch": 4.8079742011140425, + "grad_norm": 0.19934424100404124, + "learning_rate": 0.00027123965194755936, + "loss": 3.0011074542999268, + "step": 8202, + "token_acc": 0.2984692994254879 + }, + { + "epoch": 4.808560539431252, + "grad_norm": 0.2084473765771158, + "learning_rate": 0.0002712310910403286, + "loss": 2.9742159843444824, + "step": 8203, + "token_acc": 0.30204830167839347 + }, + { + "epoch": 4.809146877748461, + "grad_norm": 0.21328753337270806, + "learning_rate": 0.00027122252899429504, + "loss": 2.990501642227173, + "step": 8204, + "token_acc": 0.2985040876211455 + }, + { + "epoch": 4.80973321606567, + "grad_norm": 0.2149387900462949, + "learning_rate": 0.0002712139658095392, + "loss": 2.981771945953369, + "step": 8205, + "token_acc": 0.30358772293654085 + }, + { + "epoch": 4.810319554382879, + "grad_norm": 0.2072341466467608, + "learning_rate": 0.00027120540148614143, + "loss": 3.0173938274383545, + "step": 8206, + "token_acc": 0.2976388770072313 + }, + { + "epoch": 4.810905892700088, + "grad_norm": 0.21906253851345914, + "learning_rate": 0.00027119683602418236, + "loss": 2.9812843799591064, + "step": 8207, + "token_acc": 0.30107715993430434 + }, + { + "epoch": 4.811492231017297, + "grad_norm": 0.21286303996702574, + "learning_rate": 0.0002711882694237423, + "loss": 2.980482339859009, + "step": 8208, + "token_acc": 0.3029348755291428 + }, + { + "epoch": 4.812078569334506, + "grad_norm": 0.2351808865580167, + "learning_rate": 0.00027117970168490167, + "loss": 2.9739975929260254, + "step": 8209, + "token_acc": 0.30425747420049676 + }, + { + "epoch": 4.812664907651715, + "grad_norm": 0.21734369655093558, + "learning_rate": 0.0002711711328077411, + "loss": 2.99165678024292, + "step": 8210, + "token_acc": 0.2984323905722173 + }, + { + "epoch": 4.8132512459689245, + "grad_norm": 0.22367612516852556, + "learning_rate": 0.00027116256279234097, + "loss": 2.998335123062134, + "step": 8211, + "token_acc": 0.2984088619290983 + }, + { + "epoch": 4.813837584286133, + "grad_norm": 0.21390273500727167, + "learning_rate": 0.0002711539916387819, + "loss": 2.983384609222412, + "step": 8212, + "token_acc": 0.3010908025040303 + }, + { + "epoch": 4.814423922603342, + "grad_norm": 0.21426138555267643, + "learning_rate": 0.0002711454193471443, + "loss": 2.9774389266967773, + "step": 8213, + "token_acc": 0.302832860454431 + }, + { + "epoch": 4.815010260920551, + "grad_norm": 0.22273444608217519, + "learning_rate": 0.00027113684591750873, + "loss": 3.0263800621032715, + "step": 8214, + "token_acc": 0.29457586064728924 + }, + { + "epoch": 4.81559659923776, + "grad_norm": 0.19905173597791648, + "learning_rate": 0.0002711282713499557, + "loss": 2.997547149658203, + "step": 8215, + "token_acc": 0.29717456071957826 + }, + { + "epoch": 4.816182937554969, + "grad_norm": 0.21811821493841554, + "learning_rate": 0.0002711196956445658, + "loss": 2.999246597290039, + "step": 8216, + "token_acc": 0.2986837031401445 + }, + { + "epoch": 4.816769275872178, + "grad_norm": 0.20459504764226327, + "learning_rate": 0.0002711111188014196, + "loss": 2.985507011413574, + "step": 8217, + "token_acc": 0.3006857958841708 + }, + { + "epoch": 4.817355614189387, + "grad_norm": 0.21544538904329882, + "learning_rate": 0.0002711025408205976, + "loss": 3.0114169120788574, + "step": 8218, + "token_acc": 0.29720481810016636 + }, + { + "epoch": 4.8179419525065965, + "grad_norm": 0.23131985320579745, + "learning_rate": 0.0002710939617021805, + "loss": 2.994990825653076, + "step": 8219, + "token_acc": 0.29902670300021295 + }, + { + "epoch": 4.818528290823806, + "grad_norm": 0.22415060949403243, + "learning_rate": 0.00027108538144624873, + "loss": 2.9914660453796387, + "step": 8220, + "token_acc": 0.2991685598277676 + }, + { + "epoch": 4.819114629141015, + "grad_norm": 0.22547287930031026, + "learning_rate": 0.00027107680005288297, + "loss": 2.9934816360473633, + "step": 8221, + "token_acc": 0.2993074923731793 + }, + { + "epoch": 4.819700967458223, + "grad_norm": 0.2055897786663909, + "learning_rate": 0.0002710682175221638, + "loss": 2.987076759338379, + "step": 8222, + "token_acc": 0.30093764882540164 + }, + { + "epoch": 4.820287305775432, + "grad_norm": 0.22406053592048655, + "learning_rate": 0.00027105963385417193, + "loss": 2.9683837890625, + "step": 8223, + "token_acc": 0.3032326593094184 + }, + { + "epoch": 4.820873644092641, + "grad_norm": 0.2212924106764679, + "learning_rate": 0.0002710510490489879, + "loss": 2.975367546081543, + "step": 8224, + "token_acc": 0.30239438359007775 + }, + { + "epoch": 4.82145998240985, + "grad_norm": 0.22303716093370132, + "learning_rate": 0.00027104246310669236, + "loss": 2.950197219848633, + "step": 8225, + "token_acc": 0.3062015605879179 + }, + { + "epoch": 4.822046320727059, + "grad_norm": 0.2220174555459367, + "learning_rate": 0.00027103387602736605, + "loss": 3.018613815307617, + "step": 8226, + "token_acc": 0.29574920240669766 + }, + { + "epoch": 4.8226326590442685, + "grad_norm": 0.22100232005838008, + "learning_rate": 0.0002710252878110895, + "loss": 3.0370845794677734, + "step": 8227, + "token_acc": 0.2938350928661447 + }, + { + "epoch": 4.823218997361478, + "grad_norm": 0.2140300602102675, + "learning_rate": 0.0002710166984579435, + "loss": 3.0071358680725098, + "step": 8228, + "token_acc": 0.29730518862636834 + }, + { + "epoch": 4.823805335678687, + "grad_norm": 0.22369306081674917, + "learning_rate": 0.0002710081079680087, + "loss": 2.9715211391448975, + "step": 8229, + "token_acc": 0.30218854563342185 + }, + { + "epoch": 4.824391673995896, + "grad_norm": 0.2049233559466746, + "learning_rate": 0.0002709995163413658, + "loss": 2.995271682739258, + "step": 8230, + "token_acc": 0.2993573797678275 + }, + { + "epoch": 4.824978012313105, + "grad_norm": 0.21839645189854812, + "learning_rate": 0.0002709909235780954, + "loss": 2.9493818283081055, + "step": 8231, + "token_acc": 0.30578885719838433 + }, + { + "epoch": 4.825564350630314, + "grad_norm": 0.20408100897243617, + "learning_rate": 0.00027098232967827834, + "loss": 2.981884717941284, + "step": 8232, + "token_acc": 0.3004746238632077 + }, + { + "epoch": 4.826150688947523, + "grad_norm": 0.21530520458024063, + "learning_rate": 0.0002709737346419954, + "loss": 2.9762909412384033, + "step": 8233, + "token_acc": 0.3014088820018426 + }, + { + "epoch": 4.826737027264731, + "grad_norm": 0.2172021562953356, + "learning_rate": 0.00027096513846932717, + "loss": 2.969449043273926, + "step": 8234, + "token_acc": 0.3031483749902265 + }, + { + "epoch": 4.8273233655819405, + "grad_norm": 0.23812863008341817, + "learning_rate": 0.00027095654116035447, + "loss": 3.0026800632476807, + "step": 8235, + "token_acc": 0.29722296134908066 + }, + { + "epoch": 4.82790970389915, + "grad_norm": 0.22151080109328217, + "learning_rate": 0.000270947942715158, + "loss": 2.9960665702819824, + "step": 8236, + "token_acc": 0.29833930886586885 + }, + { + "epoch": 4.828496042216359, + "grad_norm": 0.23800322924273773, + "learning_rate": 0.0002709393431338187, + "loss": 2.992833137512207, + "step": 8237, + "token_acc": 0.2982764609423593 + }, + { + "epoch": 4.829082380533568, + "grad_norm": 0.22825325827546616, + "learning_rate": 0.0002709307424164172, + "loss": 2.9995877742767334, + "step": 8238, + "token_acc": 0.29828417761872444 + }, + { + "epoch": 4.829668718850777, + "grad_norm": 0.21221241550834927, + "learning_rate": 0.00027092214056303435, + "loss": 2.9751338958740234, + "step": 8239, + "token_acc": 0.3038743105288025 + }, + { + "epoch": 4.830255057167986, + "grad_norm": 0.20241885570461254, + "learning_rate": 0.0002709135375737508, + "loss": 3.0011754035949707, + "step": 8240, + "token_acc": 0.29941396124828623 + }, + { + "epoch": 4.830841395485195, + "grad_norm": 0.2263023123222594, + "learning_rate": 0.0002709049334486477, + "loss": 3.011152744293213, + "step": 8241, + "token_acc": 0.29755369017205185 + }, + { + "epoch": 4.831427733802404, + "grad_norm": 0.21280120438115002, + "learning_rate": 0.00027089632818780556, + "loss": 3.030346155166626, + "step": 8242, + "token_acc": 0.2926867810037504 + }, + { + "epoch": 4.8320140721196125, + "grad_norm": 0.22158115126532177, + "learning_rate": 0.0002708877217913053, + "loss": 2.9899709224700928, + "step": 8243, + "token_acc": 0.30078662505066467 + }, + { + "epoch": 4.832600410436822, + "grad_norm": 0.21514492619127298, + "learning_rate": 0.00027087911425922786, + "loss": 3.009211778640747, + "step": 8244, + "token_acc": 0.29765799048716673 + }, + { + "epoch": 4.833186748754031, + "grad_norm": 0.19368398781172047, + "learning_rate": 0.000270870505591654, + "loss": 3.0132951736450195, + "step": 8245, + "token_acc": 0.29665190602508956 + }, + { + "epoch": 4.83377308707124, + "grad_norm": 0.21864186466879718, + "learning_rate": 0.00027086189578866466, + "loss": 2.986602306365967, + "step": 8246, + "token_acc": 0.30208327804463786 + }, + { + "epoch": 4.834359425388449, + "grad_norm": 0.22493255414681182, + "learning_rate": 0.00027085328485034057, + "loss": 2.9913735389709473, + "step": 8247, + "token_acc": 0.30096203848153924 + }, + { + "epoch": 4.834945763705658, + "grad_norm": 0.21598239675071565, + "learning_rate": 0.0002708446727767628, + "loss": 3.005964756011963, + "step": 8248, + "token_acc": 0.29787409764646067 + }, + { + "epoch": 4.835532102022867, + "grad_norm": 0.19631652455296356, + "learning_rate": 0.00027083605956801214, + "loss": 2.9520821571350098, + "step": 8249, + "token_acc": 0.30555667944529297 + }, + { + "epoch": 4.836118440340076, + "grad_norm": 0.21117866598709542, + "learning_rate": 0.00027082744522416956, + "loss": 2.9988796710968018, + "step": 8250, + "token_acc": 0.298918479619905 + }, + { + "epoch": 4.836704778657285, + "grad_norm": 0.21200048723371773, + "learning_rate": 0.0002708188297453159, + "loss": 3.023862600326538, + "step": 8251, + "token_acc": 0.2945858210513131 + }, + { + "epoch": 4.8372911169744945, + "grad_norm": 0.2420840966171339, + "learning_rate": 0.00027081021313153213, + "loss": 2.991298198699951, + "step": 8252, + "token_acc": 0.29910673197232884 + }, + { + "epoch": 4.837877455291704, + "grad_norm": 0.20225896962337814, + "learning_rate": 0.0002708015953828993, + "loss": 2.9621973037719727, + "step": 8253, + "token_acc": 0.30516010692376505 + }, + { + "epoch": 4.838463793608913, + "grad_norm": 0.22738052193645644, + "learning_rate": 0.0002707929764994982, + "loss": 3.000058650970459, + "step": 8254, + "token_acc": 0.2997276452222076 + }, + { + "epoch": 4.839050131926121, + "grad_norm": 0.20931269824707144, + "learning_rate": 0.00027078435648140986, + "loss": 2.960587978363037, + "step": 8255, + "token_acc": 0.3060943082363837 + }, + { + "epoch": 4.83963647024333, + "grad_norm": 0.204573918118151, + "learning_rate": 0.00027077573532871524, + "loss": 3.00370454788208, + "step": 8256, + "token_acc": 0.2986082979083446 + }, + { + "epoch": 4.840222808560539, + "grad_norm": 0.2147273382469844, + "learning_rate": 0.0002707671130414953, + "loss": 2.945596694946289, + "step": 8257, + "token_acc": 0.3042680566528976 + }, + { + "epoch": 4.840809146877748, + "grad_norm": 0.21555106436562346, + "learning_rate": 0.0002707584896198312, + "loss": 2.968683958053589, + "step": 8258, + "token_acc": 0.30197661498312894 + }, + { + "epoch": 4.841395485194957, + "grad_norm": 0.23213510728065895, + "learning_rate": 0.00027074986506380366, + "loss": 2.9743924140930176, + "step": 8259, + "token_acc": 0.30279447700192463 + }, + { + "epoch": 4.8419818235121665, + "grad_norm": 0.2084991570675108, + "learning_rate": 0.0002707412393734939, + "loss": 2.9817371368408203, + "step": 8260, + "token_acc": 0.3009927604523646 + }, + { + "epoch": 4.842568161829376, + "grad_norm": 0.2106397042983924, + "learning_rate": 0.00027073261254898293, + "loss": 2.984109401702881, + "step": 8261, + "token_acc": 0.3010085761507528 + }, + { + "epoch": 4.843154500146585, + "grad_norm": 0.23451372429900544, + "learning_rate": 0.00027072398459035174, + "loss": 2.994997262954712, + "step": 8262, + "token_acc": 0.2989970872680527 + }, + { + "epoch": 4.843740838463794, + "grad_norm": 0.24065187245771597, + "learning_rate": 0.0002707153554976814, + "loss": 2.9774169921875, + "step": 8263, + "token_acc": 0.3022227262417782 + }, + { + "epoch": 4.844327176781003, + "grad_norm": 0.2072442610765007, + "learning_rate": 0.0002707067252710529, + "loss": 2.968035936355591, + "step": 8264, + "token_acc": 0.3024128770776678 + }, + { + "epoch": 4.844913515098211, + "grad_norm": 0.22094257179755328, + "learning_rate": 0.00027069809391054746, + "loss": 3.0021698474884033, + "step": 8265, + "token_acc": 0.2983368208775601 + }, + { + "epoch": 4.84549985341542, + "grad_norm": 0.21890303209505382, + "learning_rate": 0.00027068946141624604, + "loss": 2.98079776763916, + "step": 8266, + "token_acc": 0.30096179466683065 + }, + { + "epoch": 4.8460861917326294, + "grad_norm": 0.21832868162886424, + "learning_rate": 0.00027068082778822976, + "loss": 2.9945108890533447, + "step": 8267, + "token_acc": 0.2989355944434422 + }, + { + "epoch": 4.846672530049839, + "grad_norm": 0.20786520690932794, + "learning_rate": 0.0002706721930265797, + "loss": 2.977018356323242, + "step": 8268, + "token_acc": 0.3000324367546147 + }, + { + "epoch": 4.847258868367048, + "grad_norm": 0.2305965951917622, + "learning_rate": 0.000270663557131377, + "loss": 3.026834487915039, + "step": 8269, + "token_acc": 0.2946109284825244 + }, + { + "epoch": 4.847845206684257, + "grad_norm": 0.2184734153248759, + "learning_rate": 0.0002706549201027028, + "loss": 2.970600128173828, + "step": 8270, + "token_acc": 0.30350279056258683 + }, + { + "epoch": 4.848431545001466, + "grad_norm": 0.2298998745977003, + "learning_rate": 0.00027064628194063825, + "loss": 2.9935102462768555, + "step": 8271, + "token_acc": 0.29928317184101244 + }, + { + "epoch": 4.849017883318675, + "grad_norm": 0.245548225775597, + "learning_rate": 0.0002706376426452643, + "loss": 3.0653042793273926, + "step": 8272, + "token_acc": 0.29028715952497425 + }, + { + "epoch": 4.849604221635884, + "grad_norm": 0.20621047769560788, + "learning_rate": 0.0002706290022166624, + "loss": 2.979208469390869, + "step": 8273, + "token_acc": 0.30176831599285026 + }, + { + "epoch": 4.850190559953093, + "grad_norm": 0.2306553403659329, + "learning_rate": 0.00027062036065491355, + "loss": 2.962268352508545, + "step": 8274, + "token_acc": 0.30426853098267403 + }, + { + "epoch": 4.850776898270302, + "grad_norm": 0.23109540804108505, + "learning_rate": 0.00027061171796009895, + "loss": 3.0084023475646973, + "step": 8275, + "token_acc": 0.297135769647427 + }, + { + "epoch": 4.8513632365875115, + "grad_norm": 0.22251421060882473, + "learning_rate": 0.00027060307413229976, + "loss": 2.9927330017089844, + "step": 8276, + "token_acc": 0.2992310272888339 + }, + { + "epoch": 4.85194957490472, + "grad_norm": 0.2224051389323478, + "learning_rate": 0.00027059442917159716, + "loss": 2.990953207015991, + "step": 8277, + "token_acc": 0.2982096651943232 + }, + { + "epoch": 4.852535913221929, + "grad_norm": 0.20377955719138155, + "learning_rate": 0.0002705857830780725, + "loss": 2.960538864135742, + "step": 8278, + "token_acc": 0.30470049755363 + }, + { + "epoch": 4.853122251539138, + "grad_norm": 0.21275551043603683, + "learning_rate": 0.00027057713585180684, + "loss": 2.9693145751953125, + "step": 8279, + "token_acc": 0.30256442345361945 + }, + { + "epoch": 4.853708589856347, + "grad_norm": 0.18886876465750535, + "learning_rate": 0.00027056848749288146, + "loss": 2.984286308288574, + "step": 8280, + "token_acc": 0.30131887606012375 + }, + { + "epoch": 4.854294928173556, + "grad_norm": 0.2113240148454506, + "learning_rate": 0.0002705598380013776, + "loss": 3.0266451835632324, + "step": 8281, + "token_acc": 0.29370867329182937 + }, + { + "epoch": 4.854881266490765, + "grad_norm": 0.21126946429065754, + "learning_rate": 0.0002705511873773766, + "loss": 3.0113062858581543, + "step": 8282, + "token_acc": 0.2974305036191765 + }, + { + "epoch": 4.855467604807974, + "grad_norm": 0.22110017260548004, + "learning_rate": 0.0002705425356209596, + "loss": 2.980210065841675, + "step": 8283, + "token_acc": 0.30142394873048545 + }, + { + "epoch": 4.8560539431251835, + "grad_norm": 0.18245448567354983, + "learning_rate": 0.00027053388273220785, + "loss": 2.9665417671203613, + "step": 8284, + "token_acc": 0.3038380116760129 + }, + { + "epoch": 4.856640281442393, + "grad_norm": 0.22451408348846177, + "learning_rate": 0.0002705252287112028, + "loss": 2.973139762878418, + "step": 8285, + "token_acc": 0.30140251662007755 + }, + { + "epoch": 4.857226619759601, + "grad_norm": 0.21268845985964765, + "learning_rate": 0.00027051657355802556, + "loss": 3.049710750579834, + "step": 8286, + "token_acc": 0.29100559580825 + }, + { + "epoch": 4.85781295807681, + "grad_norm": 0.20462371436907503, + "learning_rate": 0.0002705079172727575, + "loss": 3.0063636302948, + "step": 8287, + "token_acc": 0.2966566752447988 + }, + { + "epoch": 4.858399296394019, + "grad_norm": 0.20753750365678078, + "learning_rate": 0.00027049925985547996, + "loss": 2.9956183433532715, + "step": 8288, + "token_acc": 0.2993776025343255 + }, + { + "epoch": 4.858985634711228, + "grad_norm": 0.20977617614297875, + "learning_rate": 0.00027049060130627427, + "loss": 3.0029563903808594, + "step": 8289, + "token_acc": 0.29884217879691777 + }, + { + "epoch": 4.859571973028437, + "grad_norm": 0.21549819867105793, + "learning_rate": 0.00027048194162522174, + "loss": 3.0235514640808105, + "step": 8290, + "token_acc": 0.29508175554935806 + }, + { + "epoch": 4.860158311345646, + "grad_norm": 0.21163361187905572, + "learning_rate": 0.00027047328081240374, + "loss": 2.985368013381958, + "step": 8291, + "token_acc": 0.2999753793043197 + }, + { + "epoch": 4.8607446496628555, + "grad_norm": 0.2268461296799456, + "learning_rate": 0.0002704646188679015, + "loss": 2.991875171661377, + "step": 8292, + "token_acc": 0.29971449039673287 + }, + { + "epoch": 4.861330987980065, + "grad_norm": 0.22995644597671372, + "learning_rate": 0.00027045595579179663, + "loss": 2.974151849746704, + "step": 8293, + "token_acc": 0.30289845050718894 + }, + { + "epoch": 4.861917326297274, + "grad_norm": 0.2259457387208414, + "learning_rate": 0.00027044729158417027, + "loss": 2.9690887928009033, + "step": 8294, + "token_acc": 0.3042366653210363 + }, + { + "epoch": 4.862503664614483, + "grad_norm": 0.2057039044228907, + "learning_rate": 0.000270438626245104, + "loss": 2.943039655685425, + "step": 8295, + "token_acc": 0.308653000187586 + }, + { + "epoch": 4.863090002931692, + "grad_norm": 0.22562904220503474, + "learning_rate": 0.00027042995977467904, + "loss": 3.0035948753356934, + "step": 8296, + "token_acc": 0.2986725305123481 + }, + { + "epoch": 4.863676341248901, + "grad_norm": 0.19778530180446824, + "learning_rate": 0.00027042129217297697, + "loss": 3.009737968444824, + "step": 8297, + "token_acc": 0.2970891677589317 + }, + { + "epoch": 4.86426267956611, + "grad_norm": 0.20897457298158292, + "learning_rate": 0.00027041262344007906, + "loss": 2.993360996246338, + "step": 8298, + "token_acc": 0.2983904349487359 + }, + { + "epoch": 4.864849017883318, + "grad_norm": 0.1959273235638305, + "learning_rate": 0.00027040395357606686, + "loss": 2.9853620529174805, + "step": 8299, + "token_acc": 0.30009978291907496 + }, + { + "epoch": 4.8654353562005275, + "grad_norm": 0.20599792891736124, + "learning_rate": 0.0002703952825810218, + "loss": 2.9691696166992188, + "step": 8300, + "token_acc": 0.3048215495614276 + }, + { + "epoch": 4.866021694517737, + "grad_norm": 0.207600793725992, + "learning_rate": 0.0002703866104550252, + "loss": 3.0058791637420654, + "step": 8301, + "token_acc": 0.2978461683107033 + }, + { + "epoch": 4.866608032834946, + "grad_norm": 0.20320972403720206, + "learning_rate": 0.00027037793719815863, + "loss": 2.973942756652832, + "step": 8302, + "token_acc": 0.301723815278008 + }, + { + "epoch": 4.867194371152155, + "grad_norm": 0.21385604701228395, + "learning_rate": 0.00027036926281050357, + "loss": 3.0055935382843018, + "step": 8303, + "token_acc": 0.29723260371238625 + }, + { + "epoch": 4.867780709469364, + "grad_norm": 0.21743104364286145, + "learning_rate": 0.00027036058729214155, + "loss": 2.9955458641052246, + "step": 8304, + "token_acc": 0.29993386421193563 + }, + { + "epoch": 4.868367047786573, + "grad_norm": 0.2089473135263253, + "learning_rate": 0.00027035191064315393, + "loss": 2.988430976867676, + "step": 8305, + "token_acc": 0.2996390645429847 + }, + { + "epoch": 4.868953386103782, + "grad_norm": 0.2051701429427241, + "learning_rate": 0.0002703432328636223, + "loss": 3.013498544692993, + "step": 8306, + "token_acc": 0.29519866095781966 + }, + { + "epoch": 4.869539724420991, + "grad_norm": 0.20540858157012495, + "learning_rate": 0.0002703345539536282, + "loss": 3.0219955444335938, + "step": 8307, + "token_acc": 0.2935993281644456 + }, + { + "epoch": 4.8701260627381995, + "grad_norm": 0.22780299336992843, + "learning_rate": 0.00027032587391325304, + "loss": 2.9904799461364746, + "step": 8308, + "token_acc": 0.299680403957194 + }, + { + "epoch": 4.870712401055409, + "grad_norm": 0.22370764201963325, + "learning_rate": 0.00027031719274257847, + "loss": 2.9609920978546143, + "step": 8309, + "token_acc": 0.3043538628866165 + }, + { + "epoch": 4.871298739372618, + "grad_norm": 0.20870684594517774, + "learning_rate": 0.000270308510441686, + "loss": 2.9610705375671387, + "step": 8310, + "token_acc": 0.304167827114587 + }, + { + "epoch": 4.871885077689827, + "grad_norm": 0.24826345593862675, + "learning_rate": 0.0002702998270106572, + "loss": 2.986482858657837, + "step": 8311, + "token_acc": 0.3018383553275969 + }, + { + "epoch": 4.872471416007036, + "grad_norm": 0.19853341617748643, + "learning_rate": 0.00027029114244957365, + "loss": 3.026829957962036, + "step": 8312, + "token_acc": 0.29290353556234244 + }, + { + "epoch": 4.873057754324245, + "grad_norm": 0.2786651819651593, + "learning_rate": 0.00027028245675851686, + "loss": 3.0408358573913574, + "step": 8313, + "token_acc": 0.29253726756588566 + }, + { + "epoch": 4.873644092641454, + "grad_norm": 0.2269772697968515, + "learning_rate": 0.00027027376993756853, + "loss": 3.0026705265045166, + "step": 8314, + "token_acc": 0.29847208006067133 + }, + { + "epoch": 4.874230430958663, + "grad_norm": 0.25695568579294104, + "learning_rate": 0.00027026508198681025, + "loss": 3.007025957107544, + "step": 8315, + "token_acc": 0.2972591339829803 + }, + { + "epoch": 4.874816769275872, + "grad_norm": 0.2631847657840096, + "learning_rate": 0.00027025639290632344, + "loss": 2.943148374557495, + "step": 8316, + "token_acc": 0.30700712271294556 + }, + { + "epoch": 4.8754031075930815, + "grad_norm": 0.23661515736191657, + "learning_rate": 0.00027024770269618996, + "loss": 2.9593827724456787, + "step": 8317, + "token_acc": 0.30481104816744164 + }, + { + "epoch": 4.875989445910291, + "grad_norm": 0.25111721648964436, + "learning_rate": 0.00027023901135649135, + "loss": 3.0096023082733154, + "step": 8318, + "token_acc": 0.2975572551428995 + }, + { + "epoch": 4.8765757842275, + "grad_norm": 0.2357558755775501, + "learning_rate": 0.00027023031888730924, + "loss": 3.0190649032592773, + "step": 8319, + "token_acc": 0.2969311621460563 + }, + { + "epoch": 4.877162122544708, + "grad_norm": 0.2152152197424242, + "learning_rate": 0.00027022162528872527, + "loss": 3.0297937393188477, + "step": 8320, + "token_acc": 0.295582867267216 + }, + { + "epoch": 4.877748460861917, + "grad_norm": 0.24459521765895484, + "learning_rate": 0.00027021293056082114, + "loss": 2.99194073677063, + "step": 8321, + "token_acc": 0.2978930171720939 + }, + { + "epoch": 4.878334799179126, + "grad_norm": 0.2058645297594416, + "learning_rate": 0.0002702042347036785, + "loss": 2.942354679107666, + "step": 8322, + "token_acc": 0.30625731448462173 + }, + { + "epoch": 4.878921137496335, + "grad_norm": 0.23631278594597172, + "learning_rate": 0.0002701955377173791, + "loss": 2.993741512298584, + "step": 8323, + "token_acc": 0.300544375769899 + }, + { + "epoch": 4.879507475813544, + "grad_norm": 0.23075767503014266, + "learning_rate": 0.00027018683960200457, + "loss": 2.969167947769165, + "step": 8324, + "token_acc": 0.302992447748009 + }, + { + "epoch": 4.8800938141307535, + "grad_norm": 0.22332381557828496, + "learning_rate": 0.00027017814035763663, + "loss": 2.994985342025757, + "step": 8325, + "token_acc": 0.30034226524395746 + }, + { + "epoch": 4.880680152447963, + "grad_norm": 0.24463309465615365, + "learning_rate": 0.000270169439984357, + "loss": 3.049400806427002, + "step": 8326, + "token_acc": 0.2920832496821254 + }, + { + "epoch": 4.881266490765172, + "grad_norm": 0.2334382853092266, + "learning_rate": 0.00027016073848224744, + "loss": 2.9844210147857666, + "step": 8327, + "token_acc": 0.30257162010240574 + }, + { + "epoch": 4.881852829082381, + "grad_norm": 0.22393271407282483, + "learning_rate": 0.0002701520358513896, + "loss": 3.002007484436035, + "step": 8328, + "token_acc": 0.2982487270068126 + }, + { + "epoch": 4.88243916739959, + "grad_norm": 0.2271573288808901, + "learning_rate": 0.0002701433320918653, + "loss": 2.9973502159118652, + "step": 8329, + "token_acc": 0.29873357399612105 + }, + { + "epoch": 4.883025505716798, + "grad_norm": 0.20608651754746962, + "learning_rate": 0.0002701346272037564, + "loss": 2.96848201751709, + "step": 8330, + "token_acc": 0.30349997602391215 + }, + { + "epoch": 4.883611844034007, + "grad_norm": 0.22779613192528259, + "learning_rate": 0.00027012592118714443, + "loss": 2.9799840450286865, + "step": 8331, + "token_acc": 0.303440595510228 + }, + { + "epoch": 4.884198182351216, + "grad_norm": 0.19277712424960758, + "learning_rate": 0.00027011721404211135, + "loss": 3.00592041015625, + "step": 8332, + "token_acc": 0.297942335093454 + }, + { + "epoch": 4.8847845206684255, + "grad_norm": 0.23224211302648523, + "learning_rate": 0.00027010850576873887, + "loss": 2.938991069793701, + "step": 8333, + "token_acc": 0.30752372648753157 + }, + { + "epoch": 4.885370858985635, + "grad_norm": 0.2234232331299914, + "learning_rate": 0.0002700997963671089, + "loss": 2.9563779830932617, + "step": 8334, + "token_acc": 0.3043632734739515 + }, + { + "epoch": 4.885957197302844, + "grad_norm": 0.22176051902812696, + "learning_rate": 0.00027009108583730317, + "loss": 3.010709285736084, + "step": 8335, + "token_acc": 0.29798133614462535 + }, + { + "epoch": 4.886543535620053, + "grad_norm": 0.22805129246129396, + "learning_rate": 0.00027008237417940344, + "loss": 2.9923486709594727, + "step": 8336, + "token_acc": 0.2989377882994904 + }, + { + "epoch": 4.887129873937262, + "grad_norm": 0.2160863244366599, + "learning_rate": 0.0002700736613934917, + "loss": 3.003962993621826, + "step": 8337, + "token_acc": 0.2977441065690178 + }, + { + "epoch": 4.887716212254471, + "grad_norm": 0.2081190836802079, + "learning_rate": 0.0002700649474796496, + "loss": 2.9922757148742676, + "step": 8338, + "token_acc": 0.2981770372430533 + }, + { + "epoch": 4.88830255057168, + "grad_norm": 0.20960946312061488, + "learning_rate": 0.00027005623243795925, + "loss": 2.9644925594329834, + "step": 8339, + "token_acc": 0.30314812238341576 + }, + { + "epoch": 4.888888888888889, + "grad_norm": 0.2219174063424716, + "learning_rate": 0.00027004751626850227, + "loss": 2.954298257827759, + "step": 8340, + "token_acc": 0.305631932364439 + }, + { + "epoch": 4.889475227206098, + "grad_norm": 0.20114860864952394, + "learning_rate": 0.00027003879897136065, + "loss": 2.9308090209960938, + "step": 8341, + "token_acc": 0.30858802518933376 + }, + { + "epoch": 4.890061565523307, + "grad_norm": 0.19597048494203545, + "learning_rate": 0.00027003008054661635, + "loss": 3.0033202171325684, + "step": 8342, + "token_acc": 0.29898269083375406 + }, + { + "epoch": 4.890647903840516, + "grad_norm": 0.20652956121923874, + "learning_rate": 0.0002700213609943511, + "loss": 3.0004847049713135, + "step": 8343, + "token_acc": 0.2995970078491282 + }, + { + "epoch": 4.891234242157725, + "grad_norm": 0.199535098714059, + "learning_rate": 0.00027001264031464693, + "loss": 2.9969570636749268, + "step": 8344, + "token_acc": 0.29919315257388746 + }, + { + "epoch": 4.891820580474934, + "grad_norm": 0.2106188376121522, + "learning_rate": 0.00027000391850758574, + "loss": 3.0193095207214355, + "step": 8345, + "token_acc": 0.2973497417166156 + }, + { + "epoch": 4.892406918792143, + "grad_norm": 0.220587921191076, + "learning_rate": 0.0002699951955732494, + "loss": 2.9307024478912354, + "step": 8346, + "token_acc": 0.3094149357005887 + }, + { + "epoch": 4.892993257109352, + "grad_norm": 0.23580207513280918, + "learning_rate": 0.0002699864715117199, + "loss": 3.0445942878723145, + "step": 8347, + "token_acc": 0.29174571595582127 + }, + { + "epoch": 4.893579595426561, + "grad_norm": 0.23390306811699116, + "learning_rate": 0.0002699777463230792, + "loss": 3.0159993171691895, + "step": 8348, + "token_acc": 0.29759288970249814 + }, + { + "epoch": 4.89416593374377, + "grad_norm": 0.201915639197607, + "learning_rate": 0.00026996902000740925, + "loss": 3.046464443206787, + "step": 8349, + "token_acc": 0.29314712808006055 + }, + { + "epoch": 4.8947522720609795, + "grad_norm": 0.2315374185054265, + "learning_rate": 0.000269960292564792, + "loss": 2.9855873584747314, + "step": 8350, + "token_acc": 0.29996698831894364 + }, + { + "epoch": 4.895338610378188, + "grad_norm": 0.21431515928075984, + "learning_rate": 0.0002699515639953095, + "loss": 2.999284029006958, + "step": 8351, + "token_acc": 0.2980395545918314 + }, + { + "epoch": 4.895924948695397, + "grad_norm": 0.20033907740782358, + "learning_rate": 0.0002699428342990436, + "loss": 3.0185885429382324, + "step": 8352, + "token_acc": 0.2965744841489138 + }, + { + "epoch": 4.896511287012606, + "grad_norm": 0.22491898272875785, + "learning_rate": 0.0002699341034760765, + "loss": 3.009448766708374, + "step": 8353, + "token_acc": 0.29726778442568874 + }, + { + "epoch": 4.897097625329815, + "grad_norm": 0.2109926414410152, + "learning_rate": 0.0002699253715264901, + "loss": 3.0009264945983887, + "step": 8354, + "token_acc": 0.2988948922616668 + }, + { + "epoch": 4.897683963647024, + "grad_norm": 0.22789136467725807, + "learning_rate": 0.00026991663845036634, + "loss": 2.967909812927246, + "step": 8355, + "token_acc": 0.30253024726583566 + }, + { + "epoch": 4.898270301964233, + "grad_norm": 0.20359132702118812, + "learning_rate": 0.00026990790424778744, + "loss": 2.99458646774292, + "step": 8356, + "token_acc": 0.2989665903499951 + }, + { + "epoch": 4.898856640281442, + "grad_norm": 0.21796669226261495, + "learning_rate": 0.0002698991689188353, + "loss": 3.0155787467956543, + "step": 8357, + "token_acc": 0.2963291957290243 + }, + { + "epoch": 4.8994429785986515, + "grad_norm": 0.2018675695516116, + "learning_rate": 0.00026989043246359207, + "loss": 2.9886474609375, + "step": 8358, + "token_acc": 0.30043004883425767 + }, + { + "epoch": 4.900029316915861, + "grad_norm": 0.212024831789168, + "learning_rate": 0.0002698816948821398, + "loss": 2.927656412124634, + "step": 8359, + "token_acc": 0.3092044794734197 + }, + { + "epoch": 4.90061565523307, + "grad_norm": 0.21955611695040633, + "learning_rate": 0.00026987295617456053, + "loss": 2.958770751953125, + "step": 8360, + "token_acc": 0.30328605436993206 + }, + { + "epoch": 4.901201993550279, + "grad_norm": 0.1929131331019733, + "learning_rate": 0.0002698642163409363, + "loss": 2.9613914489746094, + "step": 8361, + "token_acc": 0.3043022908301823 + }, + { + "epoch": 4.901788331867488, + "grad_norm": 0.25042426940239154, + "learning_rate": 0.00026985547538134935, + "loss": 2.9870407581329346, + "step": 8362, + "token_acc": 0.30051919534026833 + }, + { + "epoch": 4.902374670184696, + "grad_norm": 0.22773550843637352, + "learning_rate": 0.00026984673329588166, + "loss": 3.0202860832214355, + "step": 8363, + "token_acc": 0.29658324671517283 + }, + { + "epoch": 4.902961008501905, + "grad_norm": 0.22044379674509915, + "learning_rate": 0.0002698379900846155, + "loss": 3.0652928352355957, + "step": 8364, + "token_acc": 0.2889725126629253 + }, + { + "epoch": 4.903547346819114, + "grad_norm": 0.2029750977073379, + "learning_rate": 0.0002698292457476328, + "loss": 3.005385398864746, + "step": 8365, + "token_acc": 0.2972461269050371 + }, + { + "epoch": 4.9041336851363235, + "grad_norm": 0.2048933439272638, + "learning_rate": 0.00026982050028501593, + "loss": 2.997194766998291, + "step": 8366, + "token_acc": 0.2990568584093764 + }, + { + "epoch": 4.904720023453533, + "grad_norm": 0.21303290208827466, + "learning_rate": 0.00026981175369684686, + "loss": 2.996439218521118, + "step": 8367, + "token_acc": 0.2976712122532745 + }, + { + "epoch": 4.905306361770742, + "grad_norm": 0.2126438781667676, + "learning_rate": 0.0002698030059832078, + "loss": 2.990018844604492, + "step": 8368, + "token_acc": 0.3001522513338541 + }, + { + "epoch": 4.905892700087951, + "grad_norm": 0.21031879554456198, + "learning_rate": 0.0002697942571441809, + "loss": 2.9971084594726562, + "step": 8369, + "token_acc": 0.30014850264208764 + }, + { + "epoch": 4.90647903840516, + "grad_norm": 0.23065621452013596, + "learning_rate": 0.00026978550717984845, + "loss": 2.9923644065856934, + "step": 8370, + "token_acc": 0.30014770659140666 + }, + { + "epoch": 4.907065376722369, + "grad_norm": 0.2308186422445125, + "learning_rate": 0.00026977675609029253, + "loss": 3.0064454078674316, + "step": 8371, + "token_acc": 0.29700669544694 + }, + { + "epoch": 4.907651715039578, + "grad_norm": 0.20070861798165995, + "learning_rate": 0.0002697680038755954, + "loss": 2.9877872467041016, + "step": 8372, + "token_acc": 0.299996883796782 + }, + { + "epoch": 4.908238053356786, + "grad_norm": 0.24113697071260487, + "learning_rate": 0.00026975925053583926, + "loss": 2.956625461578369, + "step": 8373, + "token_acc": 0.30445273605658413 + }, + { + "epoch": 4.9088243916739955, + "grad_norm": 0.1946465123500959, + "learning_rate": 0.00026975049607110636, + "loss": 3.0029759407043457, + "step": 8374, + "token_acc": 0.2986755613492966 + }, + { + "epoch": 4.909410729991205, + "grad_norm": 0.23342854008300248, + "learning_rate": 0.00026974174048147886, + "loss": 3.003960132598877, + "step": 8375, + "token_acc": 0.29725691217104344 + }, + { + "epoch": 4.909997068308414, + "grad_norm": 0.206435509363085, + "learning_rate": 0.00026973298376703914, + "loss": 3.0221593379974365, + "step": 8376, + "token_acc": 0.29548151829911545 + }, + { + "epoch": 4.910583406625623, + "grad_norm": 0.19098095810438953, + "learning_rate": 0.0002697242259278693, + "loss": 2.9960806369781494, + "step": 8377, + "token_acc": 0.29971028526359356 + }, + { + "epoch": 4.911169744942832, + "grad_norm": 0.2021270139908501, + "learning_rate": 0.00026971546696405175, + "loss": 2.9911251068115234, + "step": 8378, + "token_acc": 0.2984395875218886 + }, + { + "epoch": 4.911756083260041, + "grad_norm": 0.2058673375755389, + "learning_rate": 0.0002697067068756687, + "loss": 2.9774391651153564, + "step": 8379, + "token_acc": 0.3007433794434786 + }, + { + "epoch": 4.91234242157725, + "grad_norm": 0.21314327746217107, + "learning_rate": 0.0002696979456628024, + "loss": 2.985125780105591, + "step": 8380, + "token_acc": 0.30076058158152186 + }, + { + "epoch": 4.912928759894459, + "grad_norm": 0.20556273554346913, + "learning_rate": 0.00026968918332553524, + "loss": 2.9810850620269775, + "step": 8381, + "token_acc": 0.30111268276816716 + }, + { + "epoch": 4.913515098211668, + "grad_norm": 0.21560597605152523, + "learning_rate": 0.00026968041986394944, + "loss": 2.987072229385376, + "step": 8382, + "token_acc": 0.30056176213241353 + }, + { + "epoch": 4.9141014365288775, + "grad_norm": 0.221942776286736, + "learning_rate": 0.0002696716552781274, + "loss": 3.0008859634399414, + "step": 8383, + "token_acc": 0.30010557803976773 + }, + { + "epoch": 4.914687774846087, + "grad_norm": 0.18795833429439313, + "learning_rate": 0.0002696628895681514, + "loss": 2.9854702949523926, + "step": 8384, + "token_acc": 0.3020270546823838 + }, + { + "epoch": 4.915274113163295, + "grad_norm": 0.22173167719645814, + "learning_rate": 0.00026965412273410384, + "loss": 2.942479133605957, + "step": 8385, + "token_acc": 0.3066781903921374 + }, + { + "epoch": 4.915860451480504, + "grad_norm": 0.2289871099873596, + "learning_rate": 0.00026964535477606703, + "loss": 2.985018253326416, + "step": 8386, + "token_acc": 0.29989182490337757 + }, + { + "epoch": 4.916446789797713, + "grad_norm": 0.2138179200384933, + "learning_rate": 0.0002696365856941233, + "loss": 3.042973279953003, + "step": 8387, + "token_acc": 0.2925601503152205 + }, + { + "epoch": 4.917033128114922, + "grad_norm": 0.19969635755408183, + "learning_rate": 0.0002696278154883551, + "loss": 2.9995951652526855, + "step": 8388, + "token_acc": 0.2999229479938676 + }, + { + "epoch": 4.917619466432131, + "grad_norm": 0.2134099973756909, + "learning_rate": 0.0002696190441588447, + "loss": 2.9686405658721924, + "step": 8389, + "token_acc": 0.30155211809033994 + }, + { + "epoch": 4.91820580474934, + "grad_norm": 0.21013660157159572, + "learning_rate": 0.0002696102717056746, + "loss": 2.9872312545776367, + "step": 8390, + "token_acc": 0.3018252377500107 + }, + { + "epoch": 4.9187921430665495, + "grad_norm": 0.20304796541311146, + "learning_rate": 0.0002696014981289272, + "loss": 3.01591157913208, + "step": 8391, + "token_acc": 0.2967994324830936 + }, + { + "epoch": 4.919378481383759, + "grad_norm": 0.21070608137065513, + "learning_rate": 0.0002695927234286849, + "loss": 3.03450870513916, + "step": 8392, + "token_acc": 0.29282850892168394 + }, + { + "epoch": 4.919964819700968, + "grad_norm": 0.2171034532229004, + "learning_rate": 0.00026958394760503014, + "loss": 2.9867115020751953, + "step": 8393, + "token_acc": 0.3000039200825831 + }, + { + "epoch": 4.920551158018176, + "grad_norm": 0.20959457681160368, + "learning_rate": 0.00026957517065804524, + "loss": 2.987800121307373, + "step": 8394, + "token_acc": 0.2997534131232333 + }, + { + "epoch": 4.921137496335385, + "grad_norm": 0.21093424197436736, + "learning_rate": 0.00026956639258781284, + "loss": 2.936405658721924, + "step": 8395, + "token_acc": 0.3083024187752035 + }, + { + "epoch": 4.921723834652594, + "grad_norm": 0.20153722135438673, + "learning_rate": 0.00026955761339441525, + "loss": 2.9935460090637207, + "step": 8396, + "token_acc": 0.30088912365245857 + }, + { + "epoch": 4.922310172969803, + "grad_norm": 0.21499051780596798, + "learning_rate": 0.00026954883307793497, + "loss": 2.996640205383301, + "step": 8397, + "token_acc": 0.298638714806364 + }, + { + "epoch": 4.9228965112870124, + "grad_norm": 0.22118314826799587, + "learning_rate": 0.0002695400516384545, + "loss": 2.956268072128296, + "step": 8398, + "token_acc": 0.3065479556543353 + }, + { + "epoch": 4.923482849604222, + "grad_norm": 0.23391369790275318, + "learning_rate": 0.00026953126907605635, + "loss": 3.018904447555542, + "step": 8399, + "token_acc": 0.2945343618550592 + }, + { + "epoch": 4.924069187921431, + "grad_norm": 0.2165648295214148, + "learning_rate": 0.000269522485390823, + "loss": 2.942016124725342, + "step": 8400, + "token_acc": 0.305956547771339 + }, + { + "epoch": 4.92465552623864, + "grad_norm": 0.23227379973558326, + "learning_rate": 0.00026951370058283695, + "loss": 3.0330958366394043, + "step": 8401, + "token_acc": 0.29258421552750974 + }, + { + "epoch": 4.925241864555849, + "grad_norm": 0.27364698104265506, + "learning_rate": 0.0002695049146521807, + "loss": 2.997887134552002, + "step": 8402, + "token_acc": 0.29722415950308095 + }, + { + "epoch": 4.925828202873058, + "grad_norm": 0.2018351753700248, + "learning_rate": 0.0002694961275989369, + "loss": 2.988598346710205, + "step": 8403, + "token_acc": 0.3002343557805601 + }, + { + "epoch": 4.926414541190267, + "grad_norm": 0.24482478802891694, + "learning_rate": 0.0002694873394231879, + "loss": 2.9950923919677734, + "step": 8404, + "token_acc": 0.2999200333681361 + }, + { + "epoch": 4.927000879507476, + "grad_norm": 0.22260276034979332, + "learning_rate": 0.0002694785501250164, + "loss": 2.9766478538513184, + "step": 8405, + "token_acc": 0.30282172547773145 + }, + { + "epoch": 4.927587217824685, + "grad_norm": 0.2003946066684013, + "learning_rate": 0.0002694697597045049, + "loss": 2.9792351722717285, + "step": 8406, + "token_acc": 0.301676268525774 + }, + { + "epoch": 4.928173556141894, + "grad_norm": 0.21692017672320285, + "learning_rate": 0.00026946096816173607, + "loss": 2.974802255630493, + "step": 8407, + "token_acc": 0.30251938085940566 + }, + { + "epoch": 4.928759894459103, + "grad_norm": 0.19313556769733614, + "learning_rate": 0.0002694521754967924, + "loss": 2.968822956085205, + "step": 8408, + "token_acc": 0.30276796059872624 + }, + { + "epoch": 4.929346232776312, + "grad_norm": 0.22271084516522124, + "learning_rate": 0.00026944338170975643, + "loss": 3.0156126022338867, + "step": 8409, + "token_acc": 0.29636260154188443 + }, + { + "epoch": 4.929932571093521, + "grad_norm": 0.2190101779363888, + "learning_rate": 0.00026943458680071087, + "loss": 3.0207104682922363, + "step": 8410, + "token_acc": 0.2955366811817308 + }, + { + "epoch": 4.93051890941073, + "grad_norm": 0.20434589375052417, + "learning_rate": 0.0002694257907697383, + "loss": 2.9921469688415527, + "step": 8411, + "token_acc": 0.3005560101992216 + }, + { + "epoch": 4.931105247727939, + "grad_norm": 0.2240634989308571, + "learning_rate": 0.0002694169936169214, + "loss": 2.998490810394287, + "step": 8412, + "token_acc": 0.2985619709425901 + }, + { + "epoch": 4.931691586045148, + "grad_norm": 0.2070128268531123, + "learning_rate": 0.0002694081953423428, + "loss": 2.9799141883850098, + "step": 8413, + "token_acc": 0.3021201228266907 + }, + { + "epoch": 4.932277924362357, + "grad_norm": 0.22889648050386702, + "learning_rate": 0.000269399395946085, + "loss": 2.9714977741241455, + "step": 8414, + "token_acc": 0.3032450742100636 + }, + { + "epoch": 4.9328642626795665, + "grad_norm": 0.20199597724945453, + "learning_rate": 0.0002693905954282308, + "loss": 3.0099167823791504, + "step": 8415, + "token_acc": 0.2986618532268701 + }, + { + "epoch": 4.933450600996775, + "grad_norm": 0.20759498431735665, + "learning_rate": 0.0002693817937888629, + "loss": 3.017467498779297, + "step": 8416, + "token_acc": 0.2963571813030786 + }, + { + "epoch": 4.934036939313984, + "grad_norm": 0.19768901082411183, + "learning_rate": 0.00026937299102806385, + "loss": 2.999526023864746, + "step": 8417, + "token_acc": 0.29828122778471117 + }, + { + "epoch": 4.934623277631193, + "grad_norm": 0.2307507312159836, + "learning_rate": 0.0002693641871459164, + "loss": 2.9789161682128906, + "step": 8418, + "token_acc": 0.30155039800260414 + }, + { + "epoch": 4.935209615948402, + "grad_norm": 0.2175448400383013, + "learning_rate": 0.0002693553821425033, + "loss": 2.9536073207855225, + "step": 8419, + "token_acc": 0.3042566873079124 + }, + { + "epoch": 4.935795954265611, + "grad_norm": 0.20223555643901286, + "learning_rate": 0.00026934657601790715, + "loss": 2.9949638843536377, + "step": 8420, + "token_acc": 0.3004747843958531 + }, + { + "epoch": 4.93638229258282, + "grad_norm": 0.1956559284687212, + "learning_rate": 0.00026933776877221085, + "loss": 2.9929745197296143, + "step": 8421, + "token_acc": 0.2997855971749275 + }, + { + "epoch": 4.936968630900029, + "grad_norm": 0.22867970828366463, + "learning_rate": 0.00026932896040549693, + "loss": 3.0099945068359375, + "step": 8422, + "token_acc": 0.2975509226750177 + }, + { + "epoch": 4.9375549692172385, + "grad_norm": 0.21787166826352594, + "learning_rate": 0.00026932015091784824, + "loss": 3.006866216659546, + "step": 8423, + "token_acc": 0.29695328266348053 + }, + { + "epoch": 4.938141307534448, + "grad_norm": 0.1973800338440578, + "learning_rate": 0.00026931134030934754, + "loss": 3.0022268295288086, + "step": 8424, + "token_acc": 0.2988993577228871 + }, + { + "epoch": 4.938727645851657, + "grad_norm": 0.21314846864445536, + "learning_rate": 0.00026930252858007757, + "loss": 2.996411085128784, + "step": 8425, + "token_acc": 0.297467146126185 + }, + { + "epoch": 4.939313984168866, + "grad_norm": 0.20750597850772923, + "learning_rate": 0.0002692937157301211, + "loss": 2.9750428199768066, + "step": 8426, + "token_acc": 0.3034199264209025 + }, + { + "epoch": 4.939900322486075, + "grad_norm": 0.2199610803785113, + "learning_rate": 0.00026928490175956096, + "loss": 2.990264654159546, + "step": 8427, + "token_acc": 0.3009576140640633 + }, + { + "epoch": 4.940486660803283, + "grad_norm": 0.21524123817502536, + "learning_rate": 0.00026927608666847984, + "loss": 2.986807346343994, + "step": 8428, + "token_acc": 0.29978333145571395 + }, + { + "epoch": 4.941072999120492, + "grad_norm": 0.20568120707832382, + "learning_rate": 0.00026926727045696066, + "loss": 2.991528272628784, + "step": 8429, + "token_acc": 0.29982868712038624 + }, + { + "epoch": 4.941659337437701, + "grad_norm": 0.20347567623846854, + "learning_rate": 0.00026925845312508615, + "loss": 2.975145101547241, + "step": 8430, + "token_acc": 0.30244403533315084 + }, + { + "epoch": 4.9422456757549105, + "grad_norm": 0.23440120849188176, + "learning_rate": 0.0002692496346729392, + "loss": 2.952730417251587, + "step": 8431, + "token_acc": 0.3040475518038514 + }, + { + "epoch": 4.94283201407212, + "grad_norm": 0.22134977484793977, + "learning_rate": 0.0002692408151006026, + "loss": 3.020005226135254, + "step": 8432, + "token_acc": 0.29472090373291093 + }, + { + "epoch": 4.943418352389329, + "grad_norm": 0.23924042969055048, + "learning_rate": 0.00026923199440815926, + "loss": 3.0366058349609375, + "step": 8433, + "token_acc": 0.2942967488812146 + }, + { + "epoch": 4.944004690706538, + "grad_norm": 0.2649927303413495, + "learning_rate": 0.000269223172595692, + "loss": 2.9961647987365723, + "step": 8434, + "token_acc": 0.2989492962416224 + }, + { + "epoch": 4.944591029023747, + "grad_norm": 0.23854407432808508, + "learning_rate": 0.00026921434966328367, + "loss": 3.0154669284820557, + "step": 8435, + "token_acc": 0.29681241871719544 + }, + { + "epoch": 4.945177367340956, + "grad_norm": 0.2158491479406104, + "learning_rate": 0.0002692055256110171, + "loss": 2.978804111480713, + "step": 8436, + "token_acc": 0.3018929161949497 + }, + { + "epoch": 4.945763705658165, + "grad_norm": 0.2025285500797107, + "learning_rate": 0.00026919670043897535, + "loss": 2.9478654861450195, + "step": 8437, + "token_acc": 0.3051872959143974 + }, + { + "epoch": 4.946350043975373, + "grad_norm": 0.20950336186903448, + "learning_rate": 0.0002691878741472412, + "loss": 2.9977848529815674, + "step": 8438, + "token_acc": 0.2970232689444683 + }, + { + "epoch": 4.9469363822925825, + "grad_norm": 0.2018905349798462, + "learning_rate": 0.0002691790467358976, + "loss": 3.015772819519043, + "step": 8439, + "token_acc": 0.2954351272199683 + }, + { + "epoch": 4.947522720609792, + "grad_norm": 0.20918986965550712, + "learning_rate": 0.00026917021820502734, + "loss": 2.980820417404175, + "step": 8440, + "token_acc": 0.3016046914635738 + }, + { + "epoch": 4.948109058927001, + "grad_norm": 0.221952899334816, + "learning_rate": 0.0002691613885547136, + "loss": 3.004291534423828, + "step": 8441, + "token_acc": 0.29754385020983537 + }, + { + "epoch": 4.94869539724421, + "grad_norm": 0.19124200755718296, + "learning_rate": 0.00026915255778503906, + "loss": 2.9752614498138428, + "step": 8442, + "token_acc": 0.3008053138665609 + }, + { + "epoch": 4.949281735561419, + "grad_norm": 0.22770180341123622, + "learning_rate": 0.0002691437258960869, + "loss": 3.0074715614318848, + "step": 8443, + "token_acc": 0.29726466952473635 + }, + { + "epoch": 4.949868073878628, + "grad_norm": 0.21329574813881888, + "learning_rate": 0.00026913489288793994, + "loss": 2.9973888397216797, + "step": 8444, + "token_acc": 0.30099966135082246 + }, + { + "epoch": 4.950454412195837, + "grad_norm": 0.20505384439753171, + "learning_rate": 0.00026912605876068123, + "loss": 2.97145676612854, + "step": 8445, + "token_acc": 0.30133277645602285 + }, + { + "epoch": 4.951040750513046, + "grad_norm": 0.1948742082031514, + "learning_rate": 0.0002691172235143937, + "loss": 3.0285253524780273, + "step": 8446, + "token_acc": 0.29586629309346 + }, + { + "epoch": 4.951627088830255, + "grad_norm": 0.2112920860753504, + "learning_rate": 0.0002691083871491604, + "loss": 2.980862617492676, + "step": 8447, + "token_acc": 0.30051653393727545 + }, + { + "epoch": 4.9522134271474645, + "grad_norm": 0.19654947768045805, + "learning_rate": 0.00026909954966506424, + "loss": 3.0201306343078613, + "step": 8448, + "token_acc": 0.2949476185561827 + }, + { + "epoch": 4.952799765464674, + "grad_norm": 0.19769170493678753, + "learning_rate": 0.00026909071106218834, + "loss": 3.0439023971557617, + "step": 8449, + "token_acc": 0.29305726153643286 + }, + { + "epoch": 4.953386103781882, + "grad_norm": 0.20819369297640491, + "learning_rate": 0.00026908187134061566, + "loss": 2.9871439933776855, + "step": 8450, + "token_acc": 0.3008230726339859 + }, + { + "epoch": 4.953972442099091, + "grad_norm": 0.21233919880545474, + "learning_rate": 0.00026907303050042927, + "loss": 2.997241973876953, + "step": 8451, + "token_acc": 0.29816929490665217 + }, + { + "epoch": 4.9545587804163, + "grad_norm": 0.20627163432831816, + "learning_rate": 0.0002690641885417122, + "loss": 3.0203399658203125, + "step": 8452, + "token_acc": 0.29620003631364616 + }, + { + "epoch": 4.955145118733509, + "grad_norm": 0.19123315223970358, + "learning_rate": 0.00026905534546454757, + "loss": 2.993912696838379, + "step": 8453, + "token_acc": 0.2990920320399353 + }, + { + "epoch": 4.955731457050718, + "grad_norm": 0.2081198969617762, + "learning_rate": 0.00026904650126901837, + "loss": 2.9965202808380127, + "step": 8454, + "token_acc": 0.30055903549538626 + }, + { + "epoch": 4.956317795367927, + "grad_norm": 0.1952698248411993, + "learning_rate": 0.0002690376559552077, + "loss": 2.9936485290527344, + "step": 8455, + "token_acc": 0.297996685068452 + }, + { + "epoch": 4.9569041336851365, + "grad_norm": 0.20232938908006545, + "learning_rate": 0.0002690288095231987, + "loss": 3.019227981567383, + "step": 8456, + "token_acc": 0.2936725815274348 + }, + { + "epoch": 4.957490472002346, + "grad_norm": 0.20036035873785094, + "learning_rate": 0.0002690199619730744, + "loss": 2.952414035797119, + "step": 8457, + "token_acc": 0.3059009406044965 + }, + { + "epoch": 4.958076810319555, + "grad_norm": 0.20267484208594616, + "learning_rate": 0.0002690111133049179, + "loss": 2.9739437103271484, + "step": 8458, + "token_acc": 0.3013980092265492 + }, + { + "epoch": 4.958663148636763, + "grad_norm": 0.19394045548068875, + "learning_rate": 0.00026900226351881236, + "loss": 2.982703447341919, + "step": 8459, + "token_acc": 0.3016168830492769 + }, + { + "epoch": 4.959249486953972, + "grad_norm": 0.22145016718680308, + "learning_rate": 0.000268993412614841, + "loss": 2.9804892539978027, + "step": 8460, + "token_acc": 0.30077323105664205 + }, + { + "epoch": 4.959835825271181, + "grad_norm": 0.22650879298993257, + "learning_rate": 0.00026898456059308676, + "loss": 2.9736311435699463, + "step": 8461, + "token_acc": 0.3037633533090151 + }, + { + "epoch": 4.96042216358839, + "grad_norm": 0.25046212827401537, + "learning_rate": 0.00026897570745363297, + "loss": 2.9817047119140625, + "step": 8462, + "token_acc": 0.30017916573085324 + }, + { + "epoch": 4.961008501905599, + "grad_norm": 0.18876787692334743, + "learning_rate": 0.00026896685319656276, + "loss": 2.9657788276672363, + "step": 8463, + "token_acc": 0.30202046842597907 + }, + { + "epoch": 4.9615948402228085, + "grad_norm": 0.2145776393511897, + "learning_rate": 0.00026895799782195923, + "loss": 3.033235549926758, + "step": 8464, + "token_acc": 0.29612808240388117 + }, + { + "epoch": 4.962181178540018, + "grad_norm": 0.21159072824452757, + "learning_rate": 0.0002689491413299056, + "loss": 2.962999105453491, + "step": 8465, + "token_acc": 0.303319281082986 + }, + { + "epoch": 4.962767516857227, + "grad_norm": 0.20996798544046502, + "learning_rate": 0.0002689402837204851, + "loss": 3.029125928878784, + "step": 8466, + "token_acc": 0.2930430035471911 + }, + { + "epoch": 4.963353855174436, + "grad_norm": 0.19915183699198472, + "learning_rate": 0.0002689314249937809, + "loss": 2.9940290451049805, + "step": 8467, + "token_acc": 0.2995471772153206 + }, + { + "epoch": 4.963940193491645, + "grad_norm": 0.21439359307416384, + "learning_rate": 0.0002689225651498762, + "loss": 2.9982776641845703, + "step": 8468, + "token_acc": 0.2991146779955971 + }, + { + "epoch": 4.964526531808854, + "grad_norm": 0.2227121993941797, + "learning_rate": 0.0002689137041888543, + "loss": 2.987152576446533, + "step": 8469, + "token_acc": 0.3008716801148285 + }, + { + "epoch": 4.965112870126063, + "grad_norm": 0.20264125439393568, + "learning_rate": 0.00026890484211079834, + "loss": 2.965209484100342, + "step": 8470, + "token_acc": 0.3029685174555268 + }, + { + "epoch": 4.965699208443271, + "grad_norm": 0.21333189836358857, + "learning_rate": 0.00026889597891579164, + "loss": 2.974865436553955, + "step": 8471, + "token_acc": 0.30254201050162943 + }, + { + "epoch": 4.9662855467604805, + "grad_norm": 0.217476352584251, + "learning_rate": 0.0002688871146039174, + "loss": 2.9489784240722656, + "step": 8472, + "token_acc": 0.3052412652063079 + }, + { + "epoch": 4.96687188507769, + "grad_norm": 0.21975665916074055, + "learning_rate": 0.00026887824917525894, + "loss": 3.0387122631073, + "step": 8473, + "token_acc": 0.2941518743175061 + }, + { + "epoch": 4.967458223394899, + "grad_norm": 0.21780992866489557, + "learning_rate": 0.0002688693826298996, + "loss": 2.9538016319274902, + "step": 8474, + "token_acc": 0.3058410299618433 + }, + { + "epoch": 4.968044561712108, + "grad_norm": 0.22998742603428857, + "learning_rate": 0.00026886051496792247, + "loss": 2.9767539501190186, + "step": 8475, + "token_acc": 0.30383988454909777 + }, + { + "epoch": 4.968630900029317, + "grad_norm": 0.20713760888149024, + "learning_rate": 0.000268851646189411, + "loss": 2.9757001399993896, + "step": 8476, + "token_acc": 0.303928524574363 + }, + { + "epoch": 4.969217238346526, + "grad_norm": 0.21319637710003644, + "learning_rate": 0.0002688427762944485, + "loss": 2.9882278442382812, + "step": 8477, + "token_acc": 0.3001053876208102 + }, + { + "epoch": 4.969803576663735, + "grad_norm": 0.20508565859759853, + "learning_rate": 0.0002688339052831182, + "loss": 3.0037784576416016, + "step": 8478, + "token_acc": 0.2977761084244957 + }, + { + "epoch": 4.970389914980944, + "grad_norm": 0.23151396736611776, + "learning_rate": 0.0002688250331555036, + "loss": 3.0115020275115967, + "step": 8479, + "token_acc": 0.29728166570271836 + }, + { + "epoch": 4.970976253298153, + "grad_norm": 0.20139569422768752, + "learning_rate": 0.00026881615991168785, + "loss": 2.991868734359741, + "step": 8480, + "token_acc": 0.29873567433284615 + }, + { + "epoch": 4.971562591615362, + "grad_norm": 0.19226075973248716, + "learning_rate": 0.0002688072855517544, + "loss": 2.9649338722229004, + "step": 8481, + "token_acc": 0.30468180966849107 + }, + { + "epoch": 4.972148929932571, + "grad_norm": 0.22781695536934862, + "learning_rate": 0.0002687984100757866, + "loss": 2.9693360328674316, + "step": 8482, + "token_acc": 0.30332206630475606 + }, + { + "epoch": 4.97273526824978, + "grad_norm": 0.2094257698110659, + "learning_rate": 0.00026878953348386783, + "loss": 3.0008745193481445, + "step": 8483, + "token_acc": 0.297136942162911 + }, + { + "epoch": 4.973321606566989, + "grad_norm": 0.1906570165359877, + "learning_rate": 0.0002687806557760815, + "loss": 2.983187675476074, + "step": 8484, + "token_acc": 0.3008018673766588 + }, + { + "epoch": 4.973907944884198, + "grad_norm": 0.2528615748365615, + "learning_rate": 0.00026877177695251085, + "loss": 3.0421366691589355, + "step": 8485, + "token_acc": 0.2918308459225226 + }, + { + "epoch": 4.974494283201407, + "grad_norm": 0.21558673000252443, + "learning_rate": 0.00026876289701323943, + "loss": 3.0209274291992188, + "step": 8486, + "token_acc": 0.29600109126798385 + }, + { + "epoch": 4.975080621518616, + "grad_norm": 0.2195569043358475, + "learning_rate": 0.0002687540159583507, + "loss": 2.9999489784240723, + "step": 8487, + "token_acc": 0.2997634631328288 + }, + { + "epoch": 4.975666959835825, + "grad_norm": 0.22676693059740008, + "learning_rate": 0.000268745133787928, + "loss": 2.9698002338409424, + "step": 8488, + "token_acc": 0.3044100580270793 + }, + { + "epoch": 4.9762532981530345, + "grad_norm": 0.2076299132726171, + "learning_rate": 0.00026873625050205475, + "loss": 2.958136558532715, + "step": 8489, + "token_acc": 0.30499698161910116 + }, + { + "epoch": 4.976839636470244, + "grad_norm": 0.2222294543663013, + "learning_rate": 0.00026872736610081445, + "loss": 2.975961685180664, + "step": 8490, + "token_acc": 0.30270208671644533 + }, + { + "epoch": 4.977425974787453, + "grad_norm": 0.20067974200373415, + "learning_rate": 0.0002687184805842905, + "loss": 3.0124568939208984, + "step": 8491, + "token_acc": 0.29955250450489684 + }, + { + "epoch": 4.978012313104662, + "grad_norm": 0.2211391695843876, + "learning_rate": 0.0002687095939525664, + "loss": 2.9497406482696533, + "step": 8492, + "token_acc": 0.3059322945754705 + }, + { + "epoch": 4.97859865142187, + "grad_norm": 0.242848362485957, + "learning_rate": 0.00026870070620572565, + "loss": 2.9625041484832764, + "step": 8493, + "token_acc": 0.30511928402234145 + }, + { + "epoch": 4.979184989739079, + "grad_norm": 0.20145584357890522, + "learning_rate": 0.00026869181734385177, + "loss": 2.9758644104003906, + "step": 8494, + "token_acc": 0.3011712769215066 + }, + { + "epoch": 4.979771328056288, + "grad_norm": 0.21269066245735013, + "learning_rate": 0.0002686829273670281, + "loss": 2.9367735385894775, + "step": 8495, + "token_acc": 0.3068337432866343 + }, + { + "epoch": 4.980357666373497, + "grad_norm": 0.2067552948377258, + "learning_rate": 0.0002686740362753383, + "loss": 3.0068559646606445, + "step": 8496, + "token_acc": 0.29747539283251084 + }, + { + "epoch": 4.9809440046907065, + "grad_norm": 0.20468657401543952, + "learning_rate": 0.00026866514406886585, + "loss": 2.9954683780670166, + "step": 8497, + "token_acc": 0.29878859373104355 + }, + { + "epoch": 4.981530343007916, + "grad_norm": 0.21790574319168096, + "learning_rate": 0.0002686562507476943, + "loss": 3.014962911605835, + "step": 8498, + "token_acc": 0.2963923442978795 + }, + { + "epoch": 4.982116681325125, + "grad_norm": 0.19771411631356903, + "learning_rate": 0.0002686473563119071, + "loss": 2.980262517929077, + "step": 8499, + "token_acc": 0.301705067200935 + }, + { + "epoch": 4.982703019642334, + "grad_norm": 0.23090012021942957, + "learning_rate": 0.0002686384607615879, + "loss": 3.062157392501831, + "step": 8500, + "token_acc": 0.2896665596952349 + }, + { + "epoch": 4.983289357959543, + "grad_norm": 0.22546734852786815, + "learning_rate": 0.0002686295640968203, + "loss": 2.979034900665283, + "step": 8501, + "token_acc": 0.30035945438005196 + }, + { + "epoch": 4.983875696276751, + "grad_norm": 0.2111143415019514, + "learning_rate": 0.0002686206663176877, + "loss": 2.9808335304260254, + "step": 8502, + "token_acc": 0.30095291568060534 + }, + { + "epoch": 4.98446203459396, + "grad_norm": 0.1935376485788839, + "learning_rate": 0.00026861176742427385, + "loss": 3.055887222290039, + "step": 8503, + "token_acc": 0.2902880538698146 + }, + { + "epoch": 4.985048372911169, + "grad_norm": 0.21508832953377718, + "learning_rate": 0.0002686028674166622, + "loss": 3.050093650817871, + "step": 8504, + "token_acc": 0.29282662433561574 + }, + { + "epoch": 4.9856347112283785, + "grad_norm": 0.20413416276731422, + "learning_rate": 0.0002685939662949365, + "loss": 2.9800338745117188, + "step": 8505, + "token_acc": 0.30172705111503784 + }, + { + "epoch": 4.986221049545588, + "grad_norm": 0.19560517290676374, + "learning_rate": 0.0002685850640591803, + "loss": 2.972660779953003, + "step": 8506, + "token_acc": 0.3033042965013331 + }, + { + "epoch": 4.986807387862797, + "grad_norm": 0.227106455877966, + "learning_rate": 0.00026857616070947716, + "loss": 3.0394527912139893, + "step": 8507, + "token_acc": 0.2938980236364878 + }, + { + "epoch": 4.987393726180006, + "grad_norm": 0.21508854612138037, + "learning_rate": 0.00026856725624591077, + "loss": 2.954577684402466, + "step": 8508, + "token_acc": 0.3040899513923709 + }, + { + "epoch": 4.987980064497215, + "grad_norm": 0.23193811498604655, + "learning_rate": 0.00026855835066856484, + "loss": 3.043628692626953, + "step": 8509, + "token_acc": 0.29314740759883523 + }, + { + "epoch": 4.988566402814424, + "grad_norm": 0.24117976327701496, + "learning_rate": 0.0002685494439775229, + "loss": 2.993330478668213, + "step": 8510, + "token_acc": 0.30112268705065437 + }, + { + "epoch": 4.989152741131633, + "grad_norm": 0.24443736466924584, + "learning_rate": 0.00026854053617286875, + "loss": 2.9977176189422607, + "step": 8511, + "token_acc": 0.3018649044760815 + }, + { + "epoch": 4.989739079448842, + "grad_norm": 0.21939386391594667, + "learning_rate": 0.000268531627254686, + "loss": 2.9714248180389404, + "step": 8512, + "token_acc": 0.30218344195399205 + }, + { + "epoch": 4.990325417766051, + "grad_norm": 0.24162567612846325, + "learning_rate": 0.0002685227172230583, + "loss": 2.9805681705474854, + "step": 8513, + "token_acc": 0.2997437936213446 + }, + { + "epoch": 4.99091175608326, + "grad_norm": 0.22742279283167072, + "learning_rate": 0.0002685138060780693, + "loss": 2.9601595401763916, + "step": 8514, + "token_acc": 0.3030044317502697 + }, + { + "epoch": 4.991498094400469, + "grad_norm": 0.22258674173850046, + "learning_rate": 0.0002685048938198029, + "loss": 3.0183377265930176, + "step": 8515, + "token_acc": 0.29600136063401483 + }, + { + "epoch": 4.992084432717678, + "grad_norm": 0.2279149068103971, + "learning_rate": 0.0002684959804483427, + "loss": 2.96256685256958, + "step": 8516, + "token_acc": 0.30310142541574625 + }, + { + "epoch": 4.992670771034887, + "grad_norm": 0.22246681186620215, + "learning_rate": 0.0002684870659637724, + "loss": 3.0014727115631104, + "step": 8517, + "token_acc": 0.2997433531737932 + }, + { + "epoch": 4.993257109352096, + "grad_norm": 0.22669150093585383, + "learning_rate": 0.00026847815036617584, + "loss": 3.0169224739074707, + "step": 8518, + "token_acc": 0.29893758539327864 + }, + { + "epoch": 4.993843447669305, + "grad_norm": 0.23032630239969018, + "learning_rate": 0.00026846923365563664, + "loss": 3.028780221939087, + "step": 8519, + "token_acc": 0.2946393113659854 + }, + { + "epoch": 4.994429785986514, + "grad_norm": 0.21039961834699555, + "learning_rate": 0.00026846031583223866, + "loss": 3.0046772956848145, + "step": 8520, + "token_acc": 0.29804268129585826 + }, + { + "epoch": 4.995016124303723, + "grad_norm": 0.2172861001487408, + "learning_rate": 0.00026845139689606566, + "loss": 3.0222673416137695, + "step": 8521, + "token_acc": 0.29566145732844556 + }, + { + "epoch": 4.9956024626209325, + "grad_norm": 0.2344632773659439, + "learning_rate": 0.00026844247684720135, + "loss": 2.9567339420318604, + "step": 8522, + "token_acc": 0.30555903571689585 + }, + { + "epoch": 4.996188800938142, + "grad_norm": 0.19150579539406898, + "learning_rate": 0.00026843355568572963, + "loss": 2.9834752082824707, + "step": 8523, + "token_acc": 0.2995720147588075 + }, + { + "epoch": 4.99677513925535, + "grad_norm": 0.23290790577776418, + "learning_rate": 0.0002684246334117342, + "loss": 3.0050179958343506, + "step": 8524, + "token_acc": 0.2980218322175985 + }, + { + "epoch": 4.997361477572559, + "grad_norm": 0.22853961310330662, + "learning_rate": 0.0002684157100252989, + "loss": 3.019460439682007, + "step": 8525, + "token_acc": 0.295701180530556 + }, + { + "epoch": 4.997947815889768, + "grad_norm": 0.21390865040351112, + "learning_rate": 0.0002684067855265076, + "loss": 3.009934902191162, + "step": 8526, + "token_acc": 0.2981720213515963 + }, + { + "epoch": 4.998534154206977, + "grad_norm": 0.2266954669656876, + "learning_rate": 0.00026839785991544416, + "loss": 3.0203042030334473, + "step": 8527, + "token_acc": 0.2959147888279368 + }, + { + "epoch": 4.999120492524186, + "grad_norm": 0.22410167785565435, + "learning_rate": 0.0002683889331921923, + "loss": 3.0135276317596436, + "step": 8528, + "token_acc": 0.29794654180079994 + }, + { + "epoch": 4.999706830841395, + "grad_norm": 0.23718210248128158, + "learning_rate": 0.000268380005356836, + "loss": 3.0254733562469482, + "step": 8529, + "token_acc": 0.2943432656682148 + }, + { + "epoch": 5.0, + "grad_norm": 0.2427673885137112, + "learning_rate": 0.00026837107640945905, + "loss": 3.0045809745788574, + "step": 8530, + "token_acc": 0.2979218530143955 + }, + { + "epoch": 5.0, + "eval_loss": 3.078289031982422, + "eval_runtime": 8.7208, + "eval_samples_per_second": 29.355, + "eval_steps_per_second": 3.669, + "eval_token_acc": 0.28924068596139113, + "step": 8530 + } + ], + "logging_steps": 1, + "max_steps": 34120, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": -34120, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3859518101340160.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}