diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,41005 @@ +{ + "best_global_step": 5118, + "best_metric": 3.08605433, + "best_model_checkpoint": "/inspire/hdd/project/deepanalysis/guitao-25013/Muse/workspace/Finals/ckpt/Muse_8b_main_1.4e-4/v0-20251230-182110/checkpoint-5118", + "epoch": 3.0, + "eval_steps": 500, + "global_step": 5118, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005863383172090296, + "grad_norm": 353.3991972472677, + "learning_rate": 8.2063305978898e-08, + "loss": 24.279613494873047, + "step": 1, + "token_acc": 0.006257334790174416 + }, + { + "epoch": 0.0011726766344180592, + "grad_norm": 350.0536601985609, + "learning_rate": 1.64126611957796e-07, + "loss": 24.32593536376953, + "step": 2, + "token_acc": 0.006629192320853969 + }, + { + "epoch": 0.001759014951627089, + "grad_norm": 350.72277612537886, + "learning_rate": 2.46189917936694e-07, + "loss": 24.30904769897461, + "step": 3, + "token_acc": 0.006821216035803277 + }, + { + "epoch": 0.0023453532688361184, + "grad_norm": 349.95484398867524, + "learning_rate": 3.28253223915592e-07, + "loss": 24.276325225830078, + "step": 4, + "token_acc": 0.006935026587752649 + }, + { + "epoch": 0.002931691586045148, + "grad_norm": 350.92364205592605, + "learning_rate": 4.1031652989449e-07, + "loss": 24.29950523376465, + "step": 5, + "token_acc": 0.006618784925222573 + }, + { + "epoch": 0.003518029903254178, + "grad_norm": 353.063279858596, + "learning_rate": 4.92379835873388e-07, + "loss": 24.292869567871094, + "step": 6, + "token_acc": 0.006742489814595313 + }, + { + "epoch": 0.004104368220463207, + "grad_norm": 351.1884071141757, + "learning_rate": 5.744431418522861e-07, + "loss": 24.115386962890625, + "step": 7, + "token_acc": 0.006782887460025678 + }, + { + "epoch": 0.004690706537672237, + "grad_norm": 354.1448133387045, + "learning_rate": 6.56506447831184e-07, + "loss": 24.09915542602539, + "step": 8, + "token_acc": 0.006792604077671951 + }, + { + "epoch": 0.005277044854881266, + "grad_norm": 350.42175428441675, + "learning_rate": 7.38569753810082e-07, + "loss": 23.74787139892578, + "step": 9, + "token_acc": 0.0068703800772349465 + }, + { + "epoch": 0.005863383172090296, + "grad_norm": 352.6013674230611, + "learning_rate": 8.2063305978898e-07, + "loss": 23.604782104492188, + "step": 10, + "token_acc": 0.00683007561417538 + }, + { + "epoch": 0.006449721489299325, + "grad_norm": 351.32521265674916, + "learning_rate": 9.02696365767878e-07, + "loss": 23.572376251220703, + "step": 11, + "token_acc": 0.006992549309122515 + }, + { + "epoch": 0.007036059806508356, + "grad_norm": 347.02622475928257, + "learning_rate": 9.84759671746776e-07, + "loss": 22.248552322387695, + "step": 12, + "token_acc": 0.007175440695333753 + }, + { + "epoch": 0.007622398123717385, + "grad_norm": 348.8617545488523, + "learning_rate": 1.066822977725674e-06, + "loss": 22.154541015625, + "step": 13, + "token_acc": 0.006777886747962537 + }, + { + "epoch": 0.008208736440926415, + "grad_norm": 343.6973379841624, + "learning_rate": 1.1488862837045722e-06, + "loss": 21.72881317138672, + "step": 14, + "token_acc": 0.006503657347827978 + }, + { + "epoch": 0.008795074758135445, + "grad_norm": 341.3048722022376, + "learning_rate": 1.23094958968347e-06, + "loss": 21.626178741455078, + "step": 15, + "token_acc": 0.00670334386827422 + }, + { + "epoch": 0.009381413075344474, + "grad_norm": 180.2068215087911, + "learning_rate": 1.313012895662368e-06, + "loss": 18.097524642944336, + "step": 16, + "token_acc": 0.006140843631906853 + }, + { + "epoch": 0.009967751392553504, + "grad_norm": 173.98568033854303, + "learning_rate": 1.3950762016412662e-06, + "loss": 17.94785499572754, + "step": 17, + "token_acc": 0.006328160704640824 + }, + { + "epoch": 0.010554089709762533, + "grad_norm": 174.28139288515845, + "learning_rate": 1.477139507620164e-06, + "loss": 17.822351455688477, + "step": 18, + "token_acc": 0.006275670161458306 + }, + { + "epoch": 0.011140428026971563, + "grad_norm": 166.6479427524692, + "learning_rate": 1.5592028135990621e-06, + "loss": 17.317907333374023, + "step": 19, + "token_acc": 0.0059726743079873915 + }, + { + "epoch": 0.011726766344180592, + "grad_norm": 166.1027966467043, + "learning_rate": 1.64126611957796e-06, + "loss": 17.139856338500977, + "step": 20, + "token_acc": 0.006182355736233568 + }, + { + "epoch": 0.012313104661389622, + "grad_norm": 90.74082551872074, + "learning_rate": 1.7233294255568579e-06, + "loss": 14.137916564941406, + "step": 21, + "token_acc": 0.004946787386758282 + }, + { + "epoch": 0.01289944297859865, + "grad_norm": 62.72798753896125, + "learning_rate": 1.805392731535756e-06, + "loss": 13.710620880126953, + "step": 22, + "token_acc": 0.004691528834664486 + }, + { + "epoch": 0.013485781295807681, + "grad_norm": 55.78773460505893, + "learning_rate": 1.887456037514654e-06, + "loss": 13.558979034423828, + "step": 23, + "token_acc": 0.004765177425676655 + }, + { + "epoch": 0.014072119613016711, + "grad_norm": 50.273687477426186, + "learning_rate": 1.969519343493552e-06, + "loss": 13.42824935913086, + "step": 24, + "token_acc": 0.0046091386721859575 + }, + { + "epoch": 0.01465845793022574, + "grad_norm": 43.14364478235981, + "learning_rate": 2.05158264947245e-06, + "loss": 13.265416145324707, + "step": 25, + "token_acc": 0.004781320972733189 + }, + { + "epoch": 0.01524479624743477, + "grad_norm": 32.03762628531264, + "learning_rate": 2.133645955451348e-06, + "loss": 13.020715713500977, + "step": 26, + "token_acc": 0.004930619867678786 + }, + { + "epoch": 0.0158311345646438, + "grad_norm": 26.96031330799516, + "learning_rate": 2.215709261430246e-06, + "loss": 12.895532608032227, + "step": 27, + "token_acc": 0.005856069238352517 + }, + { + "epoch": 0.01641747288185283, + "grad_norm": 24.26451087839124, + "learning_rate": 2.2977725674091444e-06, + "loss": 12.806621551513672, + "step": 28, + "token_acc": 0.005940171808046141 + }, + { + "epoch": 0.017003811199061858, + "grad_norm": 9.740196653338913, + "learning_rate": 2.379835873388042e-06, + "loss": 12.42845344543457, + "step": 29, + "token_acc": 0.006846418397555078 + }, + { + "epoch": 0.01759014951627089, + "grad_norm": 7.486303824112801, + "learning_rate": 2.46189917936694e-06, + "loss": 12.334634780883789, + "step": 30, + "token_acc": 0.006826872012745619 + }, + { + "epoch": 0.01817648783347992, + "grad_norm": 6.001783632824859, + "learning_rate": 2.543962485345838e-06, + "loss": 12.280149459838867, + "step": 31, + "token_acc": 0.007606588508354323 + }, + { + "epoch": 0.018762826150688947, + "grad_norm": 4.880549492241735, + "learning_rate": 2.626025791324736e-06, + "loss": 12.226173400878906, + "step": 32, + "token_acc": 0.008839033926522591 + }, + { + "epoch": 0.019349164467897976, + "grad_norm": 4.17141709429935, + "learning_rate": 2.7080890973036343e-06, + "loss": 12.18889045715332, + "step": 33, + "token_acc": 0.008684544605667642 + }, + { + "epoch": 0.019935502785107008, + "grad_norm": 3.6115598562353433, + "learning_rate": 2.7901524032825323e-06, + "loss": 12.151744842529297, + "step": 34, + "token_acc": 0.00917035559157886 + }, + { + "epoch": 0.020521841102316037, + "grad_norm": 3.0890799514881246, + "learning_rate": 2.87221570926143e-06, + "loss": 12.10763931274414, + "step": 35, + "token_acc": 0.009937812623164599 + }, + { + "epoch": 0.021108179419525065, + "grad_norm": 2.6866379814053225, + "learning_rate": 2.954279015240328e-06, + "loss": 12.076554298400879, + "step": 36, + "token_acc": 0.0096938114127194 + }, + { + "epoch": 0.021694517736734097, + "grad_norm": 2.1172737148331486, + "learning_rate": 3.036342321219226e-06, + "loss": 12.0507230758667, + "step": 37, + "token_acc": 0.009487984343427925 + }, + { + "epoch": 0.022280856053943126, + "grad_norm": 1.7242673706242342, + "learning_rate": 3.1184056271981242e-06, + "loss": 12.029277801513672, + "step": 38, + "token_acc": 0.008936445257403641 + }, + { + "epoch": 0.022867194371152155, + "grad_norm": 1.487620116583969, + "learning_rate": 3.2004689331770222e-06, + "loss": 12.003844261169434, + "step": 39, + "token_acc": 0.008377003307046552 + }, + { + "epoch": 0.023453532688361183, + "grad_norm": 1.3178672925010604, + "learning_rate": 3.28253223915592e-06, + "loss": 11.979639053344727, + "step": 40, + "token_acc": 0.00939225773348996 + }, + { + "epoch": 0.024039871005570215, + "grad_norm": 1.1303030998807853, + "learning_rate": 3.3645955451348178e-06, + "loss": 11.960676193237305, + "step": 41, + "token_acc": 0.009738004464688135 + }, + { + "epoch": 0.024626209322779244, + "grad_norm": 0.9186250060482735, + "learning_rate": 3.4466588511137157e-06, + "loss": 11.949100494384766, + "step": 42, + "token_acc": 0.009821430926823348 + }, + { + "epoch": 0.025212547639988273, + "grad_norm": 0.9473638571784654, + "learning_rate": 3.528722157092614e-06, + "loss": 11.953974723815918, + "step": 43, + "token_acc": 0.008738816621751464 + }, + { + "epoch": 0.0257988859571973, + "grad_norm": 0.8210116536176688, + "learning_rate": 3.610785463071512e-06, + "loss": 11.93989086151123, + "step": 44, + "token_acc": 0.009321738043008175 + }, + { + "epoch": 0.026385224274406333, + "grad_norm": 0.7126951090689018, + "learning_rate": 3.69284876905041e-06, + "loss": 11.929862022399902, + "step": 45, + "token_acc": 0.009835562515872344 + }, + { + "epoch": 0.026971562591615362, + "grad_norm": 0.5711748472296502, + "learning_rate": 3.774912075029308e-06, + "loss": 11.915067672729492, + "step": 46, + "token_acc": 0.011866672596514818 + }, + { + "epoch": 0.02755790090882439, + "grad_norm": 0.5158692138559006, + "learning_rate": 3.856975381008206e-06, + "loss": 11.9104585647583, + "step": 47, + "token_acc": 0.011654396655386036 + }, + { + "epoch": 0.028144239226033423, + "grad_norm": 0.4769871603502968, + "learning_rate": 3.939038686987104e-06, + "loss": 11.898885726928711, + "step": 48, + "token_acc": 0.012095299320557298 + }, + { + "epoch": 0.02873057754324245, + "grad_norm": 0.580043562125423, + "learning_rate": 4.021101992966002e-06, + "loss": 11.89678955078125, + "step": 49, + "token_acc": 0.011564227778812823 + }, + { + "epoch": 0.02931691586045148, + "grad_norm": 0.43284830981288447, + "learning_rate": 4.1031652989449e-06, + "loss": 11.880815505981445, + "step": 50, + "token_acc": 0.012207451987845902 + }, + { + "epoch": 0.02990325417766051, + "grad_norm": 0.42267623135522364, + "learning_rate": 4.185228604923798e-06, + "loss": 11.875591278076172, + "step": 51, + "token_acc": 0.011903238527006876 + }, + { + "epoch": 0.03048959249486954, + "grad_norm": 0.4190264564203129, + "learning_rate": 4.267291910902696e-06, + "loss": 11.860198974609375, + "step": 52, + "token_acc": 0.012763264206840829 + }, + { + "epoch": 0.03107593081207857, + "grad_norm": 0.4316295201650057, + "learning_rate": 4.349355216881594e-06, + "loss": 11.856969833374023, + "step": 53, + "token_acc": 0.012433101148374258 + }, + { + "epoch": 0.0316622691292876, + "grad_norm": 0.4093283954262731, + "learning_rate": 4.431418522860492e-06, + "loss": 11.856557846069336, + "step": 54, + "token_acc": 0.01180031945909887 + }, + { + "epoch": 0.03224860744649663, + "grad_norm": 0.39209180275718025, + "learning_rate": 4.513481828839389e-06, + "loss": 11.849498748779297, + "step": 55, + "token_acc": 0.011768190806312053 + }, + { + "epoch": 0.03283494576370566, + "grad_norm": 0.4466528623134847, + "learning_rate": 4.595545134818289e-06, + "loss": 11.836506843566895, + "step": 56, + "token_acc": 0.012197683813627794 + }, + { + "epoch": 0.03342128408091469, + "grad_norm": 0.42285522610376636, + "learning_rate": 4.677608440797186e-06, + "loss": 11.831693649291992, + "step": 57, + "token_acc": 0.011911346238627711 + }, + { + "epoch": 0.034007622398123716, + "grad_norm": 0.41595729785113533, + "learning_rate": 4.759671746776084e-06, + "loss": 11.828054428100586, + "step": 58, + "token_acc": 0.011446931133495307 + }, + { + "epoch": 0.034593960715332744, + "grad_norm": 0.41794444908679035, + "learning_rate": 4.841735052754982e-06, + "loss": 11.808276176452637, + "step": 59, + "token_acc": 0.012374518254694083 + }, + { + "epoch": 0.03518029903254178, + "grad_norm": 0.44009696917083724, + "learning_rate": 4.92379835873388e-06, + "loss": 11.800575256347656, + "step": 60, + "token_acc": 0.01197958001758864 + }, + { + "epoch": 0.03576663734975081, + "grad_norm": 0.47676000285615605, + "learning_rate": 5.005861664712778e-06, + "loss": 11.784139633178711, + "step": 61, + "token_acc": 0.012095167177077652 + }, + { + "epoch": 0.03635297566695984, + "grad_norm": 0.5162531183377757, + "learning_rate": 5.087924970691676e-06, + "loss": 11.77094841003418, + "step": 62, + "token_acc": 0.0116184836595014 + }, + { + "epoch": 0.036939313984168866, + "grad_norm": 0.5354361559615101, + "learning_rate": 5.169988276670574e-06, + "loss": 11.753301620483398, + "step": 63, + "token_acc": 0.011516011989272755 + }, + { + "epoch": 0.037525652301377894, + "grad_norm": 0.5409300302992422, + "learning_rate": 5.252051582649472e-06, + "loss": 11.736246109008789, + "step": 64, + "token_acc": 0.011390995192671368 + }, + { + "epoch": 0.03811199061858692, + "grad_norm": 0.7359393050093247, + "learning_rate": 5.33411488862837e-06, + "loss": 11.713890075683594, + "step": 65, + "token_acc": 0.011716271182914269 + }, + { + "epoch": 0.03869832893579595, + "grad_norm": 0.8767697201056378, + "learning_rate": 5.416178194607269e-06, + "loss": 11.699317932128906, + "step": 66, + "token_acc": 0.011596935769004125 + }, + { + "epoch": 0.03928466725300499, + "grad_norm": 0.9267768490692999, + "learning_rate": 5.498241500586167e-06, + "loss": 11.68133544921875, + "step": 67, + "token_acc": 0.011488018933044079 + }, + { + "epoch": 0.039871005570214016, + "grad_norm": 0.7227860686025017, + "learning_rate": 5.580304806565065e-06, + "loss": 11.657129287719727, + "step": 68, + "token_acc": 0.011695423933334021 + }, + { + "epoch": 0.040457343887423045, + "grad_norm": 0.8402313651744766, + "learning_rate": 5.662368112543962e-06, + "loss": 11.631108283996582, + "step": 69, + "token_acc": 0.012419254683429972 + }, + { + "epoch": 0.04104368220463207, + "grad_norm": 0.7906118410184275, + "learning_rate": 5.74443141852286e-06, + "loss": 11.614664077758789, + "step": 70, + "token_acc": 0.011516319820151064 + }, + { + "epoch": 0.0416300205218411, + "grad_norm": 0.8322877908916292, + "learning_rate": 5.826494724501758e-06, + "loss": 11.590474128723145, + "step": 71, + "token_acc": 0.011352121595946802 + }, + { + "epoch": 0.04221635883905013, + "grad_norm": 1.059437267103675, + "learning_rate": 5.908558030480656e-06, + "loss": 11.550149917602539, + "step": 72, + "token_acc": 0.012050529335972265 + }, + { + "epoch": 0.04280269715625916, + "grad_norm": 1.084329561242705, + "learning_rate": 5.990621336459554e-06, + "loss": 11.522337913513184, + "step": 73, + "token_acc": 0.011590306481574587 + }, + { + "epoch": 0.043389035473468195, + "grad_norm": 1.3397929902734904, + "learning_rate": 6.072684642438452e-06, + "loss": 11.488041877746582, + "step": 74, + "token_acc": 0.011866717683133849 + }, + { + "epoch": 0.04397537379067722, + "grad_norm": 1.256433443371083, + "learning_rate": 6.15474794841735e-06, + "loss": 11.448644638061523, + "step": 75, + "token_acc": 0.012370915556290638 + }, + { + "epoch": 0.04456171210788625, + "grad_norm": 1.5273264321656337, + "learning_rate": 6.2368112543962485e-06, + "loss": 11.4195556640625, + "step": 76, + "token_acc": 0.011814501288185498 + }, + { + "epoch": 0.04514805042509528, + "grad_norm": 1.5489725307554358, + "learning_rate": 6.3188745603751465e-06, + "loss": 11.383012771606445, + "step": 77, + "token_acc": 0.01227650103264134 + }, + { + "epoch": 0.04573438874230431, + "grad_norm": 1.9959633742104053, + "learning_rate": 6.4009378663540444e-06, + "loss": 11.34876823425293, + "step": 78, + "token_acc": 0.011815972733584397 + }, + { + "epoch": 0.04632072705951334, + "grad_norm": 1.7373266034204045, + "learning_rate": 6.4830011723329424e-06, + "loss": 11.306633949279785, + "step": 79, + "token_acc": 0.012449112351533974 + }, + { + "epoch": 0.046907065376722366, + "grad_norm": 3.739424525440159, + "learning_rate": 6.56506447831184e-06, + "loss": 11.277594566345215, + "step": 80, + "token_acc": 0.012040986573906474 + }, + { + "epoch": 0.047493403693931395, + "grad_norm": 2.0105480424981095, + "learning_rate": 6.6471277842907375e-06, + "loss": 11.239297866821289, + "step": 81, + "token_acc": 0.01217278105266473 + }, + { + "epoch": 0.04807974201114043, + "grad_norm": 2.682525004385048, + "learning_rate": 6.7291910902696355e-06, + "loss": 11.20435905456543, + "step": 82, + "token_acc": 0.011083016156077825 + }, + { + "epoch": 0.04866608032834946, + "grad_norm": 2.922170204901006, + "learning_rate": 6.8112543962485335e-06, + "loss": 11.158313751220703, + "step": 83, + "token_acc": 0.011732856814205692 + }, + { + "epoch": 0.04925241864555849, + "grad_norm": 3.007051223128323, + "learning_rate": 6.8933177022274315e-06, + "loss": 11.121021270751953, + "step": 84, + "token_acc": 0.011253999215766588 + }, + { + "epoch": 0.049838756962767516, + "grad_norm": 4.693021109603985, + "learning_rate": 6.9753810082063295e-06, + "loss": 11.084024429321289, + "step": 85, + "token_acc": 0.01177730466363885 + }, + { + "epoch": 0.050425095279976545, + "grad_norm": 2.86869186485627, + "learning_rate": 7.057444314185228e-06, + "loss": 11.044998168945312, + "step": 86, + "token_acc": 0.010470217199688187 + }, + { + "epoch": 0.051011433597185574, + "grad_norm": 6.349549681675803, + "learning_rate": 7.139507620164126e-06, + "loss": 10.99063777923584, + "step": 87, + "token_acc": 0.011309202804301621 + }, + { + "epoch": 0.0515977719143946, + "grad_norm": 3.5156715298922077, + "learning_rate": 7.221570926143024e-06, + "loss": 10.937030792236328, + "step": 88, + "token_acc": 0.011740314842940141 + }, + { + "epoch": 0.05218411023160364, + "grad_norm": 4.024231165096492, + "learning_rate": 7.303634232121922e-06, + "loss": 10.878375053405762, + "step": 89, + "token_acc": 0.01106387934548419 + }, + { + "epoch": 0.052770448548812667, + "grad_norm": 6.073982718880735, + "learning_rate": 7.38569753810082e-06, + "loss": 10.81357192993164, + "step": 90, + "token_acc": 0.011670034233179477 + }, + { + "epoch": 0.053356786866021695, + "grad_norm": 4.404701215620994, + "learning_rate": 7.467760844079718e-06, + "loss": 10.762024879455566, + "step": 91, + "token_acc": 0.01219048825280223 + }, + { + "epoch": 0.053943125183230724, + "grad_norm": 15.215084711758854, + "learning_rate": 7.549824150058616e-06, + "loss": 10.722940444946289, + "step": 92, + "token_acc": 0.011778320281538315 + }, + { + "epoch": 0.05452946350043975, + "grad_norm": 7.19663634587313, + "learning_rate": 7.631887456037514e-06, + "loss": 10.657026290893555, + "step": 93, + "token_acc": 0.011953774413732514 + }, + { + "epoch": 0.05511580181764878, + "grad_norm": 5.453563211261255, + "learning_rate": 7.713950762016412e-06, + "loss": 10.620040893554688, + "step": 94, + "token_acc": 0.011370734985964496 + }, + { + "epoch": 0.05570214013485781, + "grad_norm": 5.232700925527675, + "learning_rate": 7.79601406799531e-06, + "loss": 10.54349136352539, + "step": 95, + "token_acc": 0.012036520255683912 + }, + { + "epoch": 0.056288478452066845, + "grad_norm": 16.321493056218788, + "learning_rate": 7.878077373974208e-06, + "loss": 10.472469329833984, + "step": 96, + "token_acc": 0.012198682289817336 + }, + { + "epoch": 0.056874816769275874, + "grad_norm": 7.0082057914626805, + "learning_rate": 7.960140679953106e-06, + "loss": 10.43171501159668, + "step": 97, + "token_acc": 0.011481548078652043 + }, + { + "epoch": 0.0574611550864849, + "grad_norm": 5.873538407073087, + "learning_rate": 8.042203985932004e-06, + "loss": 10.332284927368164, + "step": 98, + "token_acc": 0.011738283236948985 + }, + { + "epoch": 0.05804749340369393, + "grad_norm": 8.474957766386694, + "learning_rate": 8.124267291910902e-06, + "loss": 10.282196044921875, + "step": 99, + "token_acc": 0.011019985088469022 + }, + { + "epoch": 0.05863383172090296, + "grad_norm": 5.445353053809421, + "learning_rate": 8.2063305978898e-06, + "loss": 10.199283599853516, + "step": 100, + "token_acc": 0.011515075139409596 + }, + { + "epoch": 0.05922017003811199, + "grad_norm": 10.711733916953884, + "learning_rate": 8.288393903868698e-06, + "loss": 10.146799087524414, + "step": 101, + "token_acc": 0.011802311285960166 + }, + { + "epoch": 0.05980650835532102, + "grad_norm": 5.565870787655604, + "learning_rate": 8.370457209847596e-06, + "loss": 10.075040817260742, + "step": 102, + "token_acc": 0.011042258066142622 + }, + { + "epoch": 0.06039284667253005, + "grad_norm": 6.184076865108105, + "learning_rate": 8.452520515826494e-06, + "loss": 10.03441333770752, + "step": 103, + "token_acc": 0.011396305491181799 + }, + { + "epoch": 0.06097918498973908, + "grad_norm": 4.677904369315326, + "learning_rate": 8.534583821805392e-06, + "loss": 9.944074630737305, + "step": 104, + "token_acc": 0.011589492193744338 + }, + { + "epoch": 0.06156552330694811, + "grad_norm": 8.633749726575598, + "learning_rate": 8.61664712778429e-06, + "loss": 9.924915313720703, + "step": 105, + "token_acc": 0.011250097929125428 + }, + { + "epoch": 0.06215186162415714, + "grad_norm": 5.693080709704519, + "learning_rate": 8.698710433763188e-06, + "loss": 9.872126579284668, + "step": 106, + "token_acc": 0.012050421500488889 + }, + { + "epoch": 0.06273819994136617, + "grad_norm": 6.8026045550650975, + "learning_rate": 8.780773739742086e-06, + "loss": 9.798917770385742, + "step": 107, + "token_acc": 0.011372199585936653 + }, + { + "epoch": 0.0633245382585752, + "grad_norm": 4.402933915963803, + "learning_rate": 8.862837045720984e-06, + "loss": 9.762984275817871, + "step": 108, + "token_acc": 0.011015209658706988 + }, + { + "epoch": 0.06391087657578423, + "grad_norm": 4.451685236842638, + "learning_rate": 8.944900351699882e-06, + "loss": 9.714855194091797, + "step": 109, + "token_acc": 0.011815973159538777 + }, + { + "epoch": 0.06449721489299326, + "grad_norm": 4.467985578441344, + "learning_rate": 9.026963657678778e-06, + "loss": 9.674742698669434, + "step": 110, + "token_acc": 0.011135588616953858 + }, + { + "epoch": 0.06508355321020229, + "grad_norm": 5.027999857942171, + "learning_rate": 9.109026963657678e-06, + "loss": 9.61811637878418, + "step": 111, + "token_acc": 0.011828176856410338 + }, + { + "epoch": 0.06566989152741132, + "grad_norm": 3.5740083419814157, + "learning_rate": 9.191090269636578e-06, + "loss": 9.597053527832031, + "step": 112, + "token_acc": 0.011521914309518282 + }, + { + "epoch": 0.06625622984462035, + "grad_norm": 3.2066645264068163, + "learning_rate": 9.273153575615474e-06, + "loss": 9.595144271850586, + "step": 113, + "token_acc": 0.011856729851139261 + }, + { + "epoch": 0.06684256816182937, + "grad_norm": 3.380436631672164, + "learning_rate": 9.355216881594372e-06, + "loss": 9.505526542663574, + "step": 114, + "token_acc": 0.011621402619128526 + }, + { + "epoch": 0.0674289064790384, + "grad_norm": 2.586374201673117, + "learning_rate": 9.43728018757327e-06, + "loss": 9.478507995605469, + "step": 115, + "token_acc": 0.011260276930828993 + }, + { + "epoch": 0.06801524479624743, + "grad_norm": 4.66133607577173, + "learning_rate": 9.519343493552168e-06, + "loss": 9.459450721740723, + "step": 116, + "token_acc": 0.01190094039268104 + }, + { + "epoch": 0.06860158311345646, + "grad_norm": 2.658962787207946, + "learning_rate": 9.601406799531066e-06, + "loss": 9.433343887329102, + "step": 117, + "token_acc": 0.012701033123868357 + }, + { + "epoch": 0.06918792143066549, + "grad_norm": 3.2761725155113046, + "learning_rate": 9.683470105509964e-06, + "loss": 9.341836929321289, + "step": 118, + "token_acc": 0.012102275793135877 + }, + { + "epoch": 0.06977425974787452, + "grad_norm": 2.292269171824197, + "learning_rate": 9.765533411488862e-06, + "loss": 9.363323211669922, + "step": 119, + "token_acc": 0.012448431162093276 + }, + { + "epoch": 0.07036059806508356, + "grad_norm": 2.0287524472259113, + "learning_rate": 9.84759671746776e-06, + "loss": 9.30771541595459, + "step": 120, + "token_acc": 0.011561001740279884 + }, + { + "epoch": 0.07094693638229259, + "grad_norm": 2.1390630144273044, + "learning_rate": 9.92966002344666e-06, + "loss": 9.288138389587402, + "step": 121, + "token_acc": 0.01157997128167122 + }, + { + "epoch": 0.07153327469950162, + "grad_norm": 1.7250787565218983, + "learning_rate": 1.0011723329425556e-05, + "loss": 9.297527313232422, + "step": 122, + "token_acc": 0.012586028094654473 + }, + { + "epoch": 0.07211961301671065, + "grad_norm": 3.6634110434700897, + "learning_rate": 1.0093786635404455e-05, + "loss": 9.258310317993164, + "step": 123, + "token_acc": 0.011125447142537066 + }, + { + "epoch": 0.07270595133391967, + "grad_norm": 1.7460773975467718, + "learning_rate": 1.0175849941383352e-05, + "loss": 9.217559814453125, + "step": 124, + "token_acc": 0.011685694001289344 + }, + { + "epoch": 0.0732922896511287, + "grad_norm": 1.731892016464575, + "learning_rate": 1.025791324736225e-05, + "loss": 9.188346862792969, + "step": 125, + "token_acc": 0.011616803743544022 + }, + { + "epoch": 0.07387862796833773, + "grad_norm": 1.3978453266801356, + "learning_rate": 1.0339976553341148e-05, + "loss": 9.213885307312012, + "step": 126, + "token_acc": 0.012242752597743906 + }, + { + "epoch": 0.07446496628554676, + "grad_norm": 1.7358613672686785, + "learning_rate": 1.0422039859320046e-05, + "loss": 9.196192741394043, + "step": 127, + "token_acc": 0.011191089160931015 + }, + { + "epoch": 0.07505130460275579, + "grad_norm": 1.219226645483535, + "learning_rate": 1.0504103165298944e-05, + "loss": 9.227466583251953, + "step": 128, + "token_acc": 0.011883061692494375 + }, + { + "epoch": 0.07563764291996482, + "grad_norm": 1.8476996989018628, + "learning_rate": 1.0586166471277842e-05, + "loss": 9.175331115722656, + "step": 129, + "token_acc": 0.011889448439998125 + }, + { + "epoch": 0.07622398123717385, + "grad_norm": 1.3553990282390567, + "learning_rate": 1.066822977725674e-05, + "loss": 9.11203384399414, + "step": 130, + "token_acc": 0.011545309921107048 + }, + { + "epoch": 0.07681031955438287, + "grad_norm": 1.1180921806879636, + "learning_rate": 1.0750293083235638e-05, + "loss": 9.18380355834961, + "step": 131, + "token_acc": 0.011438533176333858 + }, + { + "epoch": 0.0773966578715919, + "grad_norm": 1.1790581495895291, + "learning_rate": 1.0832356389214537e-05, + "loss": 9.122804641723633, + "step": 132, + "token_acc": 0.011879717856700903 + }, + { + "epoch": 0.07798299618880093, + "grad_norm": 1.0455468945801532, + "learning_rate": 1.0914419695193434e-05, + "loss": 9.169681549072266, + "step": 133, + "token_acc": 0.011402734220381816 + }, + { + "epoch": 0.07856933450600997, + "grad_norm": 1.059244558103296, + "learning_rate": 1.0996483001172333e-05, + "loss": 9.144702911376953, + "step": 134, + "token_acc": 0.011548831635710006 + }, + { + "epoch": 0.079155672823219, + "grad_norm": 1.2616172974199422, + "learning_rate": 1.107854630715123e-05, + "loss": 9.184444427490234, + "step": 135, + "token_acc": 0.011868313055922102 + }, + { + "epoch": 0.07974201114042803, + "grad_norm": 1.1106130347793868, + "learning_rate": 1.116060961313013e-05, + "loss": 9.035835266113281, + "step": 136, + "token_acc": 0.011981428785382657 + }, + { + "epoch": 0.08032834945763706, + "grad_norm": 0.9232883027339179, + "learning_rate": 1.1242672919109025e-05, + "loss": 9.106751441955566, + "step": 137, + "token_acc": 0.011643069088370279 + }, + { + "epoch": 0.08091468777484609, + "grad_norm": 1.8956896598899826, + "learning_rate": 1.1324736225087923e-05, + "loss": 9.110843658447266, + "step": 138, + "token_acc": 0.011811310273412956 + }, + { + "epoch": 0.08150102609205512, + "grad_norm": 1.0734978900537682, + "learning_rate": 1.1406799531066821e-05, + "loss": 9.132123947143555, + "step": 139, + "token_acc": 0.011321745577362687 + }, + { + "epoch": 0.08208736440926415, + "grad_norm": 1.0608066801730718, + "learning_rate": 1.148886283704572e-05, + "loss": 9.035505294799805, + "step": 140, + "token_acc": 0.012309549356223175 + }, + { + "epoch": 0.08267370272647317, + "grad_norm": 0.9601927192660569, + "learning_rate": 1.1570926143024619e-05, + "loss": 9.115060806274414, + "step": 141, + "token_acc": 0.012348977664434786 + }, + { + "epoch": 0.0832600410436822, + "grad_norm": 0.9718876666025947, + "learning_rate": 1.1652989449003515e-05, + "loss": 9.108566284179688, + "step": 142, + "token_acc": 0.011118297991364089 + }, + { + "epoch": 0.08384637936089123, + "grad_norm": 1.0404533955981914, + "learning_rate": 1.1735052754982415e-05, + "loss": 9.093795776367188, + "step": 143, + "token_acc": 0.011180620258219087 + }, + { + "epoch": 0.08443271767810026, + "grad_norm": 1.119453938213593, + "learning_rate": 1.1817116060961311e-05, + "loss": 9.01313591003418, + "step": 144, + "token_acc": 0.012133318926554417 + }, + { + "epoch": 0.08501905599530929, + "grad_norm": 1.3259193691505942, + "learning_rate": 1.1899179366940211e-05, + "loss": 9.161356925964355, + "step": 145, + "token_acc": 0.011806872178118105 + }, + { + "epoch": 0.08560539431251832, + "grad_norm": 1.0389449968394369, + "learning_rate": 1.1981242672919107e-05, + "loss": 9.028068542480469, + "step": 146, + "token_acc": 0.013024734686180633 + }, + { + "epoch": 0.08619173262972735, + "grad_norm": 1.6791781677609807, + "learning_rate": 1.2063305978898007e-05, + "loss": 9.033411026000977, + "step": 147, + "token_acc": 0.011740527322155594 + }, + { + "epoch": 0.08677807094693639, + "grad_norm": 1.2765556676733394, + "learning_rate": 1.2145369284876903e-05, + "loss": 9.072059631347656, + "step": 148, + "token_acc": 0.013162245999535386 + }, + { + "epoch": 0.08736440926414542, + "grad_norm": 0.87087934831265, + "learning_rate": 1.2227432590855801e-05, + "loss": 9.04871940612793, + "step": 149, + "token_acc": 0.013032462026013686 + }, + { + "epoch": 0.08795074758135445, + "grad_norm": 1.6822057068986385, + "learning_rate": 1.23094958968347e-05, + "loss": 9.062845230102539, + "step": 150, + "token_acc": 0.01138171417150358 + }, + { + "epoch": 0.08853708589856348, + "grad_norm": 0.932020401542373, + "learning_rate": 1.2391559202813597e-05, + "loss": 9.097454071044922, + "step": 151, + "token_acc": 0.01342642041624755 + }, + { + "epoch": 0.0891234242157725, + "grad_norm": 0.8594862583222933, + "learning_rate": 1.2473622508792497e-05, + "loss": 9.063352584838867, + "step": 152, + "token_acc": 0.012999197992925886 + }, + { + "epoch": 0.08970976253298153, + "grad_norm": 0.7916235626101663, + "learning_rate": 1.2555685814771393e-05, + "loss": 9.045036315917969, + "step": 153, + "token_acc": 0.013683493250336102 + }, + { + "epoch": 0.09029610085019056, + "grad_norm": 1.05315092748921, + "learning_rate": 1.2637749120750293e-05, + "loss": 9.06739616394043, + "step": 154, + "token_acc": 0.01398111716815656 + }, + { + "epoch": 0.09088243916739959, + "grad_norm": 0.9189294333009608, + "learning_rate": 1.271981242672919e-05, + "loss": 9.10274887084961, + "step": 155, + "token_acc": 0.013108848026538729 + }, + { + "epoch": 0.09146877748460862, + "grad_norm": 0.7663692494192156, + "learning_rate": 1.2801875732708089e-05, + "loss": 9.035738945007324, + "step": 156, + "token_acc": 0.0133319040123828 + }, + { + "epoch": 0.09205511580181765, + "grad_norm": 0.9596511889568455, + "learning_rate": 1.2883939038686985e-05, + "loss": 9.071704864501953, + "step": 157, + "token_acc": 0.012912617787564576 + }, + { + "epoch": 0.09264145411902668, + "grad_norm": 0.7432858536845889, + "learning_rate": 1.2966002344665885e-05, + "loss": 9.02535629272461, + "step": 158, + "token_acc": 0.013088371505557427 + }, + { + "epoch": 0.0932277924362357, + "grad_norm": 0.6770469244480984, + "learning_rate": 1.3048065650644781e-05, + "loss": 9.005430221557617, + "step": 159, + "token_acc": 0.014889611500941298 + }, + { + "epoch": 0.09381413075344473, + "grad_norm": 0.6737060733315569, + "learning_rate": 1.313012895662368e-05, + "loss": 9.101192474365234, + "step": 160, + "token_acc": 0.01478247413003605 + }, + { + "epoch": 0.09440046907065376, + "grad_norm": 0.6230351849146687, + "learning_rate": 1.3212192262602579e-05, + "loss": 9.082185745239258, + "step": 161, + "token_acc": 0.015209576605777871 + }, + { + "epoch": 0.09498680738786279, + "grad_norm": 0.7702695193286074, + "learning_rate": 1.3294255568581475e-05, + "loss": 9.05761432647705, + "step": 162, + "token_acc": 0.015812216591090958 + }, + { + "epoch": 0.09557314570507183, + "grad_norm": 0.6827575379345043, + "learning_rate": 1.3376318874560375e-05, + "loss": 9.047271728515625, + "step": 163, + "token_acc": 0.014247530801344058 + }, + { + "epoch": 0.09615948402228086, + "grad_norm": 0.809068409116969, + "learning_rate": 1.3458382180539271e-05, + "loss": 9.059006690979004, + "step": 164, + "token_acc": 0.013906798965290915 + }, + { + "epoch": 0.09674582233948989, + "grad_norm": 0.7564059704111886, + "learning_rate": 1.354044548651817e-05, + "loss": 9.077688217163086, + "step": 165, + "token_acc": 0.016063849621091652 + }, + { + "epoch": 0.09733216065669892, + "grad_norm": 0.7190274744792501, + "learning_rate": 1.3622508792497067e-05, + "loss": 9.102840423583984, + "step": 166, + "token_acc": 0.014970634624164696 + }, + { + "epoch": 0.09791849897390795, + "grad_norm": 1.190147392988638, + "learning_rate": 1.3704572098475967e-05, + "loss": 9.020709991455078, + "step": 167, + "token_acc": 0.0154521933344787 + }, + { + "epoch": 0.09850483729111698, + "grad_norm": 1.0577410783171501, + "learning_rate": 1.3786635404454863e-05, + "loss": 8.99871826171875, + "step": 168, + "token_acc": 0.01499610823211323 + }, + { + "epoch": 0.099091175608326, + "grad_norm": 0.920999990588546, + "learning_rate": 1.3868698710433763e-05, + "loss": 8.981106758117676, + "step": 169, + "token_acc": 0.013833339647455499 + }, + { + "epoch": 0.09967751392553503, + "grad_norm": 1.0723165989104564, + "learning_rate": 1.3950762016412659e-05, + "loss": 9.016414642333984, + "step": 170, + "token_acc": 0.011658098624594626 + }, + { + "epoch": 0.10026385224274406, + "grad_norm": 1.3850130882105414, + "learning_rate": 1.4032825322391559e-05, + "loss": 9.004114151000977, + "step": 171, + "token_acc": 0.01422531889027997 + }, + { + "epoch": 0.10085019055995309, + "grad_norm": 2.257634817344095, + "learning_rate": 1.4114888628370457e-05, + "loss": 9.023462295532227, + "step": 172, + "token_acc": 0.014484141721755923 + }, + { + "epoch": 0.10143652887716212, + "grad_norm": 1.2596914152256267, + "learning_rate": 1.4196951934349353e-05, + "loss": 9.032676696777344, + "step": 173, + "token_acc": 0.014700998632404432 + }, + { + "epoch": 0.10202286719437115, + "grad_norm": 18.55502151573403, + "learning_rate": 1.4279015240328253e-05, + "loss": 9.127123832702637, + "step": 174, + "token_acc": 0.013074868278523877 + }, + { + "epoch": 0.10260920551158018, + "grad_norm": 9.186057786949455, + "learning_rate": 1.4361078546307149e-05, + "loss": 9.103059768676758, + "step": 175, + "token_acc": 0.012738161732762366 + }, + { + "epoch": 0.1031955438287892, + "grad_norm": 1.9558323444805998, + "learning_rate": 1.4443141852286049e-05, + "loss": 9.05103874206543, + "step": 176, + "token_acc": 0.013382028665931642 + }, + { + "epoch": 0.10378188214599825, + "grad_norm": 2.5295637996349667, + "learning_rate": 1.4525205158264945e-05, + "loss": 9.01467227935791, + "step": 177, + "token_acc": 0.012551975658660968 + }, + { + "epoch": 0.10436822046320728, + "grad_norm": 2.7714986381954905, + "learning_rate": 1.4607268464243845e-05, + "loss": 8.968669891357422, + "step": 178, + "token_acc": 0.013999376775353269 + }, + { + "epoch": 0.1049545587804163, + "grad_norm": 2.582398339449192, + "learning_rate": 1.468933177022274e-05, + "loss": 9.037277221679688, + "step": 179, + "token_acc": 0.014366541368012265 + }, + { + "epoch": 0.10554089709762533, + "grad_norm": 1.5364599226520086, + "learning_rate": 1.477139507620164e-05, + "loss": 9.023276329040527, + "step": 180, + "token_acc": 0.01389188308103248 + }, + { + "epoch": 0.10612723541483436, + "grad_norm": 2.6782551689863863, + "learning_rate": 1.4853458382180538e-05, + "loss": 9.028421401977539, + "step": 181, + "token_acc": 0.013619759173794741 + }, + { + "epoch": 0.10671357373204339, + "grad_norm": 1.1239082426817306, + "learning_rate": 1.4935521688159436e-05, + "loss": 8.895050048828125, + "step": 182, + "token_acc": 0.014584678882321608 + }, + { + "epoch": 0.10729991204925242, + "grad_norm": 2.334851579986876, + "learning_rate": 1.5017584994138334e-05, + "loss": 8.980743408203125, + "step": 183, + "token_acc": 0.014908741391693776 + }, + { + "epoch": 0.10788625036646145, + "grad_norm": 1.800946676496231, + "learning_rate": 1.5099648300117232e-05, + "loss": 8.919689178466797, + "step": 184, + "token_acc": 0.015940293972027862 + }, + { + "epoch": 0.10847258868367048, + "grad_norm": 2.1229729445682617, + "learning_rate": 1.518171160609613e-05, + "loss": 8.939411163330078, + "step": 185, + "token_acc": 0.015107531594334417 + }, + { + "epoch": 0.1090589270008795, + "grad_norm": 1.645077219388169, + "learning_rate": 1.526377491207503e-05, + "loss": 8.945083618164062, + "step": 186, + "token_acc": 0.014263792170958484 + }, + { + "epoch": 0.10964526531808853, + "grad_norm": 1.8219637248501346, + "learning_rate": 1.5345838218053926e-05, + "loss": 8.937055587768555, + "step": 187, + "token_acc": 0.014278861077404972 + }, + { + "epoch": 0.11023160363529756, + "grad_norm": 2.059649870790315, + "learning_rate": 1.5427901524032824e-05, + "loss": 8.867799758911133, + "step": 188, + "token_acc": 0.015887760914351422 + }, + { + "epoch": 0.11081794195250659, + "grad_norm": 0.9169745168696901, + "learning_rate": 1.5509964830011722e-05, + "loss": 8.889871597290039, + "step": 189, + "token_acc": 0.015403889840678043 + }, + { + "epoch": 0.11140428026971562, + "grad_norm": 2.609275098651519, + "learning_rate": 1.559202813599062e-05, + "loss": 8.920961380004883, + "step": 190, + "token_acc": 0.01487325605279374 + }, + { + "epoch": 0.11199061858692466, + "grad_norm": 1.8123467450507191, + "learning_rate": 1.567409144196952e-05, + "loss": 8.887933731079102, + "step": 191, + "token_acc": 0.014357918994870196 + }, + { + "epoch": 0.11257695690413369, + "grad_norm": 1.9861415498325221, + "learning_rate": 1.5756154747948416e-05, + "loss": 8.976934432983398, + "step": 192, + "token_acc": 0.015038570426742162 + }, + { + "epoch": 0.11316329522134272, + "grad_norm": 1.9241559785813132, + "learning_rate": 1.5838218053927314e-05, + "loss": 8.90761947631836, + "step": 193, + "token_acc": 0.014950170275573287 + }, + { + "epoch": 0.11374963353855175, + "grad_norm": 1.7736670508127494, + "learning_rate": 1.5920281359906212e-05, + "loss": 8.877954483032227, + "step": 194, + "token_acc": 0.015467830707914215 + }, + { + "epoch": 0.11433597185576078, + "grad_norm": 1.5080366661739326, + "learning_rate": 1.600234466588511e-05, + "loss": 8.865504264831543, + "step": 195, + "token_acc": 0.015455776173285198 + }, + { + "epoch": 0.1149223101729698, + "grad_norm": 2.438855266656718, + "learning_rate": 1.6084407971864008e-05, + "loss": 8.839834213256836, + "step": 196, + "token_acc": 0.016078665171384073 + }, + { + "epoch": 0.11550864849017883, + "grad_norm": 2.127691573927158, + "learning_rate": 1.6166471277842906e-05, + "loss": 8.907093048095703, + "step": 197, + "token_acc": 0.015969636207390434 + }, + { + "epoch": 0.11609498680738786, + "grad_norm": 3.0162677928241837, + "learning_rate": 1.6248534583821804e-05, + "loss": 8.805242538452148, + "step": 198, + "token_acc": 0.016205660322364396 + }, + { + "epoch": 0.11668132512459689, + "grad_norm": 2.3716217136494353, + "learning_rate": 1.6330597889800702e-05, + "loss": 8.783966064453125, + "step": 199, + "token_acc": 0.016431821422552922 + }, + { + "epoch": 0.11726766344180592, + "grad_norm": 3.2748743629557824, + "learning_rate": 1.64126611957796e-05, + "loss": 8.744998931884766, + "step": 200, + "token_acc": 0.015434622618366178 + }, + { + "epoch": 0.11785400175901495, + "grad_norm": 3.6470104627077555, + "learning_rate": 1.6494724501758498e-05, + "loss": 8.865713119506836, + "step": 201, + "token_acc": 0.015887437121664252 + }, + { + "epoch": 0.11844034007622398, + "grad_norm": 2.29250274115636, + "learning_rate": 1.6576787807737396e-05, + "loss": 8.724906921386719, + "step": 202, + "token_acc": 0.016373821165205565 + }, + { + "epoch": 0.119026678393433, + "grad_norm": 1.7072872873335114, + "learning_rate": 1.6658851113716294e-05, + "loss": 8.845634460449219, + "step": 203, + "token_acc": 0.01621559326650829 + }, + { + "epoch": 0.11961301671064203, + "grad_norm": 2.5391311504519374, + "learning_rate": 1.6740914419695192e-05, + "loss": 8.695985794067383, + "step": 204, + "token_acc": 0.017065286958205375 + }, + { + "epoch": 0.12019935502785108, + "grad_norm": 2.022096895331037, + "learning_rate": 1.682297772567409e-05, + "loss": 8.82380199432373, + "step": 205, + "token_acc": 0.016400515653967947 + }, + { + "epoch": 0.1207856933450601, + "grad_norm": 3.698623099299713, + "learning_rate": 1.6905041031652988e-05, + "loss": 8.76661491394043, + "step": 206, + "token_acc": 0.01617085006334142 + }, + { + "epoch": 0.12137203166226913, + "grad_norm": 3.562685569080831, + "learning_rate": 1.6987104337631886e-05, + "loss": 8.79345417022705, + "step": 207, + "token_acc": 0.015884653625936582 + }, + { + "epoch": 0.12195836997947816, + "grad_norm": 1.9644534950012935, + "learning_rate": 1.7069167643610784e-05, + "loss": 8.809921264648438, + "step": 208, + "token_acc": 0.01643835616438356 + }, + { + "epoch": 0.12254470829668719, + "grad_norm": 1.7221158612192842, + "learning_rate": 1.7151230949589682e-05, + "loss": 8.77109146118164, + "step": 209, + "token_acc": 0.015771729036000628 + }, + { + "epoch": 0.12313104661389622, + "grad_norm": 4.371569377140252, + "learning_rate": 1.723329425556858e-05, + "loss": 8.699161529541016, + "step": 210, + "token_acc": 0.016544503731353202 + }, + { + "epoch": 0.12371738493110525, + "grad_norm": 2.414047499445958, + "learning_rate": 1.7315357561547478e-05, + "loss": 8.802984237670898, + "step": 211, + "token_acc": 0.017387951453280207 + }, + { + "epoch": 0.12430372324831428, + "grad_norm": 5.732381832473199, + "learning_rate": 1.7397420867526376e-05, + "loss": 8.686369895935059, + "step": 212, + "token_acc": 0.016319822218106902 + }, + { + "epoch": 0.1248900615655233, + "grad_norm": 4.903230529198032, + "learning_rate": 1.7479484173505274e-05, + "loss": 8.629064559936523, + "step": 213, + "token_acc": 0.01756457036560063 + }, + { + "epoch": 0.12547639988273235, + "grad_norm": 2.6330107001735357, + "learning_rate": 1.7561547479484172e-05, + "loss": 8.737181663513184, + "step": 214, + "token_acc": 0.016296636656955167 + }, + { + "epoch": 0.12606273819994138, + "grad_norm": 1.8675640446596997, + "learning_rate": 1.764361078546307e-05, + "loss": 8.717007637023926, + "step": 215, + "token_acc": 0.015939513686357605 + }, + { + "epoch": 0.1266490765171504, + "grad_norm": 4.7828685900387535, + "learning_rate": 1.7725674091441968e-05, + "loss": 8.679970741271973, + "step": 216, + "token_acc": 0.017213250035849663 + }, + { + "epoch": 0.12723541483435943, + "grad_norm": 3.650826995216099, + "learning_rate": 1.7807737397420866e-05, + "loss": 8.63546371459961, + "step": 217, + "token_acc": 0.016065830721003135 + }, + { + "epoch": 0.12782175315156846, + "grad_norm": 3.51135271677668, + "learning_rate": 1.7889800703399764e-05, + "loss": 8.736555099487305, + "step": 218, + "token_acc": 0.016175878338191875 + }, + { + "epoch": 0.1284080914687775, + "grad_norm": 3.5710733832505324, + "learning_rate": 1.7971864009378662e-05, + "loss": 8.606273651123047, + "step": 219, + "token_acc": 0.01632660509323344 + }, + { + "epoch": 0.12899442978598652, + "grad_norm": 3.181540331467067, + "learning_rate": 1.8053927315357556e-05, + "loss": 8.612173080444336, + "step": 220, + "token_acc": 0.01634141056200686 + }, + { + "epoch": 0.12958076810319555, + "grad_norm": 2.6383588906249344, + "learning_rate": 1.8135990621336458e-05, + "loss": 8.658451080322266, + "step": 221, + "token_acc": 0.017745876685469356 + }, + { + "epoch": 0.13016710642040458, + "grad_norm": 3.941296506495943, + "learning_rate": 1.8218053927315356e-05, + "loss": 8.594991683959961, + "step": 222, + "token_acc": 0.017774013841288325 + }, + { + "epoch": 0.1307534447376136, + "grad_norm": 3.6636457347707627, + "learning_rate": 1.8300117233294254e-05, + "loss": 8.673286437988281, + "step": 223, + "token_acc": 0.0164903084174946 + }, + { + "epoch": 0.13133978305482263, + "grad_norm": 3.808915854304146, + "learning_rate": 1.8382180539273155e-05, + "loss": 8.668746948242188, + "step": 224, + "token_acc": 0.016302585039617366 + }, + { + "epoch": 0.13192612137203166, + "grad_norm": 2.3985883902435345, + "learning_rate": 1.846424384525205e-05, + "loss": 8.526185989379883, + "step": 225, + "token_acc": 0.01716734873032881 + }, + { + "epoch": 0.1325124596892407, + "grad_norm": 3.6023911529966663, + "learning_rate": 1.8546307151230948e-05, + "loss": 8.546903610229492, + "step": 226, + "token_acc": 0.01680478475136238 + }, + { + "epoch": 0.13309879800644972, + "grad_norm": 2.6307965541736964, + "learning_rate": 1.8628370457209846e-05, + "loss": 8.570002555847168, + "step": 227, + "token_acc": 0.016355015857474297 + }, + { + "epoch": 0.13368513632365875, + "grad_norm": 3.4037061692756354, + "learning_rate": 1.8710433763188744e-05, + "loss": 8.559468269348145, + "step": 228, + "token_acc": 0.016592329700163905 + }, + { + "epoch": 0.13427147464086778, + "grad_norm": 2.823218410919002, + "learning_rate": 1.8792497069167642e-05, + "loss": 8.55791187286377, + "step": 229, + "token_acc": 0.01710821801408109 + }, + { + "epoch": 0.1348578129580768, + "grad_norm": 3.812520602189062, + "learning_rate": 1.887456037514654e-05, + "loss": 8.501962661743164, + "step": 230, + "token_acc": 0.017194684134282295 + }, + { + "epoch": 0.13544415127528583, + "grad_norm": 3.193214573723467, + "learning_rate": 1.8956623681125438e-05, + "loss": 8.538557052612305, + "step": 231, + "token_acc": 0.01703534659370867 + }, + { + "epoch": 0.13603048959249486, + "grad_norm": 3.759886324853259, + "learning_rate": 1.9038686987104336e-05, + "loss": 8.469820022583008, + "step": 232, + "token_acc": 0.01721747320951697 + }, + { + "epoch": 0.1366168279097039, + "grad_norm": 2.8283854072183128, + "learning_rate": 1.9120750293083237e-05, + "loss": 8.443784713745117, + "step": 233, + "token_acc": 0.019059061761727978 + }, + { + "epoch": 0.13720316622691292, + "grad_norm": 3.954198915207869, + "learning_rate": 1.920281359906213e-05, + "loss": 8.47078800201416, + "step": 234, + "token_acc": 0.01827121258631942 + }, + { + "epoch": 0.13778950454412195, + "grad_norm": 4.480390519163684, + "learning_rate": 1.928487690504103e-05, + "loss": 8.386969566345215, + "step": 235, + "token_acc": 0.01862292796845987 + }, + { + "epoch": 0.13837584286133098, + "grad_norm": 2.4160714716488725, + "learning_rate": 1.9366940211019928e-05, + "loss": 8.319723129272461, + "step": 236, + "token_acc": 0.018424398087135287 + }, + { + "epoch": 0.13896218117854, + "grad_norm": 4.574217363915171, + "learning_rate": 1.944900351699883e-05, + "loss": 8.428250312805176, + "step": 237, + "token_acc": 0.01719873272495711 + }, + { + "epoch": 0.13954851949574903, + "grad_norm": 3.142113642341402, + "learning_rate": 1.9531066822977724e-05, + "loss": 8.369216918945312, + "step": 238, + "token_acc": 0.020373488364952074 + }, + { + "epoch": 0.14013485781295806, + "grad_norm": 6.991475948022251, + "learning_rate": 1.961313012895662e-05, + "loss": 8.391242980957031, + "step": 239, + "token_acc": 0.01896038375651311 + }, + { + "epoch": 0.14072119613016712, + "grad_norm": 4.36073806027156, + "learning_rate": 1.969519343493552e-05, + "loss": 8.310012817382812, + "step": 240, + "token_acc": 0.0186510248740758 + }, + { + "epoch": 0.14130753444737615, + "grad_norm": 6.04086116037777, + "learning_rate": 1.9777256740914418e-05, + "loss": 8.3284912109375, + "step": 241, + "token_acc": 0.020519232382607094 + }, + { + "epoch": 0.14189387276458518, + "grad_norm": 4.783184744366244, + "learning_rate": 1.985932004689332e-05, + "loss": 8.355376243591309, + "step": 242, + "token_acc": 0.01939424037023927 + }, + { + "epoch": 0.1424802110817942, + "grad_norm": 5.292141128220087, + "learning_rate": 1.9941383352872213e-05, + "loss": 8.244965553283691, + "step": 243, + "token_acc": 0.01935396271504026 + }, + { + "epoch": 0.14306654939900323, + "grad_norm": 3.5960840951276247, + "learning_rate": 2.002344665885111e-05, + "loss": 8.289033889770508, + "step": 244, + "token_acc": 0.019544080687080436 + }, + { + "epoch": 0.14365288771621226, + "grad_norm": 7.075470761558914, + "learning_rate": 2.010550996483001e-05, + "loss": 8.371967315673828, + "step": 245, + "token_acc": 0.019690230526722086 + }, + { + "epoch": 0.1442392260334213, + "grad_norm": 6.061144658817595, + "learning_rate": 2.018757327080891e-05, + "loss": 8.17292594909668, + "step": 246, + "token_acc": 0.02201123450576521 + }, + { + "epoch": 0.14482556435063032, + "grad_norm": 5.3686971169578825, + "learning_rate": 2.0269636576787805e-05, + "loss": 8.156320571899414, + "step": 247, + "token_acc": 0.020546058744945212 + }, + { + "epoch": 0.14541190266783935, + "grad_norm": 5.813999207393693, + "learning_rate": 2.0351699882766703e-05, + "loss": 8.236780166625977, + "step": 248, + "token_acc": 0.01902194979507667 + }, + { + "epoch": 0.14599824098504838, + "grad_norm": 5.754271220090866, + "learning_rate": 2.04337631887456e-05, + "loss": 8.23939323425293, + "step": 249, + "token_acc": 0.020121951219512196 + }, + { + "epoch": 0.1465845793022574, + "grad_norm": 6.106856803911396, + "learning_rate": 2.05158264947245e-05, + "loss": 8.162118911743164, + "step": 250, + "token_acc": 0.020759554231215255 + }, + { + "epoch": 0.14717091761946643, + "grad_norm": 3.4240663409771495, + "learning_rate": 2.05978898007034e-05, + "loss": 8.219804763793945, + "step": 251, + "token_acc": 0.019182313256929203 + }, + { + "epoch": 0.14775725593667546, + "grad_norm": 4.822865595006713, + "learning_rate": 2.0679953106682295e-05, + "loss": 8.139307022094727, + "step": 252, + "token_acc": 0.020586942415171748 + }, + { + "epoch": 0.1483435942538845, + "grad_norm": 6.702502171739605, + "learning_rate": 2.0762016412661193e-05, + "loss": 8.039722442626953, + "step": 253, + "token_acc": 0.02168764768616298 + }, + { + "epoch": 0.14892993257109352, + "grad_norm": 3.9043752850950275, + "learning_rate": 2.084407971864009e-05, + "loss": 8.088377952575684, + "step": 254, + "token_acc": 0.021260662721157533 + }, + { + "epoch": 0.14951627088830255, + "grad_norm": 4.284181847635292, + "learning_rate": 2.0926143024618993e-05, + "loss": 8.146215438842773, + "step": 255, + "token_acc": 0.021590979493216034 + }, + { + "epoch": 0.15010260920551158, + "grad_norm": 3.491380624884426, + "learning_rate": 2.1008206330597887e-05, + "loss": 8.097631454467773, + "step": 256, + "token_acc": 0.022211920316846115 + }, + { + "epoch": 0.1506889475227206, + "grad_norm": 11.848809112024556, + "learning_rate": 2.1090269636576785e-05, + "loss": 8.041440963745117, + "step": 257, + "token_acc": 0.021933909901265505 + }, + { + "epoch": 0.15127528583992964, + "grad_norm": 9.172985930552384, + "learning_rate": 2.1172332942555683e-05, + "loss": 8.039102554321289, + "step": 258, + "token_acc": 0.022924036929887474 + }, + { + "epoch": 0.15186162415713866, + "grad_norm": 9.194445948985182, + "learning_rate": 2.1254396248534585e-05, + "loss": 8.055174827575684, + "step": 259, + "token_acc": 0.024041792857532135 + }, + { + "epoch": 0.1524479624743477, + "grad_norm": 6.509250229713062, + "learning_rate": 2.133645955451348e-05, + "loss": 7.9800825119018555, + "step": 260, + "token_acc": 0.022898969817458883 + }, + { + "epoch": 0.15303430079155672, + "grad_norm": 10.140065009616492, + "learning_rate": 2.1418522860492377e-05, + "loss": 8.078319549560547, + "step": 261, + "token_acc": 0.020740872961138922 + }, + { + "epoch": 0.15362063910876575, + "grad_norm": 4.3058758575889104, + "learning_rate": 2.1500586166471275e-05, + "loss": 7.975379467010498, + "step": 262, + "token_acc": 0.023266629666173325 + }, + { + "epoch": 0.15420697742597478, + "grad_norm": 12.060044818859536, + "learning_rate": 2.1582649472450173e-05, + "loss": 8.065200805664062, + "step": 263, + "token_acc": 0.024049699087555814 + }, + { + "epoch": 0.1547933157431838, + "grad_norm": 10.333809573113374, + "learning_rate": 2.1664712778429075e-05, + "loss": 7.982080459594727, + "step": 264, + "token_acc": 0.024507800390212303 + }, + { + "epoch": 0.15537965406039284, + "grad_norm": 6.81295307370986, + "learning_rate": 2.174677608440797e-05, + "loss": 7.957369327545166, + "step": 265, + "token_acc": 0.024173902818712345 + }, + { + "epoch": 0.15596599237760186, + "grad_norm": 6.357761566520793, + "learning_rate": 2.1828839390386867e-05, + "loss": 7.925359725952148, + "step": 266, + "token_acc": 0.02534466714108088 + }, + { + "epoch": 0.1565523306948109, + "grad_norm": 6.6801064329722815, + "learning_rate": 2.1910902696365765e-05, + "loss": 7.931022644042969, + "step": 267, + "token_acc": 0.024822094433108313 + }, + { + "epoch": 0.15713866901201995, + "grad_norm": 6.844260065703394, + "learning_rate": 2.1992966002344666e-05, + "loss": 7.986782073974609, + "step": 268, + "token_acc": 0.0230308800145842 + }, + { + "epoch": 0.15772500732922898, + "grad_norm": 4.084529110818654, + "learning_rate": 2.207502930832356e-05, + "loss": 7.826689720153809, + "step": 269, + "token_acc": 0.02608825362662419 + }, + { + "epoch": 0.158311345646438, + "grad_norm": 6.692001490056697, + "learning_rate": 2.215709261430246e-05, + "loss": 7.939952850341797, + "step": 270, + "token_acc": 0.024584668915782988 + }, + { + "epoch": 0.15889768396364704, + "grad_norm": 3.795413463681977, + "learning_rate": 2.2239155920281357e-05, + "loss": 7.781843185424805, + "step": 271, + "token_acc": 0.02768234655302416 + }, + { + "epoch": 0.15948402228085606, + "grad_norm": 8.508992899379706, + "learning_rate": 2.232121922626026e-05, + "loss": 7.877396106719971, + "step": 272, + "token_acc": 0.024171704610924712 + }, + { + "epoch": 0.1600703605980651, + "grad_norm": 5.624666565897748, + "learning_rate": 2.2403282532239156e-05, + "loss": 7.861850261688232, + "step": 273, + "token_acc": 0.024692416742819932 + }, + { + "epoch": 0.16065669891527412, + "grad_norm": 7.457730027433706, + "learning_rate": 2.248534583821805e-05, + "loss": 7.903068542480469, + "step": 274, + "token_acc": 0.025924767043416543 + }, + { + "epoch": 0.16124303723248315, + "grad_norm": 5.799148446749172, + "learning_rate": 2.256740914419695e-05, + "loss": 7.890822887420654, + "step": 275, + "token_acc": 0.025598293273243197 + }, + { + "epoch": 0.16182937554969218, + "grad_norm": 6.838781032891721, + "learning_rate": 2.2649472450175847e-05, + "loss": 7.878671646118164, + "step": 276, + "token_acc": 0.024970285908336925 + }, + { + "epoch": 0.1624157138669012, + "grad_norm": 5.351665955003442, + "learning_rate": 2.273153575615475e-05, + "loss": 7.777127265930176, + "step": 277, + "token_acc": 0.02718523177169251 + }, + { + "epoch": 0.16300205218411024, + "grad_norm": 5.548791419889535, + "learning_rate": 2.2813599062133643e-05, + "loss": 7.721002101898193, + "step": 278, + "token_acc": 0.028221501044071933 + }, + { + "epoch": 0.16358839050131926, + "grad_norm": 3.7459441569023872, + "learning_rate": 2.289566236811254e-05, + "loss": 7.737344741821289, + "step": 279, + "token_acc": 0.02737406035831494 + }, + { + "epoch": 0.1641747288185283, + "grad_norm": 5.710591638749095, + "learning_rate": 2.297772567409144e-05, + "loss": 7.6797027587890625, + "step": 280, + "token_acc": 0.028434778283599086 + }, + { + "epoch": 0.16476106713573732, + "grad_norm": 2.8138814454535974, + "learning_rate": 2.305978898007034e-05, + "loss": 7.641755104064941, + "step": 281, + "token_acc": 0.03108317759822257 + }, + { + "epoch": 0.16534740545294635, + "grad_norm": 5.5663994352506245, + "learning_rate": 2.3141852286049238e-05, + "loss": 7.639070510864258, + "step": 282, + "token_acc": 0.03185563805553715 + }, + { + "epoch": 0.16593374377015538, + "grad_norm": 3.7226460692409256, + "learning_rate": 2.3223915592028133e-05, + "loss": 7.719721794128418, + "step": 283, + "token_acc": 0.02765388973791182 + }, + { + "epoch": 0.1665200820873644, + "grad_norm": 5.714925507398263, + "learning_rate": 2.330597889800703e-05, + "loss": 7.6826629638671875, + "step": 284, + "token_acc": 0.02806624473945453 + }, + { + "epoch": 0.16710642040457344, + "grad_norm": 4.2376831842455776, + "learning_rate": 2.3388042203985932e-05, + "loss": 7.5432844161987305, + "step": 285, + "token_acc": 0.03027291593041454 + }, + { + "epoch": 0.16769275872178246, + "grad_norm": 3.0580963196406143, + "learning_rate": 2.347010550996483e-05, + "loss": 7.636653423309326, + "step": 286, + "token_acc": 0.03028059676938421 + }, + { + "epoch": 0.1682790970389915, + "grad_norm": 5.686907198101258, + "learning_rate": 2.3552168815943725e-05, + "loss": 7.544497489929199, + "step": 287, + "token_acc": 0.03165042220222115 + }, + { + "epoch": 0.16886543535620052, + "grad_norm": 5.610947848626733, + "learning_rate": 2.3634232121922623e-05, + "loss": 7.56590461730957, + "step": 288, + "token_acc": 0.030103740895081017 + }, + { + "epoch": 0.16945177367340955, + "grad_norm": 3.708955920099665, + "learning_rate": 2.371629542790152e-05, + "loss": 7.51408576965332, + "step": 289, + "token_acc": 0.03282895792118912 + }, + { + "epoch": 0.17003811199061858, + "grad_norm": 5.868576732270801, + "learning_rate": 2.3798358733880422e-05, + "loss": 7.476420879364014, + "step": 290, + "token_acc": 0.033566991508519974 + }, + { + "epoch": 0.1706244503078276, + "grad_norm": 6.098114302318514, + "learning_rate": 2.3880422039859317e-05, + "loss": 7.493291854858398, + "step": 291, + "token_acc": 0.033254375744228626 + }, + { + "epoch": 0.17121078862503664, + "grad_norm": 2.5394875645795953, + "learning_rate": 2.3962485345838215e-05, + "loss": 7.340261459350586, + "step": 292, + "token_acc": 0.03699549835248983 + }, + { + "epoch": 0.17179712694224566, + "grad_norm": 10.201615528266453, + "learning_rate": 2.4044548651817113e-05, + "loss": 7.498136520385742, + "step": 293, + "token_acc": 0.03217249717796288 + }, + { + "epoch": 0.1723834652594547, + "grad_norm": 6.85334552310493, + "learning_rate": 2.4126611957796014e-05, + "loss": 7.466062545776367, + "step": 294, + "token_acc": 0.03243098225425411 + }, + { + "epoch": 0.17296980357666372, + "grad_norm": 10.365558575423215, + "learning_rate": 2.4208675263774912e-05, + "loss": 7.455526351928711, + "step": 295, + "token_acc": 0.03454595606762255 + }, + { + "epoch": 0.17355614189387278, + "grad_norm": 9.3288213244638, + "learning_rate": 2.4290738569753807e-05, + "loss": 7.399315357208252, + "step": 296, + "token_acc": 0.037120165306501564 + }, + { + "epoch": 0.1741424802110818, + "grad_norm": 4.293712622406385, + "learning_rate": 2.4372801875732705e-05, + "loss": 7.3079681396484375, + "step": 297, + "token_acc": 0.03697229640419735 + }, + { + "epoch": 0.17472881852829084, + "grad_norm": 6.908889227251877, + "learning_rate": 2.4454865181711603e-05, + "loss": 7.377861976623535, + "step": 298, + "token_acc": 0.034878200022312913 + }, + { + "epoch": 0.17531515684549986, + "grad_norm": 5.979617318144951, + "learning_rate": 2.4536928487690504e-05, + "loss": 7.199686050415039, + "step": 299, + "token_acc": 0.03668531704301895 + }, + { + "epoch": 0.1759014951627089, + "grad_norm": 4.574131899597824, + "learning_rate": 2.46189917936694e-05, + "loss": 7.230719566345215, + "step": 300, + "token_acc": 0.038200730262956034 + }, + { + "epoch": 0.17648783347991792, + "grad_norm": 4.954127023170958, + "learning_rate": 2.4701055099648297e-05, + "loss": 7.380388259887695, + "step": 301, + "token_acc": 0.03557278409577408 + }, + { + "epoch": 0.17707417179712695, + "grad_norm": 6.066811010061318, + "learning_rate": 2.4783118405627195e-05, + "loss": 7.241059303283691, + "step": 302, + "token_acc": 0.03938199338987775 + }, + { + "epoch": 0.17766051011433598, + "grad_norm": 4.50315139186192, + "learning_rate": 2.4865181711606096e-05, + "loss": 7.194522380828857, + "step": 303, + "token_acc": 0.04079912776458071 + }, + { + "epoch": 0.178246848431545, + "grad_norm": 8.445854335168255, + "learning_rate": 2.4947245017584994e-05, + "loss": 7.182653427124023, + "step": 304, + "token_acc": 0.04091390896674692 + }, + { + "epoch": 0.17883318674875404, + "grad_norm": 3.3369167266364856, + "learning_rate": 2.502930832356389e-05, + "loss": 7.139228820800781, + "step": 305, + "token_acc": 0.04075027757435449 + }, + { + "epoch": 0.17941952506596306, + "grad_norm": 11.262799050853033, + "learning_rate": 2.5111371629542786e-05, + "loss": 7.123529434204102, + "step": 306, + "token_acc": 0.03972182503574678 + }, + { + "epoch": 0.1800058633831721, + "grad_norm": 7.866751245262843, + "learning_rate": 2.5193434935521688e-05, + "loss": 7.234379291534424, + "step": 307, + "token_acc": 0.039482702349869454 + }, + { + "epoch": 0.18059220170038112, + "grad_norm": 9.196175260009392, + "learning_rate": 2.5275498241500586e-05, + "loss": 7.211536407470703, + "step": 308, + "token_acc": 0.040580688893984716 + }, + { + "epoch": 0.18117854001759015, + "grad_norm": 6.922032685757308, + "learning_rate": 2.535756154747948e-05, + "loss": 7.189626693725586, + "step": 309, + "token_acc": 0.04212458921968454 + }, + { + "epoch": 0.18176487833479918, + "grad_norm": 8.383231703396527, + "learning_rate": 2.543962485345838e-05, + "loss": 7.084568977355957, + "step": 310, + "token_acc": 0.04413164237465217 + }, + { + "epoch": 0.1823512166520082, + "grad_norm": 6.779468807751188, + "learning_rate": 2.5521688159437276e-05, + "loss": 7.10654354095459, + "step": 311, + "token_acc": 0.04092412734538333 + }, + { + "epoch": 0.18293755496921724, + "grad_norm": 5.594294123579753, + "learning_rate": 2.5603751465416178e-05, + "loss": 6.957389831542969, + "step": 312, + "token_acc": 0.04682504123984278 + }, + { + "epoch": 0.18352389328642627, + "grad_norm": 5.268304731052235, + "learning_rate": 2.5685814771395076e-05, + "loss": 7.018155574798584, + "step": 313, + "token_acc": 0.04502595830806588 + }, + { + "epoch": 0.1841102316036353, + "grad_norm": 4.243679147352299, + "learning_rate": 2.576787807737397e-05, + "loss": 6.998994827270508, + "step": 314, + "token_acc": 0.04605518946455452 + }, + { + "epoch": 0.18469656992084432, + "grad_norm": 5.069872560490284, + "learning_rate": 2.584994138335287e-05, + "loss": 6.891481399536133, + "step": 315, + "token_acc": 0.050647868043256154 + }, + { + "epoch": 0.18528290823805335, + "grad_norm": 5.966297941491455, + "learning_rate": 2.593200468933177e-05, + "loss": 6.871021270751953, + "step": 316, + "token_acc": 0.0521790462501925 + }, + { + "epoch": 0.18586924655526238, + "grad_norm": 6.114684395876698, + "learning_rate": 2.6014067995310668e-05, + "loss": 6.890596389770508, + "step": 317, + "token_acc": 0.0506723860479895 + }, + { + "epoch": 0.1864555848724714, + "grad_norm": 5.512745573353237, + "learning_rate": 2.6096131301289562e-05, + "loss": 6.8679633140563965, + "step": 318, + "token_acc": 0.051609993312215836 + }, + { + "epoch": 0.18704192318968044, + "grad_norm": 7.477696147298044, + "learning_rate": 2.617819460726846e-05, + "loss": 6.932560920715332, + "step": 319, + "token_acc": 0.049224795280894844 + }, + { + "epoch": 0.18762826150688947, + "grad_norm": 2.9716454164041948, + "learning_rate": 2.626025791324736e-05, + "loss": 6.843957901000977, + "step": 320, + "token_acc": 0.0517449452757845 + }, + { + "epoch": 0.1882145998240985, + "grad_norm": 10.569004721389046, + "learning_rate": 2.634232121922626e-05, + "loss": 6.895000457763672, + "step": 321, + "token_acc": 0.04795219022188314 + }, + { + "epoch": 0.18880093814130752, + "grad_norm": 5.598411440654056, + "learning_rate": 2.6424384525205158e-05, + "loss": 6.817564964294434, + "step": 322, + "token_acc": 0.0520371671412045 + }, + { + "epoch": 0.18938727645851655, + "grad_norm": 9.834785046839244, + "learning_rate": 2.6506447831184052e-05, + "loss": 6.821771621704102, + "step": 323, + "token_acc": 0.05322270779234488 + }, + { + "epoch": 0.18997361477572558, + "grad_norm": 7.058848236846204, + "learning_rate": 2.658851113716295e-05, + "loss": 6.7929582595825195, + "step": 324, + "token_acc": 0.05393500134120877 + }, + { + "epoch": 0.19055995309293464, + "grad_norm": 6.6178315405620065, + "learning_rate": 2.667057444314185e-05, + "loss": 6.666149139404297, + "step": 325, + "token_acc": 0.056866230515177654 + }, + { + "epoch": 0.19114629141014366, + "grad_norm": 7.00483924683113, + "learning_rate": 2.675263774912075e-05, + "loss": 6.666651725769043, + "step": 326, + "token_acc": 0.05690540037547512 + }, + { + "epoch": 0.1917326297273527, + "grad_norm": 5.079934705961767, + "learning_rate": 2.6834701055099644e-05, + "loss": 6.710475921630859, + "step": 327, + "token_acc": 0.05638290431795751 + }, + { + "epoch": 0.19231896804456172, + "grad_norm": 7.277159360110486, + "learning_rate": 2.6916764361078542e-05, + "loss": 6.624210357666016, + "step": 328, + "token_acc": 0.06208354033103861 + }, + { + "epoch": 0.19290530636177075, + "grad_norm": 6.8930766517698965, + "learning_rate": 2.6998827667057443e-05, + "loss": 6.631856918334961, + "step": 329, + "token_acc": 0.05746466339686679 + }, + { + "epoch": 0.19349164467897978, + "grad_norm": 10.611747699040464, + "learning_rate": 2.708089097303634e-05, + "loss": 6.647503852844238, + "step": 330, + "token_acc": 0.05955072394054548 + }, + { + "epoch": 0.1940779829961888, + "grad_norm": 5.910850637883799, + "learning_rate": 2.7162954279015236e-05, + "loss": 6.596729278564453, + "step": 331, + "token_acc": 0.061496661220145535 + }, + { + "epoch": 0.19466432131339784, + "grad_norm": 10.37649063178904, + "learning_rate": 2.7245017584994134e-05, + "loss": 6.716401100158691, + "step": 332, + "token_acc": 0.05534771432103188 + }, + { + "epoch": 0.19525065963060687, + "grad_norm": 8.205825164860919, + "learning_rate": 2.7327080890973035e-05, + "loss": 6.575899600982666, + "step": 333, + "token_acc": 0.06262224770822368 + }, + { + "epoch": 0.1958369979478159, + "grad_norm": 6.738555686603056, + "learning_rate": 2.7409144196951933e-05, + "loss": 6.6320271492004395, + "step": 334, + "token_acc": 0.05921731618499614 + }, + { + "epoch": 0.19642333626502492, + "grad_norm": 7.608194715912897, + "learning_rate": 2.749120750293083e-05, + "loss": 6.515623092651367, + "step": 335, + "token_acc": 0.06527202512637102 + }, + { + "epoch": 0.19700967458223395, + "grad_norm": 7.247446421640857, + "learning_rate": 2.7573270808909726e-05, + "loss": 6.562398910522461, + "step": 336, + "token_acc": 0.062424494533715603 + }, + { + "epoch": 0.19759601289944298, + "grad_norm": 5.647851292128922, + "learning_rate": 2.7655334114888624e-05, + "loss": 6.436605453491211, + "step": 337, + "token_acc": 0.07144093387787155 + }, + { + "epoch": 0.198182351216652, + "grad_norm": 6.546820186865682, + "learning_rate": 2.7737397420867525e-05, + "loss": 6.478397846221924, + "step": 338, + "token_acc": 0.06827630154860767 + }, + { + "epoch": 0.19876868953386104, + "grad_norm": 3.3966243818453035, + "learning_rate": 2.7819460726846423e-05, + "loss": 6.496657848358154, + "step": 339, + "token_acc": 0.06569103307490173 + }, + { + "epoch": 0.19935502785107007, + "grad_norm": 8.922646670057471, + "learning_rate": 2.7901524032825318e-05, + "loss": 6.423449993133545, + "step": 340, + "token_acc": 0.06927440159012095 + }, + { + "epoch": 0.1999413661682791, + "grad_norm": 5.990033084623942, + "learning_rate": 2.7983587338804216e-05, + "loss": 6.516849994659424, + "step": 341, + "token_acc": 0.06816905490842558 + }, + { + "epoch": 0.20052770448548812, + "grad_norm": 6.336726318293391, + "learning_rate": 2.8065650644783117e-05, + "loss": 6.381641387939453, + "step": 342, + "token_acc": 0.0721427706513254 + }, + { + "epoch": 0.20111404280269715, + "grad_norm": 6.29280806854031, + "learning_rate": 2.8147713950762015e-05, + "loss": 6.475069999694824, + "step": 343, + "token_acc": 0.06888381798184752 + }, + { + "epoch": 0.20170038111990618, + "grad_norm": 6.709775694827274, + "learning_rate": 2.8229777256740913e-05, + "loss": 6.38390588760376, + "step": 344, + "token_acc": 0.06849427428402091 + }, + { + "epoch": 0.2022867194371152, + "grad_norm": 6.787138640691426, + "learning_rate": 2.8311840562719808e-05, + "loss": 6.440901279449463, + "step": 345, + "token_acc": 0.06869756266783159 + }, + { + "epoch": 0.20287305775432424, + "grad_norm": 8.186447429958918, + "learning_rate": 2.8393903868698706e-05, + "loss": 6.4044084548950195, + "step": 346, + "token_acc": 0.06835312836502146 + }, + { + "epoch": 0.20345939607153327, + "grad_norm": 5.239185760055508, + "learning_rate": 2.8475967174677607e-05, + "loss": 6.290832042694092, + "step": 347, + "token_acc": 0.07705955315507686 + }, + { + "epoch": 0.2040457343887423, + "grad_norm": 2.6794803391264366, + "learning_rate": 2.8558030480656505e-05, + "loss": 6.281411647796631, + "step": 348, + "token_acc": 0.07904037878030076 + }, + { + "epoch": 0.20463207270595132, + "grad_norm": 9.189856019533304, + "learning_rate": 2.86400937866354e-05, + "loss": 6.379359722137451, + "step": 349, + "token_acc": 0.07438556255190483 + }, + { + "epoch": 0.20521841102316035, + "grad_norm": 5.985019382753672, + "learning_rate": 2.8722157092614298e-05, + "loss": 6.338669300079346, + "step": 350, + "token_acc": 0.07450111772138301 + }, + { + "epoch": 0.20580474934036938, + "grad_norm": 8.27772835875237, + "learning_rate": 2.88042203985932e-05, + "loss": 6.249000549316406, + "step": 351, + "token_acc": 0.07807650273224044 + }, + { + "epoch": 0.2063910876575784, + "grad_norm": 4.968068790964617, + "learning_rate": 2.8886283704572097e-05, + "loss": 6.238777160644531, + "step": 352, + "token_acc": 0.07617823458105995 + }, + { + "epoch": 0.20697742597478747, + "grad_norm": 7.353257839321968, + "learning_rate": 2.8968347010550995e-05, + "loss": 6.330348014831543, + "step": 353, + "token_acc": 0.07336546316393351 + }, + { + "epoch": 0.2075637642919965, + "grad_norm": 3.966773831948239, + "learning_rate": 2.905041031652989e-05, + "loss": 6.253887176513672, + "step": 354, + "token_acc": 0.07955167491332954 + }, + { + "epoch": 0.20815010260920552, + "grad_norm": 8.548288866727106, + "learning_rate": 2.913247362250879e-05, + "loss": 6.187613010406494, + "step": 355, + "token_acc": 0.08292580982236154 + }, + { + "epoch": 0.20873644092641455, + "grad_norm": 6.2221681668528195, + "learning_rate": 2.921453692848769e-05, + "loss": 6.29707670211792, + "step": 356, + "token_acc": 0.07706845373118851 + }, + { + "epoch": 0.20932277924362358, + "grad_norm": 5.175913773085352, + "learning_rate": 2.9296600234466587e-05, + "loss": 6.221586227416992, + "step": 357, + "token_acc": 0.07976189547023824 + }, + { + "epoch": 0.2099091175608326, + "grad_norm": 7.011518564505431, + "learning_rate": 2.937866354044548e-05, + "loss": 6.142030715942383, + "step": 358, + "token_acc": 0.08658054366246258 + }, + { + "epoch": 0.21049545587804164, + "grad_norm": 5.761545102540839, + "learning_rate": 2.946072684642438e-05, + "loss": 6.1075592041015625, + "step": 359, + "token_acc": 0.08501039490152336 + }, + { + "epoch": 0.21108179419525067, + "grad_norm": 5.250445543210576, + "learning_rate": 2.954279015240328e-05, + "loss": 6.084338188171387, + "step": 360, + "token_acc": 0.09044136552678385 + }, + { + "epoch": 0.2116681325124597, + "grad_norm": 5.389902101025379, + "learning_rate": 2.962485345838218e-05, + "loss": 6.151660919189453, + "step": 361, + "token_acc": 0.08483443484859904 + }, + { + "epoch": 0.21225447082966872, + "grad_norm": 4.914817189225502, + "learning_rate": 2.9706916764361077e-05, + "loss": 6.1269402503967285, + "step": 362, + "token_acc": 0.08605764599599587 + }, + { + "epoch": 0.21284080914687775, + "grad_norm": 6.574694364726248, + "learning_rate": 2.978898007033997e-05, + "loss": 6.026739120483398, + "step": 363, + "token_acc": 0.08999350058494736 + }, + { + "epoch": 0.21342714746408678, + "grad_norm": 7.7171852478008285, + "learning_rate": 2.9871043376318873e-05, + "loss": 6.097996234893799, + "step": 364, + "token_acc": 0.0869426784567914 + }, + { + "epoch": 0.2140134857812958, + "grad_norm": 4.509688260607066, + "learning_rate": 2.995310668229777e-05, + "loss": 5.972391128540039, + "step": 365, + "token_acc": 0.09577491295756092 + }, + { + "epoch": 0.21459982409850484, + "grad_norm": 5.458653278006507, + "learning_rate": 3.003516998827667e-05, + "loss": 5.933320045471191, + "step": 366, + "token_acc": 0.09642404934035861 + }, + { + "epoch": 0.21518616241571387, + "grad_norm": 5.683848252394637, + "learning_rate": 3.0117233294255564e-05, + "loss": 5.965089797973633, + "step": 367, + "token_acc": 0.09715046809164607 + }, + { + "epoch": 0.2157725007329229, + "grad_norm": 5.5727047201970805, + "learning_rate": 3.0199296600234465e-05, + "loss": 5.945783615112305, + "step": 368, + "token_acc": 0.0934168092457757 + }, + { + "epoch": 0.21635883905013192, + "grad_norm": 7.993806798397042, + "learning_rate": 3.0281359906213363e-05, + "loss": 5.8814473152160645, + "step": 369, + "token_acc": 0.10355702275535555 + }, + { + "epoch": 0.21694517736734095, + "grad_norm": 4.242216756776699, + "learning_rate": 3.036342321219226e-05, + "loss": 5.863864421844482, + "step": 370, + "token_acc": 0.10104954264262676 + }, + { + "epoch": 0.21753151568454998, + "grad_norm": 7.12559426922248, + "learning_rate": 3.0445486518171155e-05, + "loss": 5.851903438568115, + "step": 371, + "token_acc": 0.0988907776975681 + }, + { + "epoch": 0.218117854001759, + "grad_norm": 2.562175766758614, + "learning_rate": 3.052754982415006e-05, + "loss": 5.892111301422119, + "step": 372, + "token_acc": 0.10034013155282627 + }, + { + "epoch": 0.21870419231896804, + "grad_norm": 8.906073138903627, + "learning_rate": 3.0609613130128955e-05, + "loss": 5.915294647216797, + "step": 373, + "token_acc": 0.09728479833643808 + }, + { + "epoch": 0.21929053063617707, + "grad_norm": 5.76900127275279, + "learning_rate": 3.069167643610785e-05, + "loss": 5.888400077819824, + "step": 374, + "token_acc": 0.09916352260368695 + }, + { + "epoch": 0.2198768689533861, + "grad_norm": 7.010054252869409, + "learning_rate": 3.077373974208675e-05, + "loss": 5.904424667358398, + "step": 375, + "token_acc": 0.09839418428646998 + }, + { + "epoch": 0.22046320727059512, + "grad_norm": 5.5435753806715065, + "learning_rate": 3.085580304806565e-05, + "loss": 5.814133167266846, + "step": 376, + "token_acc": 0.10243825909607918 + }, + { + "epoch": 0.22104954558780415, + "grad_norm": 4.364383601823679, + "learning_rate": 3.093786635404455e-05, + "loss": 5.871279716491699, + "step": 377, + "token_acc": 0.10116971763000714 + }, + { + "epoch": 0.22163588390501318, + "grad_norm": 8.100806956559703, + "learning_rate": 3.1019929660023445e-05, + "loss": 5.793034553527832, + "step": 378, + "token_acc": 0.10408344137827652 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 6.049077745274884, + "learning_rate": 3.110199296600234e-05, + "loss": 5.801364898681641, + "step": 379, + "token_acc": 0.10839996459445904 + }, + { + "epoch": 0.22280856053943124, + "grad_norm": 3.386272956144011, + "learning_rate": 3.118405627198124e-05, + "loss": 5.733031272888184, + "step": 380, + "token_acc": 0.1094655165123261 + }, + { + "epoch": 0.2233948988566403, + "grad_norm": 9.456867599845966, + "learning_rate": 3.126611957796014e-05, + "loss": 5.719071388244629, + "step": 381, + "token_acc": 0.1083124224094667 + }, + { + "epoch": 0.22398123717384932, + "grad_norm": 4.330384081141098, + "learning_rate": 3.134818288393904e-05, + "loss": 5.669560432434082, + "step": 382, + "token_acc": 0.11199709669339958 + }, + { + "epoch": 0.22456757549105835, + "grad_norm": 9.978848251326847, + "learning_rate": 3.1430246189917935e-05, + "loss": 5.727608680725098, + "step": 383, + "token_acc": 0.10876805574724167 + }, + { + "epoch": 0.22515391380826738, + "grad_norm": 6.950342593424276, + "learning_rate": 3.151230949589683e-05, + "loss": 5.697427749633789, + "step": 384, + "token_acc": 0.11248477563676398 + }, + { + "epoch": 0.2257402521254764, + "grad_norm": 6.479830826986394, + "learning_rate": 3.159437280187573e-05, + "loss": 5.694333076477051, + "step": 385, + "token_acc": 0.10902044049902239 + }, + { + "epoch": 0.22632659044268544, + "grad_norm": 6.388302184794283, + "learning_rate": 3.167643610785463e-05, + "loss": 5.715522766113281, + "step": 386, + "token_acc": 0.11104647859506346 + }, + { + "epoch": 0.22691292875989447, + "grad_norm": 4.9447274118658235, + "learning_rate": 3.1758499413833527e-05, + "loss": 5.690276145935059, + "step": 387, + "token_acc": 0.11082315420356849 + }, + { + "epoch": 0.2274992670771035, + "grad_norm": 6.825288287594065, + "learning_rate": 3.1840562719812425e-05, + "loss": 5.664138317108154, + "step": 388, + "token_acc": 0.11341331572703575 + }, + { + "epoch": 0.22808560539431252, + "grad_norm": 5.728998858379683, + "learning_rate": 3.192262602579132e-05, + "loss": 5.697914123535156, + "step": 389, + "token_acc": 0.1109388931511394 + }, + { + "epoch": 0.22867194371152155, + "grad_norm": 3.3265138383471538, + "learning_rate": 3.200468933177022e-05, + "loss": 5.575888156890869, + "step": 390, + "token_acc": 0.1199025003352537 + }, + { + "epoch": 0.22925828202873058, + "grad_norm": 10.459675562249998, + "learning_rate": 3.208675263774912e-05, + "loss": 5.606986045837402, + "step": 391, + "token_acc": 0.12034323599185881 + }, + { + "epoch": 0.2298446203459396, + "grad_norm": 4.868130814850591, + "learning_rate": 3.2168815943728016e-05, + "loss": 5.64790678024292, + "step": 392, + "token_acc": 0.11590617389799782 + }, + { + "epoch": 0.23043095866314864, + "grad_norm": 12.768720719385545, + "learning_rate": 3.2250879249706914e-05, + "loss": 5.650926113128662, + "step": 393, + "token_acc": 0.110319839414737 + }, + { + "epoch": 0.23101729698035767, + "grad_norm": 7.090300637662734, + "learning_rate": 3.233294255568581e-05, + "loss": 5.5357208251953125, + "step": 394, + "token_acc": 0.11874065864793143 + }, + { + "epoch": 0.2316036352975667, + "grad_norm": 11.974542948234529, + "learning_rate": 3.241500586166471e-05, + "loss": 5.676987648010254, + "step": 395, + "token_acc": 0.11116802605828967 + }, + { + "epoch": 0.23218997361477572, + "grad_norm": 9.550487573974742, + "learning_rate": 3.249706916764361e-05, + "loss": 5.605356216430664, + "step": 396, + "token_acc": 0.11959896507115136 + }, + { + "epoch": 0.23277631193198475, + "grad_norm": 7.752182120700802, + "learning_rate": 3.2579132473622506e-05, + "loss": 5.620063781738281, + "step": 397, + "token_acc": 0.1139806145100941 + }, + { + "epoch": 0.23336265024919378, + "grad_norm": 6.901103737347299, + "learning_rate": 3.2661195779601404e-05, + "loss": 5.656468391418457, + "step": 398, + "token_acc": 0.10943400215099794 + }, + { + "epoch": 0.2339489885664028, + "grad_norm": 4.86173992333123, + "learning_rate": 3.27432590855803e-05, + "loss": 5.540590286254883, + "step": 399, + "token_acc": 0.12205815544498297 + }, + { + "epoch": 0.23453532688361184, + "grad_norm": 4.37658279856578, + "learning_rate": 3.28253223915592e-05, + "loss": 5.512296199798584, + "step": 400, + "token_acc": 0.12221467534382174 + }, + { + "epoch": 0.23512166520082087, + "grad_norm": 7.610321730055148, + "learning_rate": 3.29073856975381e-05, + "loss": 5.504565715789795, + "step": 401, + "token_acc": 0.12283261098064342 + }, + { + "epoch": 0.2357080035180299, + "grad_norm": 4.834560929092718, + "learning_rate": 3.2989449003516996e-05, + "loss": 5.471881866455078, + "step": 402, + "token_acc": 0.12622801565050978 + }, + { + "epoch": 0.23629434183523892, + "grad_norm": 6.4863374990144305, + "learning_rate": 3.3071512309495894e-05, + "loss": 5.448960304260254, + "step": 403, + "token_acc": 0.1272094616334707 + }, + { + "epoch": 0.23688068015244795, + "grad_norm": 6.376514340996239, + "learning_rate": 3.315357561547479e-05, + "loss": 5.4967474937438965, + "step": 404, + "token_acc": 0.12428196786789786 + }, + { + "epoch": 0.23746701846965698, + "grad_norm": 4.603992644435482, + "learning_rate": 3.323563892145369e-05, + "loss": 5.3730878829956055, + "step": 405, + "token_acc": 0.13343197910041368 + }, + { + "epoch": 0.238053356786866, + "grad_norm": 7.644842595241106, + "learning_rate": 3.331770222743259e-05, + "loss": 5.453896999359131, + "step": 406, + "token_acc": 0.12670379307884033 + }, + { + "epoch": 0.23863969510407504, + "grad_norm": 4.423087088557715, + "learning_rate": 3.3399765533411486e-05, + "loss": 5.409916877746582, + "step": 407, + "token_acc": 0.1314650683821998 + }, + { + "epoch": 0.23922603342128407, + "grad_norm": 6.152294869594081, + "learning_rate": 3.3481828839390384e-05, + "loss": 5.451051712036133, + "step": 408, + "token_acc": 0.12499536573610648 + }, + { + "epoch": 0.23981237173849312, + "grad_norm": 5.832180776655737, + "learning_rate": 3.356389214536928e-05, + "loss": 5.37488317489624, + "step": 409, + "token_acc": 0.13363264297599303 + }, + { + "epoch": 0.24039871005570215, + "grad_norm": 4.519219135834628, + "learning_rate": 3.364595545134818e-05, + "loss": 5.371809959411621, + "step": 410, + "token_acc": 0.13427762499290158 + }, + { + "epoch": 0.24098504837291118, + "grad_norm": 3.9662349757634425, + "learning_rate": 3.372801875732708e-05, + "loss": 5.233335494995117, + "step": 411, + "token_acc": 0.14384437187752938 + }, + { + "epoch": 0.2415713866901202, + "grad_norm": 6.175410218180845, + "learning_rate": 3.3810082063305976e-05, + "loss": 5.339487075805664, + "step": 412, + "token_acc": 0.1353152362682007 + }, + { + "epoch": 0.24215772500732924, + "grad_norm": 5.734904973769652, + "learning_rate": 3.3892145369284874e-05, + "loss": 5.325626850128174, + "step": 413, + "token_acc": 0.13544106760771907 + }, + { + "epoch": 0.24274406332453827, + "grad_norm": 6.748808978080904, + "learning_rate": 3.397420867526377e-05, + "loss": 5.316834449768066, + "step": 414, + "token_acc": 0.13883175702324893 + }, + { + "epoch": 0.2433304016417473, + "grad_norm": 3.733223510404855, + "learning_rate": 3.405627198124267e-05, + "loss": 5.31654167175293, + "step": 415, + "token_acc": 0.1347342103381333 + }, + { + "epoch": 0.24391673995895632, + "grad_norm": 5.68343905444421, + "learning_rate": 3.413833528722157e-05, + "loss": 5.376072883605957, + "step": 416, + "token_acc": 0.12966109854763955 + }, + { + "epoch": 0.24450307827616535, + "grad_norm": 5.587575454541941, + "learning_rate": 3.4220398593200466e-05, + "loss": 5.255402088165283, + "step": 417, + "token_acc": 0.14047008413348763 + }, + { + "epoch": 0.24508941659337438, + "grad_norm": 4.8150777335325765, + "learning_rate": 3.4302461899179364e-05, + "loss": 5.2430009841918945, + "step": 418, + "token_acc": 0.14188434890249677 + }, + { + "epoch": 0.2456757549105834, + "grad_norm": 6.71383271808846, + "learning_rate": 3.438452520515826e-05, + "loss": 5.282718658447266, + "step": 419, + "token_acc": 0.13470143134912269 + }, + { + "epoch": 0.24626209322779244, + "grad_norm": 8.138263407057881, + "learning_rate": 3.446658851113716e-05, + "loss": 5.304866790771484, + "step": 420, + "token_acc": 0.13565958635386735 + }, + { + "epoch": 0.24684843154500147, + "grad_norm": 5.212040391716807, + "learning_rate": 3.454865181711606e-05, + "loss": 5.295950889587402, + "step": 421, + "token_acc": 0.13749651700378462 + }, + { + "epoch": 0.2474347698622105, + "grad_norm": 7.165890640559476, + "learning_rate": 3.4630715123094956e-05, + "loss": 5.2838006019592285, + "step": 422, + "token_acc": 0.1374077461069823 + }, + { + "epoch": 0.24802110817941952, + "grad_norm": 4.556210119101287, + "learning_rate": 3.4712778429073854e-05, + "loss": 5.276226997375488, + "step": 423, + "token_acc": 0.13886151588216794 + }, + { + "epoch": 0.24860744649662855, + "grad_norm": 6.7196136739153465, + "learning_rate": 3.479484173505275e-05, + "loss": 5.141275405883789, + "step": 424, + "token_acc": 0.14905236454897539 + }, + { + "epoch": 0.24919378481383758, + "grad_norm": 4.5244675263915655, + "learning_rate": 3.487690504103165e-05, + "loss": 5.1395721435546875, + "step": 425, + "token_acc": 0.1501625551889078 + }, + { + "epoch": 0.2497801231310466, + "grad_norm": 4.305755272978437, + "learning_rate": 3.495896834701055e-05, + "loss": 5.161777496337891, + "step": 426, + "token_acc": 0.14801628329407618 + }, + { + "epoch": 0.25036646144825564, + "grad_norm": 5.658256696784875, + "learning_rate": 3.5041031652989446e-05, + "loss": 5.184259414672852, + "step": 427, + "token_acc": 0.14238316969091688 + }, + { + "epoch": 0.2509527997654647, + "grad_norm": 4.972662889303137, + "learning_rate": 3.5123094958968344e-05, + "loss": 5.185224533081055, + "step": 428, + "token_acc": 0.14612326306684267 + }, + { + "epoch": 0.2515391380826737, + "grad_norm": 5.705291829231124, + "learning_rate": 3.520515826494724e-05, + "loss": 5.1912007331848145, + "step": 429, + "token_acc": 0.1475584146309046 + }, + { + "epoch": 0.25212547639988275, + "grad_norm": 6.659758070748966, + "learning_rate": 3.528722157092614e-05, + "loss": 5.212825775146484, + "step": 430, + "token_acc": 0.1454485756755106 + }, + { + "epoch": 0.25271181471709175, + "grad_norm": 5.268090333457292, + "learning_rate": 3.536928487690503e-05, + "loss": 5.073171615600586, + "step": 431, + "token_acc": 0.15389058861718796 + }, + { + "epoch": 0.2532981530343008, + "grad_norm": 5.398514606098428, + "learning_rate": 3.5451348182883936e-05, + "loss": 5.135335922241211, + "step": 432, + "token_acc": 0.14954449788394764 + }, + { + "epoch": 0.2538844913515098, + "grad_norm": 6.6646015579131195, + "learning_rate": 3.5533411488862834e-05, + "loss": 5.15054178237915, + "step": 433, + "token_acc": 0.14666734936828765 + }, + { + "epoch": 0.25447082966871887, + "grad_norm": 3.559878286621562, + "learning_rate": 3.561547479484173e-05, + "loss": 5.091745376586914, + "step": 434, + "token_acc": 0.15107027747548854 + }, + { + "epoch": 0.25505716798592787, + "grad_norm": 8.224332800984234, + "learning_rate": 3.569753810082063e-05, + "loss": 5.09508752822876, + "step": 435, + "token_acc": 0.14848039694076193 + }, + { + "epoch": 0.2556435063031369, + "grad_norm": 4.4820530093887765, + "learning_rate": 3.577960140679953e-05, + "loss": 5.1070709228515625, + "step": 436, + "token_acc": 0.14960491990600006 + }, + { + "epoch": 0.2562298446203459, + "grad_norm": 5.916449641785716, + "learning_rate": 3.5861664712778426e-05, + "loss": 5.155655860900879, + "step": 437, + "token_acc": 0.14843973358464194 + }, + { + "epoch": 0.256816182937555, + "grad_norm": 5.6588710286094654, + "learning_rate": 3.5943728018757324e-05, + "loss": 5.150594234466553, + "step": 438, + "token_acc": 0.14711758852548973 + }, + { + "epoch": 0.257402521254764, + "grad_norm": 5.807443482602914, + "learning_rate": 3.602579132473622e-05, + "loss": 5.041116714477539, + "step": 439, + "token_acc": 0.15865883257256277 + }, + { + "epoch": 0.25798885957197304, + "grad_norm": 3.066935402216836, + "learning_rate": 3.610785463071511e-05, + "loss": 5.028489589691162, + "step": 440, + "token_acc": 0.1578670890679425 + }, + { + "epoch": 0.25857519788918204, + "grad_norm": 7.454903739282551, + "learning_rate": 3.618991793669402e-05, + "loss": 5.084744930267334, + "step": 441, + "token_acc": 0.15124455327694783 + }, + { + "epoch": 0.2591615362063911, + "grad_norm": 4.855878422769349, + "learning_rate": 3.6271981242672916e-05, + "loss": 5.067684173583984, + "step": 442, + "token_acc": 0.15359699438766833 + }, + { + "epoch": 0.2597478745236001, + "grad_norm": 5.250694288037539, + "learning_rate": 3.6354044548651814e-05, + "loss": 5.074548244476318, + "step": 443, + "token_acc": 0.14843909845686656 + }, + { + "epoch": 0.26033421284080915, + "grad_norm": 6.064142847715136, + "learning_rate": 3.643610785463071e-05, + "loss": 5.060888290405273, + "step": 444, + "token_acc": 0.15394387372898669 + }, + { + "epoch": 0.26092055115801815, + "grad_norm": 4.8974284209180645, + "learning_rate": 3.651817116060961e-05, + "loss": 5.02075719833374, + "step": 445, + "token_acc": 0.15538345224901023 + }, + { + "epoch": 0.2615068894752272, + "grad_norm": 4.8504137942003664, + "learning_rate": 3.660023446658851e-05, + "loss": 5.036099433898926, + "step": 446, + "token_acc": 0.15459575586157864 + }, + { + "epoch": 0.2620932277924362, + "grad_norm": 2.9613863602936634, + "learning_rate": 3.6682297772567406e-05, + "loss": 4.953916549682617, + "step": 447, + "token_acc": 0.1619095717957014 + }, + { + "epoch": 0.26267956610964527, + "grad_norm": 7.017908950738959, + "learning_rate": 3.676436107854631e-05, + "loss": 5.088875770568848, + "step": 448, + "token_acc": 0.14751002443953068 + }, + { + "epoch": 0.26326590442685427, + "grad_norm": 4.387576226004965, + "learning_rate": 3.68464243845252e-05, + "loss": 4.966022491455078, + "step": 449, + "token_acc": 0.16129543176731456 + }, + { + "epoch": 0.2638522427440633, + "grad_norm": 5.305816905622403, + "learning_rate": 3.69284876905041e-05, + "loss": 5.052915096282959, + "step": 450, + "token_acc": 0.15338949940870916 + }, + { + "epoch": 0.2644385810612723, + "grad_norm": 6.035451566160448, + "learning_rate": 3.7010550996483e-05, + "loss": 5.048089027404785, + "step": 451, + "token_acc": 0.15400516375832743 + }, + { + "epoch": 0.2650249193784814, + "grad_norm": 3.5121201704922425, + "learning_rate": 3.7092614302461896e-05, + "loss": 5.022364139556885, + "step": 452, + "token_acc": 0.15661915875225468 + }, + { + "epoch": 0.26561125769569044, + "grad_norm": 6.088430150586162, + "learning_rate": 3.7174677608440794e-05, + "loss": 4.956811904907227, + "step": 453, + "token_acc": 0.1606962224884909 + }, + { + "epoch": 0.26619759601289944, + "grad_norm": 3.9191383038118106, + "learning_rate": 3.725674091441969e-05, + "loss": 4.872303009033203, + "step": 454, + "token_acc": 0.16753443106820165 + }, + { + "epoch": 0.2667839343301085, + "grad_norm": 6.961416911227017, + "learning_rate": 3.733880422039859e-05, + "loss": 5.049225330352783, + "step": 455, + "token_acc": 0.1523634816120787 + }, + { + "epoch": 0.2673702726473175, + "grad_norm": 4.778722044033276, + "learning_rate": 3.742086752637749e-05, + "loss": 4.90943717956543, + "step": 456, + "token_acc": 0.16553806617410857 + }, + { + "epoch": 0.26795661096452655, + "grad_norm": 4.692481899166697, + "learning_rate": 3.750293083235639e-05, + "loss": 4.986898422241211, + "step": 457, + "token_acc": 0.16026830837085362 + }, + { + "epoch": 0.26854294928173555, + "grad_norm": 4.580578061397562, + "learning_rate": 3.7584994138335283e-05, + "loss": 4.964858055114746, + "step": 458, + "token_acc": 0.15767877457075585 + }, + { + "epoch": 0.2691292875989446, + "grad_norm": 3.8708301193200363, + "learning_rate": 3.766705744431418e-05, + "loss": 4.976293087005615, + "step": 459, + "token_acc": 0.15764734617093534 + }, + { + "epoch": 0.2697156259161536, + "grad_norm": 5.22549963399129, + "learning_rate": 3.774912075029308e-05, + "loss": 4.969941139221191, + "step": 460, + "token_acc": 0.15727248148494413 + }, + { + "epoch": 0.27030196423336267, + "grad_norm": 3.9221587888364593, + "learning_rate": 3.783118405627198e-05, + "loss": 4.986678600311279, + "step": 461, + "token_acc": 0.1555507480943758 + }, + { + "epoch": 0.27088830255057167, + "grad_norm": 5.76792278190766, + "learning_rate": 3.7913247362250875e-05, + "loss": 4.886974811553955, + "step": 462, + "token_acc": 0.16748608313640906 + }, + { + "epoch": 0.2714746408677807, + "grad_norm": 2.927549106959559, + "learning_rate": 3.799531066822977e-05, + "loss": 4.819583415985107, + "step": 463, + "token_acc": 0.1731849634498167 + }, + { + "epoch": 0.2720609791849897, + "grad_norm": 4.311879057406091, + "learning_rate": 3.807737397420867e-05, + "loss": 4.862274646759033, + "step": 464, + "token_acc": 0.16724169814809922 + }, + { + "epoch": 0.2726473175021988, + "grad_norm": 7.9268557112795035, + "learning_rate": 3.815943728018757e-05, + "loss": 4.993745803833008, + "step": 465, + "token_acc": 0.15440605642022304 + }, + { + "epoch": 0.2732336558194078, + "grad_norm": 5.174726235951455, + "learning_rate": 3.8241500586166474e-05, + "loss": 4.876766204833984, + "step": 466, + "token_acc": 0.16630642965378786 + }, + { + "epoch": 0.27381999413661684, + "grad_norm": 5.324514735526076, + "learning_rate": 3.8323563892145365e-05, + "loss": 4.9098286628723145, + "step": 467, + "token_acc": 0.1622337877160335 + }, + { + "epoch": 0.27440633245382584, + "grad_norm": 3.8230161864156154, + "learning_rate": 3.840562719812426e-05, + "loss": 4.8760504722595215, + "step": 468, + "token_acc": 0.1670110063483716 + }, + { + "epoch": 0.2749926707710349, + "grad_norm": 6.531581457449913, + "learning_rate": 3.848769050410316e-05, + "loss": 4.856128215789795, + "step": 469, + "token_acc": 0.16428578978521902 + }, + { + "epoch": 0.2755790090882439, + "grad_norm": 3.8723758469892884, + "learning_rate": 3.856975381008206e-05, + "loss": 4.870810508728027, + "step": 470, + "token_acc": 0.16378886984870017 + }, + { + "epoch": 0.27616534740545295, + "grad_norm": 6.060564796565729, + "learning_rate": 3.865181711606096e-05, + "loss": 4.866532325744629, + "step": 471, + "token_acc": 0.16861252224979537 + }, + { + "epoch": 0.27675168572266196, + "grad_norm": 4.158868558541277, + "learning_rate": 3.8733880422039855e-05, + "loss": 4.863700866699219, + "step": 472, + "token_acc": 0.16568546753942434 + }, + { + "epoch": 0.277338024039871, + "grad_norm": 6.890178880254814, + "learning_rate": 3.881594372801875e-05, + "loss": 4.900069236755371, + "step": 473, + "token_acc": 0.16240871789593653 + }, + { + "epoch": 0.27792436235708, + "grad_norm": 6.075839627985804, + "learning_rate": 3.889800703399766e-05, + "loss": 4.827624320983887, + "step": 474, + "token_acc": 0.1690576413176927 + }, + { + "epoch": 0.27851070067428907, + "grad_norm": 3.7536917237957907, + "learning_rate": 3.8980070339976556e-05, + "loss": 4.7210211753845215, + "step": 475, + "token_acc": 0.1784072418205659 + }, + { + "epoch": 0.27909703899149807, + "grad_norm": 7.846119632939475, + "learning_rate": 3.906213364595545e-05, + "loss": 4.902039527893066, + "step": 476, + "token_acc": 0.1612549185392345 + }, + { + "epoch": 0.2796833773087071, + "grad_norm": 4.604159514257903, + "learning_rate": 3.9144196951934345e-05, + "loss": 4.826611518859863, + "step": 477, + "token_acc": 0.16720481964997705 + }, + { + "epoch": 0.2802697156259161, + "grad_norm": 6.571972386929722, + "learning_rate": 3.922626025791324e-05, + "loss": 4.8582658767700195, + "step": 478, + "token_acc": 0.17041208929617874 + }, + { + "epoch": 0.2808560539431252, + "grad_norm": 4.595181361235622, + "learning_rate": 3.930832356389214e-05, + "loss": 4.912710666656494, + "step": 479, + "token_acc": 0.16197342217131086 + }, + { + "epoch": 0.28144239226033424, + "grad_norm": 4.940192232931624, + "learning_rate": 3.939038686987104e-05, + "loss": 4.774998664855957, + "step": 480, + "token_acc": 0.17084471156987 + }, + { + "epoch": 0.28202873057754324, + "grad_norm": 5.112579533634725, + "learning_rate": 3.947245017584994e-05, + "loss": 4.776060581207275, + "step": 481, + "token_acc": 0.1709170510515206 + }, + { + "epoch": 0.2826150688947523, + "grad_norm": 5.966988350355773, + "learning_rate": 3.9554513481828835e-05, + "loss": 4.813092231750488, + "step": 482, + "token_acc": 0.16952886977239423 + }, + { + "epoch": 0.2832014072119613, + "grad_norm": 5.007591138618022, + "learning_rate": 3.963657678780774e-05, + "loss": 4.825842380523682, + "step": 483, + "token_acc": 0.16916373643173827 + }, + { + "epoch": 0.28378774552917035, + "grad_norm": 5.051898317509681, + "learning_rate": 3.971864009378664e-05, + "loss": 4.794732093811035, + "step": 484, + "token_acc": 0.1692357963091669 + }, + { + "epoch": 0.28437408384637936, + "grad_norm": 4.711763343865885, + "learning_rate": 3.980070339976553e-05, + "loss": 4.788509368896484, + "step": 485, + "token_acc": 0.17035854400166878 + }, + { + "epoch": 0.2849604221635884, + "grad_norm": 3.819670135340478, + "learning_rate": 3.988276670574443e-05, + "loss": 4.711777687072754, + "step": 486, + "token_acc": 0.17715314858534165 + }, + { + "epoch": 0.2855467604807974, + "grad_norm": 5.970907743815383, + "learning_rate": 3.9964830011723325e-05, + "loss": 4.755890846252441, + "step": 487, + "token_acc": 0.17389933774114566 + }, + { + "epoch": 0.28613309879800647, + "grad_norm": 4.508949535865645, + "learning_rate": 4.004689331770222e-05, + "loss": 4.786577224731445, + "step": 488, + "token_acc": 0.17178006323067477 + }, + { + "epoch": 0.28671943711521547, + "grad_norm": 4.7039836772607275, + "learning_rate": 4.012895662368112e-05, + "loss": 4.661779880523682, + "step": 489, + "token_acc": 0.18148803993658003 + }, + { + "epoch": 0.2873057754324245, + "grad_norm": 4.099877234508298, + "learning_rate": 4.021101992966002e-05, + "loss": 4.757030010223389, + "step": 490, + "token_acc": 0.17181380069671318 + }, + { + "epoch": 0.2878921137496335, + "grad_norm": 7.878111921252635, + "learning_rate": 4.029308323563892e-05, + "loss": 4.677789688110352, + "step": 491, + "token_acc": 0.17885842897538257 + }, + { + "epoch": 0.2884784520668426, + "grad_norm": 3.991116361814418, + "learning_rate": 4.037514654161782e-05, + "loss": 4.732224464416504, + "step": 492, + "token_acc": 0.17598534656474948 + }, + { + "epoch": 0.2890647903840516, + "grad_norm": 8.77967181210225, + "learning_rate": 4.045720984759672e-05, + "loss": 4.756728172302246, + "step": 493, + "token_acc": 0.1733485132601125 + }, + { + "epoch": 0.28965112870126064, + "grad_norm": 5.715927704378967, + "learning_rate": 4.053927315357561e-05, + "loss": 4.751640796661377, + "step": 494, + "token_acc": 0.17318351413720964 + }, + { + "epoch": 0.29023746701846964, + "grad_norm": 6.802560523270176, + "learning_rate": 4.062133645955451e-05, + "loss": 4.760959625244141, + "step": 495, + "token_acc": 0.17220478182843135 + }, + { + "epoch": 0.2908238053356787, + "grad_norm": 5.072369745747379, + "learning_rate": 4.070339976553341e-05, + "loss": 4.716753959655762, + "step": 496, + "token_acc": 0.17656106375866967 + }, + { + "epoch": 0.2914101436528877, + "grad_norm": 4.90117491775021, + "learning_rate": 4.0785463071512305e-05, + "loss": 4.752071380615234, + "step": 497, + "token_acc": 0.17300147353210046 + }, + { + "epoch": 0.29199648197009676, + "grad_norm": 5.4957393353858865, + "learning_rate": 4.08675263774912e-05, + "loss": 4.681450843811035, + "step": 498, + "token_acc": 0.1782429481868255 + }, + { + "epoch": 0.29258282028730576, + "grad_norm": 4.95462142170705, + "learning_rate": 4.09495896834701e-05, + "loss": 4.659943580627441, + "step": 499, + "token_acc": 0.18024463667727314 + }, + { + "epoch": 0.2931691586045148, + "grad_norm": 5.955133078258152, + "learning_rate": 4.1031652989449e-05, + "loss": 4.7830939292907715, + "step": 500, + "token_acc": 0.1678144186512178 + }, + { + "epoch": 0.2937554969217238, + "grad_norm": 6.031623149406746, + "learning_rate": 4.1113716295427904e-05, + "loss": 4.656826972961426, + "step": 501, + "token_acc": 0.18212388155719533 + }, + { + "epoch": 0.29434183523893287, + "grad_norm": 3.9467654836948056, + "learning_rate": 4.11957796014068e-05, + "loss": 4.635341644287109, + "step": 502, + "token_acc": 0.18288904750082755 + }, + { + "epoch": 0.29492817355614187, + "grad_norm": 5.270988018328669, + "learning_rate": 4.127784290738569e-05, + "loss": 4.578634262084961, + "step": 503, + "token_acc": 0.1883495461407736 + }, + { + "epoch": 0.2955145118733509, + "grad_norm": 4.955821900675812, + "learning_rate": 4.135990621336459e-05, + "loss": 4.6846466064453125, + "step": 504, + "token_acc": 0.17669373603992516 + }, + { + "epoch": 0.2961008501905599, + "grad_norm": 4.350351258889933, + "learning_rate": 4.144196951934349e-05, + "loss": 4.686916351318359, + "step": 505, + "token_acc": 0.17451957112149413 + }, + { + "epoch": 0.296687188507769, + "grad_norm": 6.779972490933298, + "learning_rate": 4.152403282532239e-05, + "loss": 4.664619445800781, + "step": 506, + "token_acc": 0.1783146322912501 + }, + { + "epoch": 0.297273526824978, + "grad_norm": 2.516193002793484, + "learning_rate": 4.1606096131301285e-05, + "loss": 4.657482147216797, + "step": 507, + "token_acc": 0.1795607850559665 + }, + { + "epoch": 0.29785986514218704, + "grad_norm": 7.917869431286497, + "learning_rate": 4.168815943728018e-05, + "loss": 4.618627548217773, + "step": 508, + "token_acc": 0.18406706493287173 + }, + { + "epoch": 0.2984462034593961, + "grad_norm": 5.401828678481543, + "learning_rate": 4.177022274325909e-05, + "loss": 4.700881004333496, + "step": 509, + "token_acc": 0.17571589831333073 + }, + { + "epoch": 0.2990325417766051, + "grad_norm": 5.612374293805915, + "learning_rate": 4.1852286049237985e-05, + "loss": 4.602940559387207, + "step": 510, + "token_acc": 0.1833063605970912 + }, + { + "epoch": 0.29961888009381415, + "grad_norm": 4.44346370741346, + "learning_rate": 4.1934349355216877e-05, + "loss": 4.622890472412109, + "step": 511, + "token_acc": 0.18342795420628047 + }, + { + "epoch": 0.30020521841102316, + "grad_norm": 4.394793460960435, + "learning_rate": 4.2016412661195775e-05, + "loss": 4.701310157775879, + "step": 512, + "token_acc": 0.17294349686525656 + }, + { + "epoch": 0.3007915567282322, + "grad_norm": 5.12559901698279, + "learning_rate": 4.209847596717467e-05, + "loss": 4.669729232788086, + "step": 513, + "token_acc": 0.17563004865451648 + }, + { + "epoch": 0.3013778950454412, + "grad_norm": 5.981152478181236, + "learning_rate": 4.218053927315357e-05, + "loss": 4.613595008850098, + "step": 514, + "token_acc": 0.18036203514838425 + }, + { + "epoch": 0.30196423336265027, + "grad_norm": 5.965352563229646, + "learning_rate": 4.226260257913247e-05, + "loss": 4.624350070953369, + "step": 515, + "token_acc": 0.182394285055882 + }, + { + "epoch": 0.30255057167985927, + "grad_norm": 5.374282132665988, + "learning_rate": 4.2344665885111366e-05, + "loss": 4.681499481201172, + "step": 516, + "token_acc": 0.1765534802978332 + }, + { + "epoch": 0.3031369099970683, + "grad_norm": 5.051166520187232, + "learning_rate": 4.2426729191090264e-05, + "loss": 4.583155632019043, + "step": 517, + "token_acc": 0.1848408302787974 + }, + { + "epoch": 0.3037232483142773, + "grad_norm": 4.354903076724776, + "learning_rate": 4.250879249706917e-05, + "loss": 4.5662055015563965, + "step": 518, + "token_acc": 0.18501617616364016 + }, + { + "epoch": 0.3043095866314864, + "grad_norm": 7.254814879247495, + "learning_rate": 4.259085580304807e-05, + "loss": 4.63272762298584, + "step": 519, + "token_acc": 0.1814322888287 + }, + { + "epoch": 0.3048959249486954, + "grad_norm": 2.8344317826714165, + "learning_rate": 4.267291910902696e-05, + "loss": 4.566251277923584, + "step": 520, + "token_acc": 0.18252042917412659 + }, + { + "epoch": 0.30548226326590444, + "grad_norm": 5.286420691375452, + "learning_rate": 4.2754982415005856e-05, + "loss": 4.647183895111084, + "step": 521, + "token_acc": 0.1760656428817472 + }, + { + "epoch": 0.30606860158311344, + "grad_norm": 5.141743028250384, + "learning_rate": 4.2837045720984754e-05, + "loss": 4.534359931945801, + "step": 522, + "token_acc": 0.18841153549645742 + }, + { + "epoch": 0.3066549399003225, + "grad_norm": 6.712464526146822, + "learning_rate": 4.291910902696365e-05, + "loss": 4.575827598571777, + "step": 523, + "token_acc": 0.18434154351395732 + }, + { + "epoch": 0.3072412782175315, + "grad_norm": 4.5108593646587565, + "learning_rate": 4.300117233294255e-05, + "loss": 4.6136322021484375, + "step": 524, + "token_acc": 0.17909760470212055 + }, + { + "epoch": 0.30782761653474056, + "grad_norm": 6.074280889169607, + "learning_rate": 4.308323563892145e-05, + "loss": 4.533474922180176, + "step": 525, + "token_acc": 0.18856864834808396 + }, + { + "epoch": 0.30841395485194956, + "grad_norm": 5.075177379537212, + "learning_rate": 4.3165298944900346e-05, + "loss": 4.533350944519043, + "step": 526, + "token_acc": 0.18758292288398976 + }, + { + "epoch": 0.3090002931691586, + "grad_norm": 7.125022653072245, + "learning_rate": 4.324736225087925e-05, + "loss": 4.567195892333984, + "step": 527, + "token_acc": 0.18469439295092674 + }, + { + "epoch": 0.3095866314863676, + "grad_norm": 4.373813287264896, + "learning_rate": 4.332942555685815e-05, + "loss": 4.537093162536621, + "step": 528, + "token_acc": 0.18826839480491772 + }, + { + "epoch": 0.31017296980357667, + "grad_norm": 6.839596206891016, + "learning_rate": 4.341148886283704e-05, + "loss": 4.5922441482543945, + "step": 529, + "token_acc": 0.18332998719777224 + }, + { + "epoch": 0.31075930812078567, + "grad_norm": 4.636625620323953, + "learning_rate": 4.349355216881594e-05, + "loss": 4.512372016906738, + "step": 530, + "token_acc": 0.189996297527451 + }, + { + "epoch": 0.3113456464379947, + "grad_norm": 6.939024922502758, + "learning_rate": 4.3575615474794836e-05, + "loss": 4.534167289733887, + "step": 531, + "token_acc": 0.18478207539455394 + }, + { + "epoch": 0.31193198475520373, + "grad_norm": 4.9190237347534485, + "learning_rate": 4.3657678780773734e-05, + "loss": 4.477829933166504, + "step": 532, + "token_acc": 0.19221272755661292 + }, + { + "epoch": 0.3125183230724128, + "grad_norm": 5.385586585954992, + "learning_rate": 4.373974208675263e-05, + "loss": 4.456300258636475, + "step": 533, + "token_acc": 0.19495484514091363 + }, + { + "epoch": 0.3131046613896218, + "grad_norm": 5.711207057589939, + "learning_rate": 4.382180539273153e-05, + "loss": 4.460087776184082, + "step": 534, + "token_acc": 0.1928616727257543 + }, + { + "epoch": 0.31369099970683084, + "grad_norm": 5.8989313206787575, + "learning_rate": 4.390386869871043e-05, + "loss": 4.484177112579346, + "step": 535, + "token_acc": 0.19025959854577973 + }, + { + "epoch": 0.3142773380240399, + "grad_norm": 5.239531672071058, + "learning_rate": 4.398593200468933e-05, + "loss": 4.5287322998046875, + "step": 536, + "token_acc": 0.1855750170686579 + }, + { + "epoch": 0.3148636763412489, + "grad_norm": 3.5463464554853017, + "learning_rate": 4.406799531066823e-05, + "loss": 4.522578239440918, + "step": 537, + "token_acc": 0.18565694349335923 + }, + { + "epoch": 0.31545001465845796, + "grad_norm": 6.604086194843378, + "learning_rate": 4.415005861664712e-05, + "loss": 4.484546661376953, + "step": 538, + "token_acc": 0.18996912184597098 + }, + { + "epoch": 0.31603635297566696, + "grad_norm": 4.745285705276692, + "learning_rate": 4.423212192262602e-05, + "loss": 4.475639343261719, + "step": 539, + "token_acc": 0.19128830680128026 + }, + { + "epoch": 0.316622691292876, + "grad_norm": 5.8346735401001455, + "learning_rate": 4.431418522860492e-05, + "loss": 4.5008158683776855, + "step": 540, + "token_acc": 0.18859267707981203 + }, + { + "epoch": 0.317209029610085, + "grad_norm": 6.547702205302202, + "learning_rate": 4.4396248534583816e-05, + "loss": 4.480155944824219, + "step": 541, + "token_acc": 0.18974612414742165 + }, + { + "epoch": 0.31779536792729407, + "grad_norm": 5.743626658794127, + "learning_rate": 4.4478311840562714e-05, + "loss": 4.537848472595215, + "step": 542, + "token_acc": 0.18700443453092222 + }, + { + "epoch": 0.31838170624450307, + "grad_norm": 5.314244724847178, + "learning_rate": 4.456037514654161e-05, + "loss": 4.522566795349121, + "step": 543, + "token_acc": 0.18432651737298222 + }, + { + "epoch": 0.3189680445617121, + "grad_norm": 6.041757818787196, + "learning_rate": 4.464243845252052e-05, + "loss": 4.5613322257995605, + "step": 544, + "token_acc": 0.18117134597051096 + }, + { + "epoch": 0.31955438287892113, + "grad_norm": 6.632733398775879, + "learning_rate": 4.4724501758499415e-05, + "loss": 4.425638198852539, + "step": 545, + "token_acc": 0.1966081197365259 + }, + { + "epoch": 0.3201407211961302, + "grad_norm": 4.355380606979025, + "learning_rate": 4.480656506447831e-05, + "loss": 4.463742256164551, + "step": 546, + "token_acc": 0.18941737298914316 + }, + { + "epoch": 0.3207270595133392, + "grad_norm": 7.659555627912166, + "learning_rate": 4.4888628370457204e-05, + "loss": 4.510584831237793, + "step": 547, + "token_acc": 0.18727835778965388 + }, + { + "epoch": 0.32131339783054824, + "grad_norm": 5.773509503241845, + "learning_rate": 4.49706916764361e-05, + "loss": 4.535423278808594, + "step": 548, + "token_acc": 0.1858875768036705 + }, + { + "epoch": 0.32189973614775724, + "grad_norm": 8.836138949870497, + "learning_rate": 4.5052754982415e-05, + "loss": 4.49858283996582, + "step": 549, + "token_acc": 0.18657096585336183 + }, + { + "epoch": 0.3224860744649663, + "grad_norm": 5.27196626205651, + "learning_rate": 4.51348182883939e-05, + "loss": 4.419318199157715, + "step": 550, + "token_acc": 0.1946249987204815 + }, + { + "epoch": 0.3230724127821753, + "grad_norm": 8.282159121276031, + "learning_rate": 4.5216881594372796e-05, + "loss": 4.496129035949707, + "step": 551, + "token_acc": 0.18838829197948492 + }, + { + "epoch": 0.32365875109938436, + "grad_norm": 5.993117978987693, + "learning_rate": 4.5298944900351694e-05, + "loss": 4.461013317108154, + "step": 552, + "token_acc": 0.19080874737371645 + }, + { + "epoch": 0.32424508941659336, + "grad_norm": 3.948394316134522, + "learning_rate": 4.53810082063306e-05, + "loss": 4.479466438293457, + "step": 553, + "token_acc": 0.18935493018214392 + }, + { + "epoch": 0.3248314277338024, + "grad_norm": 7.689630909084427, + "learning_rate": 4.54630715123095e-05, + "loss": 4.415638446807861, + "step": 554, + "token_acc": 0.19777227514533213 + }, + { + "epoch": 0.3254177660510114, + "grad_norm": 7.320774738127758, + "learning_rate": 4.5545134818288395e-05, + "loss": 4.428081512451172, + "step": 555, + "token_acc": 0.1960862680807083 + }, + { + "epoch": 0.32600410436822047, + "grad_norm": 5.862654536263259, + "learning_rate": 4.5627198124267286e-05, + "loss": 4.470279693603516, + "step": 556, + "token_acc": 0.19028585239148738 + }, + { + "epoch": 0.32659044268542947, + "grad_norm": 5.105823546914807, + "learning_rate": 4.5709261430246184e-05, + "loss": 4.5024919509887695, + "step": 557, + "token_acc": 0.18545226403110623 + }, + { + "epoch": 0.32717678100263853, + "grad_norm": 6.028710401444009, + "learning_rate": 4.579132473622508e-05, + "loss": 4.416274070739746, + "step": 558, + "token_acc": 0.19277732950657256 + }, + { + "epoch": 0.32776311931984753, + "grad_norm": 6.3098463850094895, + "learning_rate": 4.587338804220398e-05, + "loss": 4.432454586029053, + "step": 559, + "token_acc": 0.19296357743454878 + }, + { + "epoch": 0.3283494576370566, + "grad_norm": 7.7506743787126995, + "learning_rate": 4.595545134818288e-05, + "loss": 4.461402416229248, + "step": 560, + "token_acc": 0.1898100580759658 + }, + { + "epoch": 0.3289357959542656, + "grad_norm": 6.557894074424091, + "learning_rate": 4.6037514654161776e-05, + "loss": 4.4386186599731445, + "step": 561, + "token_acc": 0.19209386902311817 + }, + { + "epoch": 0.32952213427147464, + "grad_norm": 4.1822001242563545, + "learning_rate": 4.611957796014068e-05, + "loss": 4.367791175842285, + "step": 562, + "token_acc": 0.1983296095052554 + }, + { + "epoch": 0.33010847258868364, + "grad_norm": 8.359275481722483, + "learning_rate": 4.620164126611958e-05, + "loss": 4.379465579986572, + "step": 563, + "token_acc": 0.20002784036130603 + }, + { + "epoch": 0.3306948109058927, + "grad_norm": 4.150650938335347, + "learning_rate": 4.6283704572098477e-05, + "loss": 4.4019670486450195, + "step": 564, + "token_acc": 0.19714070930455738 + }, + { + "epoch": 0.33128114922310176, + "grad_norm": 9.504457061160851, + "learning_rate": 4.636576787807737e-05, + "loss": 4.430449962615967, + "step": 565, + "token_acc": 0.19324560284998296 + }, + { + "epoch": 0.33186748754031076, + "grad_norm": 5.658739697814227, + "learning_rate": 4.6447831184056266e-05, + "loss": 4.451925754547119, + "step": 566, + "token_acc": 0.18910567831583347 + }, + { + "epoch": 0.3324538258575198, + "grad_norm": 8.887793535710317, + "learning_rate": 4.6529894490035164e-05, + "loss": 4.482825756072998, + "step": 567, + "token_acc": 0.18872363186921615 + }, + { + "epoch": 0.3330401641747288, + "grad_norm": 5.937473436789054, + "learning_rate": 4.661195779601406e-05, + "loss": 4.4818830490112305, + "step": 568, + "token_acc": 0.18827303548086388 + }, + { + "epoch": 0.33362650249193787, + "grad_norm": 6.223251974775681, + "learning_rate": 4.669402110199296e-05, + "loss": 4.42918586730957, + "step": 569, + "token_acc": 0.1919210409897335 + }, + { + "epoch": 0.33421284080914687, + "grad_norm": 7.002097879881412, + "learning_rate": 4.6776084407971864e-05, + "loss": 4.486233711242676, + "step": 570, + "token_acc": 0.18689357666161005 + }, + { + "epoch": 0.33479917912635593, + "grad_norm": 5.47675686619503, + "learning_rate": 4.685814771395076e-05, + "loss": 4.3734259605407715, + "step": 571, + "token_acc": 0.19837908801202714 + }, + { + "epoch": 0.33538551744356493, + "grad_norm": 6.0266905736728456, + "learning_rate": 4.694021101992966e-05, + "loss": 4.396583557128906, + "step": 572, + "token_acc": 0.19571154613434238 + }, + { + "epoch": 0.335971855760774, + "grad_norm": 4.658813691796653, + "learning_rate": 4.702227432590856e-05, + "loss": 4.367709159851074, + "step": 573, + "token_acc": 0.19878226157488507 + }, + { + "epoch": 0.336558194077983, + "grad_norm": 9.395882672486985, + "learning_rate": 4.710433763188745e-05, + "loss": 4.435266494750977, + "step": 574, + "token_acc": 0.190687181739285 + }, + { + "epoch": 0.33714453239519204, + "grad_norm": 3.4531224434636214, + "learning_rate": 4.718640093786635e-05, + "loss": 4.372501850128174, + "step": 575, + "token_acc": 0.19843487692040238 + }, + { + "epoch": 0.33773087071240104, + "grad_norm": 8.846575320080282, + "learning_rate": 4.7268464243845246e-05, + "loss": 4.401895523071289, + "step": 576, + "token_acc": 0.194584976189257 + }, + { + "epoch": 0.3383172090296101, + "grad_norm": 6.34762391674699, + "learning_rate": 4.7350527549824144e-05, + "loss": 4.460208892822266, + "step": 577, + "token_acc": 0.19122330644088487 + }, + { + "epoch": 0.3389035473468191, + "grad_norm": 4.431158013832074, + "learning_rate": 4.743259085580304e-05, + "loss": 4.46501350402832, + "step": 578, + "token_acc": 0.18966632660678295 + }, + { + "epoch": 0.33948988566402816, + "grad_norm": 4.882452446056982, + "learning_rate": 4.7514654161781946e-05, + "loss": 4.391036033630371, + "step": 579, + "token_acc": 0.19597744853808838 + }, + { + "epoch": 0.34007622398123716, + "grad_norm": 7.674952597614779, + "learning_rate": 4.7596717467760844e-05, + "loss": 4.347477436065674, + "step": 580, + "token_acc": 0.199366841142713 + }, + { + "epoch": 0.3406625622984462, + "grad_norm": 5.030967460889975, + "learning_rate": 4.767878077373974e-05, + "loss": 4.418288707733154, + "step": 581, + "token_acc": 0.1916214551902067 + }, + { + "epoch": 0.3412489006156552, + "grad_norm": 8.896203828070119, + "learning_rate": 4.7760844079718633e-05, + "loss": 4.416926383972168, + "step": 582, + "token_acc": 0.19323235494166274 + }, + { + "epoch": 0.34183523893286427, + "grad_norm": 4.489306072034783, + "learning_rate": 4.784290738569753e-05, + "loss": 4.426893711090088, + "step": 583, + "token_acc": 0.19094187577937521 + }, + { + "epoch": 0.3424215772500733, + "grad_norm": 9.204048802736533, + "learning_rate": 4.792497069167643e-05, + "loss": 4.451262474060059, + "step": 584, + "token_acc": 0.19010222448275044 + }, + { + "epoch": 0.34300791556728233, + "grad_norm": 6.859814676939098, + "learning_rate": 4.800703399765533e-05, + "loss": 4.46299409866333, + "step": 585, + "token_acc": 0.1906721463748978 + }, + { + "epoch": 0.34359425388449133, + "grad_norm": 7.533827152115777, + "learning_rate": 4.8089097303634225e-05, + "loss": 4.417018890380859, + "step": 586, + "token_acc": 0.19287514915878398 + }, + { + "epoch": 0.3441805922017004, + "grad_norm": 8.172277786659507, + "learning_rate": 4.817116060961312e-05, + "loss": 4.394757270812988, + "step": 587, + "token_acc": 0.194650138726811 + }, + { + "epoch": 0.3447669305189094, + "grad_norm": 6.956038937526932, + "learning_rate": 4.825322391559203e-05, + "loss": 4.359362602233887, + "step": 588, + "token_acc": 0.19735680202209507 + }, + { + "epoch": 0.34535326883611844, + "grad_norm": 5.1733747985879965, + "learning_rate": 4.8335287221570926e-05, + "loss": 4.36881160736084, + "step": 589, + "token_acc": 0.19789089071170593 + }, + { + "epoch": 0.34593960715332744, + "grad_norm": 5.18686725818335, + "learning_rate": 4.8417350527549824e-05, + "loss": 4.352592945098877, + "step": 590, + "token_acc": 0.19756552140773018 + }, + { + "epoch": 0.3465259454705365, + "grad_norm": 4.241528137874688, + "learning_rate": 4.8499413833528715e-05, + "loss": 4.409492492675781, + "step": 591, + "token_acc": 0.19087769867423396 + }, + { + "epoch": 0.34711228378774556, + "grad_norm": 7.187543154768812, + "learning_rate": 4.858147713950761e-05, + "loss": 4.281979084014893, + "step": 592, + "token_acc": 0.20348683013078864 + }, + { + "epoch": 0.34769862210495456, + "grad_norm": 6.731449993214299, + "learning_rate": 4.866354044548651e-05, + "loss": 4.329821586608887, + "step": 593, + "token_acc": 0.19997488869939156 + }, + { + "epoch": 0.3482849604221636, + "grad_norm": 8.403755192686713, + "learning_rate": 4.874560375146541e-05, + "loss": 4.358314514160156, + "step": 594, + "token_acc": 0.19530622213118629 + }, + { + "epoch": 0.3488712987393726, + "grad_norm": 4.528393967952365, + "learning_rate": 4.882766705744431e-05, + "loss": 4.301444053649902, + "step": 595, + "token_acc": 0.2015556768558952 + }, + { + "epoch": 0.34945763705658167, + "grad_norm": 6.966055045560965, + "learning_rate": 4.8909730363423205e-05, + "loss": 4.321412563323975, + "step": 596, + "token_acc": 0.20148697102244575 + }, + { + "epoch": 0.3500439753737907, + "grad_norm": 6.881316858375219, + "learning_rate": 4.899179366940211e-05, + "loss": 4.366338729858398, + "step": 597, + "token_acc": 0.19566463460364056 + }, + { + "epoch": 0.35063031369099973, + "grad_norm": 6.83300977581268, + "learning_rate": 4.907385697538101e-05, + "loss": 4.357544898986816, + "step": 598, + "token_acc": 0.19502726805186804 + }, + { + "epoch": 0.35121665200820873, + "grad_norm": 5.489291699937008, + "learning_rate": 4.9155920281359906e-05, + "loss": 4.353487968444824, + "step": 599, + "token_acc": 0.1956040154089252 + }, + { + "epoch": 0.3518029903254178, + "grad_norm": 5.748467794890245, + "learning_rate": 4.92379835873388e-05, + "loss": 4.32010555267334, + "step": 600, + "token_acc": 0.2004268590420086 + }, + { + "epoch": 0.3523893286426268, + "grad_norm": 6.335321485462031, + "learning_rate": 4.9320046893317695e-05, + "loss": 4.313159942626953, + "step": 601, + "token_acc": 0.19927321984448468 + }, + { + "epoch": 0.35297566695983584, + "grad_norm": 3.7368206365732513, + "learning_rate": 4.940211019929659e-05, + "loss": 4.370244026184082, + "step": 602, + "token_acc": 0.1923649152165671 + }, + { + "epoch": 0.35356200527704484, + "grad_norm": 10.44467872139782, + "learning_rate": 4.948417350527549e-05, + "loss": 4.313765525817871, + "step": 603, + "token_acc": 0.2034349048115739 + }, + { + "epoch": 0.3541483435942539, + "grad_norm": 5.3176314880898525, + "learning_rate": 4.956623681125439e-05, + "loss": 4.321287155151367, + "step": 604, + "token_acc": 0.1991299567368597 + }, + { + "epoch": 0.3547346819114629, + "grad_norm": 6.233879510796788, + "learning_rate": 4.9648300117233294e-05, + "loss": 4.298736572265625, + "step": 605, + "token_acc": 0.2031674786067473 + }, + { + "epoch": 0.35532102022867196, + "grad_norm": 7.207066786392458, + "learning_rate": 4.973036342321219e-05, + "loss": 4.318865776062012, + "step": 606, + "token_acc": 0.2011544449929746 + }, + { + "epoch": 0.35590735854588096, + "grad_norm": 4.997552499651344, + "learning_rate": 4.981242672919109e-05, + "loss": 4.333331108093262, + "step": 607, + "token_acc": 0.20019160321000282 + }, + { + "epoch": 0.35649369686309, + "grad_norm": 6.810892953065391, + "learning_rate": 4.989449003516999e-05, + "loss": 4.368122577667236, + "step": 608, + "token_acc": 0.19231795850302202 + }, + { + "epoch": 0.357080035180299, + "grad_norm": 5.789893708176766, + "learning_rate": 4.997655334114888e-05, + "loss": 4.321147441864014, + "step": 609, + "token_acc": 0.20054164323664614 + }, + { + "epoch": 0.35766637349750807, + "grad_norm": 6.566545651045261, + "learning_rate": 5.005861664712778e-05, + "loss": 4.36564826965332, + "step": 610, + "token_acc": 0.19370404294186577 + }, + { + "epoch": 0.3582527118147171, + "grad_norm": 7.439063988575006, + "learning_rate": 5.0140679953106675e-05, + "loss": 4.338534832000732, + "step": 611, + "token_acc": 0.19844616204690832 + }, + { + "epoch": 0.35883905013192613, + "grad_norm": 6.6669879988847836, + "learning_rate": 5.022274325908557e-05, + "loss": 4.325567722320557, + "step": 612, + "token_acc": 0.20045966886629357 + }, + { + "epoch": 0.35942538844913513, + "grad_norm": 6.687994302475722, + "learning_rate": 5.030480656506447e-05, + "loss": 4.332146644592285, + "step": 613, + "token_acc": 0.20098085616412073 + }, + { + "epoch": 0.3600117267663442, + "grad_norm": 4.4555993465093575, + "learning_rate": 5.0386869871043376e-05, + "loss": 4.381261348724365, + "step": 614, + "token_acc": 0.19281257281930456 + }, + { + "epoch": 0.3605980650835532, + "grad_norm": 6.884530804038673, + "learning_rate": 5.0468933177022274e-05, + "loss": 4.287456035614014, + "step": 615, + "token_acc": 0.20258283154667886 + }, + { + "epoch": 0.36118440340076224, + "grad_norm": 3.9426626333463277, + "learning_rate": 5.055099648300117e-05, + "loss": 4.228626728057861, + "step": 616, + "token_acc": 0.2080618053673082 + }, + { + "epoch": 0.36177074171797124, + "grad_norm": 8.468973017835788, + "learning_rate": 5.063305978898007e-05, + "loss": 4.2621941566467285, + "step": 617, + "token_acc": 0.2061359786583569 + }, + { + "epoch": 0.3623570800351803, + "grad_norm": 4.7226266501157035, + "learning_rate": 5.071512309495896e-05, + "loss": 4.298932075500488, + "step": 618, + "token_acc": 0.2024293547476061 + }, + { + "epoch": 0.3629434183523893, + "grad_norm": 10.247823179573718, + "learning_rate": 5.079718640093786e-05, + "loss": 4.3769917488098145, + "step": 619, + "token_acc": 0.19332201201963772 + }, + { + "epoch": 0.36352975666959836, + "grad_norm": 7.731050579633973, + "learning_rate": 5.087924970691676e-05, + "loss": 4.340371131896973, + "step": 620, + "token_acc": 0.19514768754954323 + }, + { + "epoch": 0.3641160949868074, + "grad_norm": 4.962868099430327, + "learning_rate": 5.0961313012895655e-05, + "loss": 4.297729969024658, + "step": 621, + "token_acc": 0.20349455427657048 + }, + { + "epoch": 0.3647024333040164, + "grad_norm": 10.806352539050675, + "learning_rate": 5.104337631887455e-05, + "loss": 4.274240016937256, + "step": 622, + "token_acc": 0.20278523034523077 + }, + { + "epoch": 0.36528877162122547, + "grad_norm": 6.19575035906615, + "learning_rate": 5.112543962485346e-05, + "loss": 4.358561038970947, + "step": 623, + "token_acc": 0.19831221376490557 + }, + { + "epoch": 0.3658751099384345, + "grad_norm": 11.352687956646522, + "learning_rate": 5.1207502930832356e-05, + "loss": 4.337742805480957, + "step": 624, + "token_acc": 0.19651950879442884 + }, + { + "epoch": 0.36646144825564353, + "grad_norm": 7.693508242233182, + "learning_rate": 5.1289566236811254e-05, + "loss": 4.280763626098633, + "step": 625, + "token_acc": 0.20407976138465936 + }, + { + "epoch": 0.36704778657285253, + "grad_norm": 9.064200943271722, + "learning_rate": 5.137162954279015e-05, + "loss": 4.325865745544434, + "step": 626, + "token_acc": 0.19913288080717417 + }, + { + "epoch": 0.3676341248900616, + "grad_norm": 7.5082242443432365, + "learning_rate": 5.145369284876904e-05, + "loss": 4.3048906326293945, + "step": 627, + "token_acc": 0.20137878828841008 + }, + { + "epoch": 0.3682204632072706, + "grad_norm": 5.117586923589491, + "learning_rate": 5.153575615474794e-05, + "loss": 4.274219512939453, + "step": 628, + "token_acc": 0.20411066283684523 + }, + { + "epoch": 0.36880680152447964, + "grad_norm": 6.3038731305539, + "learning_rate": 5.161781946072684e-05, + "loss": 4.3289899826049805, + "step": 629, + "token_acc": 0.19631616427817716 + }, + { + "epoch": 0.36939313984168864, + "grad_norm": 7.05866879671453, + "learning_rate": 5.169988276670574e-05, + "loss": 4.268229961395264, + "step": 630, + "token_acc": 0.20327354622866284 + }, + { + "epoch": 0.3699794781588977, + "grad_norm": 5.071653610171233, + "learning_rate": 5.1781946072684635e-05, + "loss": 4.28590202331543, + "step": 631, + "token_acc": 0.20130190173080048 + }, + { + "epoch": 0.3705658164761067, + "grad_norm": 8.016254781251437, + "learning_rate": 5.186400937866354e-05, + "loss": 4.256338596343994, + "step": 632, + "token_acc": 0.20127530091078086 + }, + { + "epoch": 0.37115215479331576, + "grad_norm": 5.064988963622655, + "learning_rate": 5.194607268464244e-05, + "loss": 4.262134075164795, + "step": 633, + "token_acc": 0.20192980295889149 + }, + { + "epoch": 0.37173849311052476, + "grad_norm": 6.070005089696539, + "learning_rate": 5.2028135990621335e-05, + "loss": 4.247917175292969, + "step": 634, + "token_acc": 0.20496268898838493 + }, + { + "epoch": 0.3723248314277338, + "grad_norm": 7.035437886556663, + "learning_rate": 5.211019929660023e-05, + "loss": 4.255850791931152, + "step": 635, + "token_acc": 0.20356832027850305 + }, + { + "epoch": 0.3729111697449428, + "grad_norm": 6.117971571153342, + "learning_rate": 5.2192262602579125e-05, + "loss": 4.260165214538574, + "step": 636, + "token_acc": 0.2047510152180155 + }, + { + "epoch": 0.3734975080621519, + "grad_norm": 12.060227109146973, + "learning_rate": 5.227432590855802e-05, + "loss": 4.253174781799316, + "step": 637, + "token_acc": 0.20350842740904004 + }, + { + "epoch": 0.3740838463793609, + "grad_norm": 3.949708660184968, + "learning_rate": 5.235638921453692e-05, + "loss": 4.26906156539917, + "step": 638, + "token_acc": 0.20094818658619423 + }, + { + "epoch": 0.37467018469656993, + "grad_norm": 8.524892019361486, + "learning_rate": 5.243845252051582e-05, + "loss": 4.257997989654541, + "step": 639, + "token_acc": 0.20150783862929297 + }, + { + "epoch": 0.37525652301377893, + "grad_norm": 7.325621817381045, + "learning_rate": 5.252051582649472e-05, + "loss": 4.324463367462158, + "step": 640, + "token_acc": 0.1951068522124921 + }, + { + "epoch": 0.375842861330988, + "grad_norm": 6.6332324928320485, + "learning_rate": 5.260257913247362e-05, + "loss": 4.255081653594971, + "step": 641, + "token_acc": 0.20332773542111807 + }, + { + "epoch": 0.376429199648197, + "grad_norm": 7.372021097396333, + "learning_rate": 5.268464243845252e-05, + "loss": 4.265812397003174, + "step": 642, + "token_acc": 0.20216091155635685 + }, + { + "epoch": 0.37701553796540604, + "grad_norm": 4.968326969691676, + "learning_rate": 5.276670574443142e-05, + "loss": 4.176780700683594, + "step": 643, + "token_acc": 0.21011885574053468 + }, + { + "epoch": 0.37760187628261505, + "grad_norm": 5.468141668006452, + "learning_rate": 5.2848769050410315e-05, + "loss": 4.208514213562012, + "step": 644, + "token_acc": 0.2071975231073194 + }, + { + "epoch": 0.3781882145998241, + "grad_norm": 5.699032656341187, + "learning_rate": 5.2930832356389206e-05, + "loss": 4.256397247314453, + "step": 645, + "token_acc": 0.20213619130941965 + }, + { + "epoch": 0.3787745529170331, + "grad_norm": 7.695058660013026, + "learning_rate": 5.3012895662368104e-05, + "loss": 4.333415508270264, + "step": 646, + "token_acc": 0.19477001440625644 + }, + { + "epoch": 0.37936089123424216, + "grad_norm": 5.342643623289217, + "learning_rate": 5.3094958968347e-05, + "loss": 4.234793663024902, + "step": 647, + "token_acc": 0.20469448023915765 + }, + { + "epoch": 0.37994722955145116, + "grad_norm": 9.50816321744696, + "learning_rate": 5.31770222743259e-05, + "loss": 4.23974609375, + "step": 648, + "token_acc": 0.20590203865223747 + }, + { + "epoch": 0.3805335678686602, + "grad_norm": 5.2672766197213825, + "learning_rate": 5.3259085580304805e-05, + "loss": 4.329718589782715, + "step": 649, + "token_acc": 0.19572213440159827 + }, + { + "epoch": 0.3811199061858693, + "grad_norm": 9.291106055736, + "learning_rate": 5.33411488862837e-05, + "loss": 4.272111415863037, + "step": 650, + "token_acc": 0.19994436259105475 + }, + { + "epoch": 0.3817062445030783, + "grad_norm": 6.0741692853216955, + "learning_rate": 5.34232121922626e-05, + "loss": 4.19622802734375, + "step": 651, + "token_acc": 0.20987739410787173 + }, + { + "epoch": 0.38229258282028733, + "grad_norm": 5.533836567441832, + "learning_rate": 5.35052754982415e-05, + "loss": 4.302114486694336, + "step": 652, + "token_acc": 0.19659415260826232 + }, + { + "epoch": 0.38287892113749633, + "grad_norm": 8.553847965407476, + "learning_rate": 5.35873388042204e-05, + "loss": 4.287426471710205, + "step": 653, + "token_acc": 0.1989726405714917 + }, + { + "epoch": 0.3834652594547054, + "grad_norm": 4.4105232895566955, + "learning_rate": 5.366940211019929e-05, + "loss": 4.216601848602295, + "step": 654, + "token_acc": 0.20663760948393198 + }, + { + "epoch": 0.3840515977719144, + "grad_norm": 9.113729704927863, + "learning_rate": 5.3751465416178186e-05, + "loss": 4.238519191741943, + "step": 655, + "token_acc": 0.2031450603686175 + }, + { + "epoch": 0.38463793608912344, + "grad_norm": 5.348400808975778, + "learning_rate": 5.3833528722157084e-05, + "loss": 4.238986968994141, + "step": 656, + "token_acc": 0.20360236938589654 + }, + { + "epoch": 0.38522427440633245, + "grad_norm": 6.764859900160404, + "learning_rate": 5.391559202813598e-05, + "loss": 4.250058174133301, + "step": 657, + "token_acc": 0.2034909072254854 + }, + { + "epoch": 0.3858106127235415, + "grad_norm": 4.256537835605083, + "learning_rate": 5.399765533411489e-05, + "loss": 4.2425761222839355, + "step": 658, + "token_acc": 0.20315329264004284 + }, + { + "epoch": 0.3863969510407505, + "grad_norm": 7.047299068881363, + "learning_rate": 5.4079718640093785e-05, + "loss": 4.261916160583496, + "step": 659, + "token_acc": 0.20068792978959793 + }, + { + "epoch": 0.38698328935795956, + "grad_norm": 4.769386620516134, + "learning_rate": 5.416178194607268e-05, + "loss": 4.211328506469727, + "step": 660, + "token_acc": 0.20559451571854195 + }, + { + "epoch": 0.38756962767516856, + "grad_norm": 6.098845066360906, + "learning_rate": 5.424384525205158e-05, + "loss": 4.264926910400391, + "step": 661, + "token_acc": 0.20190850527195128 + }, + { + "epoch": 0.3881559659923776, + "grad_norm": 4.837044068288691, + "learning_rate": 5.432590855803047e-05, + "loss": 4.256516933441162, + "step": 662, + "token_acc": 0.20202830876988181 + }, + { + "epoch": 0.3887423043095866, + "grad_norm": 6.70982110798902, + "learning_rate": 5.440797186400937e-05, + "loss": 4.154748916625977, + "step": 663, + "token_acc": 0.21118555815459436 + }, + { + "epoch": 0.3893286426267957, + "grad_norm": 7.461742114936183, + "learning_rate": 5.449003516998827e-05, + "loss": 4.203394889831543, + "step": 664, + "token_acc": 0.20551437753333893 + }, + { + "epoch": 0.3899149809440047, + "grad_norm": 4.652825065191578, + "learning_rate": 5.4572098475967166e-05, + "loss": 4.2032647132873535, + "step": 665, + "token_acc": 0.20763021202619733 + }, + { + "epoch": 0.39050131926121373, + "grad_norm": 8.540272248397173, + "learning_rate": 5.465416178194607e-05, + "loss": 4.235311508178711, + "step": 666, + "token_acc": 0.20420387010641627 + }, + { + "epoch": 0.39108765757842273, + "grad_norm": 4.2331679056094265, + "learning_rate": 5.473622508792497e-05, + "loss": 4.260921478271484, + "step": 667, + "token_acc": 0.20075133197919878 + }, + { + "epoch": 0.3916739958956318, + "grad_norm": 6.576679074719142, + "learning_rate": 5.481828839390387e-05, + "loss": 4.147568702697754, + "step": 668, + "token_acc": 0.21375431565410727 + }, + { + "epoch": 0.3922603342128408, + "grad_norm": 4.80209191626738, + "learning_rate": 5.4900351699882765e-05, + "loss": 4.245433807373047, + "step": 669, + "token_acc": 0.20293622955242924 + }, + { + "epoch": 0.39284667253004985, + "grad_norm": 7.736238777858105, + "learning_rate": 5.498241500586166e-05, + "loss": 4.2774505615234375, + "step": 670, + "token_acc": 0.19808356935840904 + }, + { + "epoch": 0.39343301084725885, + "grad_norm": 8.579043790049862, + "learning_rate": 5.5064478311840554e-05, + "loss": 4.195842266082764, + "step": 671, + "token_acc": 0.20660908415556695 + }, + { + "epoch": 0.3940193491644679, + "grad_norm": 2.705451206256234, + "learning_rate": 5.514654161781945e-05, + "loss": 4.185248374938965, + "step": 672, + "token_acc": 0.20905620646553288 + }, + { + "epoch": 0.3946056874816769, + "grad_norm": 11.431402440916541, + "learning_rate": 5.522860492379835e-05, + "loss": 4.206908226013184, + "step": 673, + "token_acc": 0.20610974220385236 + }, + { + "epoch": 0.39519202579888596, + "grad_norm": 5.285335074542644, + "learning_rate": 5.531066822977725e-05, + "loss": 4.256257057189941, + "step": 674, + "token_acc": 0.20515903548467027 + }, + { + "epoch": 0.39577836411609496, + "grad_norm": 8.987733859903123, + "learning_rate": 5.539273153575615e-05, + "loss": 4.28242301940918, + "step": 675, + "token_acc": 0.20150640555637023 + }, + { + "epoch": 0.396364702433304, + "grad_norm": 6.249525903352653, + "learning_rate": 5.547479484173505e-05, + "loss": 4.207487106323242, + "step": 676, + "token_acc": 0.20964671472919927 + }, + { + "epoch": 0.3969510407505131, + "grad_norm": 5.0884343977261555, + "learning_rate": 5.555685814771395e-05, + "loss": 4.203760147094727, + "step": 677, + "token_acc": 0.20659704165577983 + }, + { + "epoch": 0.3975373790677221, + "grad_norm": 6.560956502373544, + "learning_rate": 5.563892145369285e-05, + "loss": 4.175498962402344, + "step": 678, + "token_acc": 0.2076709570113264 + }, + { + "epoch": 0.39812371738493113, + "grad_norm": 3.21155361480233, + "learning_rate": 5.5720984759671745e-05, + "loss": 4.247404098510742, + "step": 679, + "token_acc": 0.2023355250410138 + }, + { + "epoch": 0.39871005570214013, + "grad_norm": 6.000142226397825, + "learning_rate": 5.5803048065650636e-05, + "loss": 4.17470645904541, + "step": 680, + "token_acc": 0.2079712739351207 + }, + { + "epoch": 0.3992963940193492, + "grad_norm": 5.163044478363358, + "learning_rate": 5.5885111371629534e-05, + "loss": 4.334084987640381, + "step": 681, + "token_acc": 0.19304747956385798 + }, + { + "epoch": 0.3998827323365582, + "grad_norm": 5.775827465594905, + "learning_rate": 5.596717467760843e-05, + "loss": 4.2259931564331055, + "step": 682, + "token_acc": 0.20339967663744066 + }, + { + "epoch": 0.40046907065376725, + "grad_norm": 5.469309809611569, + "learning_rate": 5.604923798358733e-05, + "loss": 4.236342906951904, + "step": 683, + "token_acc": 0.20294456159865285 + }, + { + "epoch": 0.40105540897097625, + "grad_norm": 6.1674367504282115, + "learning_rate": 5.6131301289566235e-05, + "loss": 4.219335079193115, + "step": 684, + "token_acc": 0.20542775465816773 + }, + { + "epoch": 0.4016417472881853, + "grad_norm": 4.261188541523941, + "learning_rate": 5.621336459554513e-05, + "loss": 4.162544250488281, + "step": 685, + "token_acc": 0.21062752469296167 + }, + { + "epoch": 0.4022280856053943, + "grad_norm": 6.12411599441717, + "learning_rate": 5.629542790152403e-05, + "loss": 4.204689025878906, + "step": 686, + "token_acc": 0.2051478187277743 + }, + { + "epoch": 0.40281442392260336, + "grad_norm": 3.808443114456628, + "learning_rate": 5.637749120750293e-05, + "loss": 4.190600395202637, + "step": 687, + "token_acc": 0.20551752781527305 + }, + { + "epoch": 0.40340076223981236, + "grad_norm": 6.082695933506258, + "learning_rate": 5.6459554513481827e-05, + "loss": 4.199838638305664, + "step": 688, + "token_acc": 0.20602594579908987 + }, + { + "epoch": 0.4039871005570214, + "grad_norm": 5.498151956905652, + "learning_rate": 5.654161781946072e-05, + "loss": 4.201581954956055, + "step": 689, + "token_acc": 0.20699494128131288 + }, + { + "epoch": 0.4045734388742304, + "grad_norm": 5.769371998160246, + "learning_rate": 5.6623681125439616e-05, + "loss": 4.167081832885742, + "step": 690, + "token_acc": 0.20919080611007174 + }, + { + "epoch": 0.4051597771914395, + "grad_norm": 4.8127302195745525, + "learning_rate": 5.6705744431418514e-05, + "loss": 4.205621719360352, + "step": 691, + "token_acc": 0.205567081604426 + }, + { + "epoch": 0.4057461155086485, + "grad_norm": 4.711062674769272, + "learning_rate": 5.678780773739741e-05, + "loss": 4.227962017059326, + "step": 692, + "token_acc": 0.2018501459617256 + }, + { + "epoch": 0.40633245382585753, + "grad_norm": 6.382862690871602, + "learning_rate": 5.6869871043376316e-05, + "loss": 4.214358329772949, + "step": 693, + "token_acc": 0.20356120810979939 + }, + { + "epoch": 0.40691879214306653, + "grad_norm": 3.7484614915595973, + "learning_rate": 5.6951934349355214e-05, + "loss": 4.232317924499512, + "step": 694, + "token_acc": 0.2041442127991581 + }, + { + "epoch": 0.4075051304602756, + "grad_norm": 5.007036338560286, + "learning_rate": 5.703399765533411e-05, + "loss": 4.104582786560059, + "step": 695, + "token_acc": 0.21457855845784782 + }, + { + "epoch": 0.4080914687774846, + "grad_norm": 5.231535436627986, + "learning_rate": 5.711606096131301e-05, + "loss": 4.158563613891602, + "step": 696, + "token_acc": 0.2091188954339574 + }, + { + "epoch": 0.40867780709469365, + "grad_norm": 6.681352838638222, + "learning_rate": 5.719812426729191e-05, + "loss": 4.135995864868164, + "step": 697, + "token_acc": 0.21215096186999424 + }, + { + "epoch": 0.40926414541190265, + "grad_norm": 4.323274110154445, + "learning_rate": 5.72801875732708e-05, + "loss": 4.179586887359619, + "step": 698, + "token_acc": 0.20552419570244235 + }, + { + "epoch": 0.4098504837291117, + "grad_norm": 6.85420066927111, + "learning_rate": 5.73622508792497e-05, + "loss": 4.090293884277344, + "step": 699, + "token_acc": 0.21603218320659723 + }, + { + "epoch": 0.4104368220463207, + "grad_norm": 3.4930822274653766, + "learning_rate": 5.7444314185228596e-05, + "loss": 4.1796956062316895, + "step": 700, + "token_acc": 0.20468234788397927 + }, + { + "epoch": 0.41102316036352976, + "grad_norm": 8.955607941594574, + "learning_rate": 5.75263774912075e-05, + "loss": 4.1959004402160645, + "step": 701, + "token_acc": 0.20585980010592858 + }, + { + "epoch": 0.41160949868073876, + "grad_norm": 5.488655143025368, + "learning_rate": 5.76084407971864e-05, + "loss": 4.216902256011963, + "step": 702, + "token_acc": 0.20418105061764338 + }, + { + "epoch": 0.4121958369979478, + "grad_norm": 6.5665338828052775, + "learning_rate": 5.7690504103165296e-05, + "loss": 4.160950660705566, + "step": 703, + "token_acc": 0.20898828272647096 + }, + { + "epoch": 0.4127821753151568, + "grad_norm": 4.827311090143434, + "learning_rate": 5.7772567409144194e-05, + "loss": 4.191999912261963, + "step": 704, + "token_acc": 0.2044706303762662 + }, + { + "epoch": 0.4133685136323659, + "grad_norm": 5.340756247730568, + "learning_rate": 5.785463071512309e-05, + "loss": 4.205822944641113, + "step": 705, + "token_acc": 0.20248091304279758 + }, + { + "epoch": 0.41395485194957493, + "grad_norm": 6.400983175884303, + "learning_rate": 5.793669402110199e-05, + "loss": 4.137466907501221, + "step": 706, + "token_acc": 0.2097769155150084 + }, + { + "epoch": 0.41454119026678393, + "grad_norm": 6.701433086702472, + "learning_rate": 5.801875732708088e-05, + "loss": 4.122692584991455, + "step": 707, + "token_acc": 0.21178068848845258 + }, + { + "epoch": 0.415127528583993, + "grad_norm": 3.724613052876181, + "learning_rate": 5.810082063305978e-05, + "loss": 4.093698501586914, + "step": 708, + "token_acc": 0.21447327281832024 + }, + { + "epoch": 0.415713866901202, + "grad_norm": 7.013574370295882, + "learning_rate": 5.818288393903868e-05, + "loss": 4.14458703994751, + "step": 709, + "token_acc": 0.2128691778309926 + }, + { + "epoch": 0.41630020521841105, + "grad_norm": 4.433594166391518, + "learning_rate": 5.826494724501758e-05, + "loss": 4.145415306091309, + "step": 710, + "token_acc": 0.2111799756552615 + }, + { + "epoch": 0.41688654353562005, + "grad_norm": 8.562807618790616, + "learning_rate": 5.834701055099648e-05, + "loss": 4.145606994628906, + "step": 711, + "token_acc": 0.20886160618505178 + }, + { + "epoch": 0.4174728818528291, + "grad_norm": 4.71640556750639, + "learning_rate": 5.842907385697538e-05, + "loss": 4.19354248046875, + "step": 712, + "token_acc": 0.2035779857954017 + }, + { + "epoch": 0.4180592201700381, + "grad_norm": 5.065617495900307, + "learning_rate": 5.8511137162954276e-05, + "loss": 4.19216251373291, + "step": 713, + "token_acc": 0.2055668735366375 + }, + { + "epoch": 0.41864555848724716, + "grad_norm": 5.549692737992384, + "learning_rate": 5.8593200468933174e-05, + "loss": 4.141023635864258, + "step": 714, + "token_acc": 0.2081619019753553 + }, + { + "epoch": 0.41923189680445616, + "grad_norm": 5.114101547372383, + "learning_rate": 5.867526377491207e-05, + "loss": 4.121971130371094, + "step": 715, + "token_acc": 0.2113343052803417 + }, + { + "epoch": 0.4198182351216652, + "grad_norm": 5.579266668043171, + "learning_rate": 5.875732708089096e-05, + "loss": 4.0913543701171875, + "step": 716, + "token_acc": 0.21559033885629417 + }, + { + "epoch": 0.4204045734388742, + "grad_norm": 4.694940547126315, + "learning_rate": 5.883939038686986e-05, + "loss": 4.136467933654785, + "step": 717, + "token_acc": 0.2098276066076417 + }, + { + "epoch": 0.4209909117560833, + "grad_norm": 6.340555808406843, + "learning_rate": 5.892145369284876e-05, + "loss": 4.15889835357666, + "step": 718, + "token_acc": 0.20600056624092758 + }, + { + "epoch": 0.4215772500732923, + "grad_norm": 5.213191570752816, + "learning_rate": 5.9003516998827664e-05, + "loss": 4.099459648132324, + "step": 719, + "token_acc": 0.2142941638823681 + }, + { + "epoch": 0.42216358839050133, + "grad_norm": 6.278967182497455, + "learning_rate": 5.908558030480656e-05, + "loss": 4.174129486083984, + "step": 720, + "token_acc": 0.20631060544328553 + }, + { + "epoch": 0.42274992670771033, + "grad_norm": 3.709784921330175, + "learning_rate": 5.916764361078546e-05, + "loss": 4.042458534240723, + "step": 721, + "token_acc": 0.21835256529561387 + }, + { + "epoch": 0.4233362650249194, + "grad_norm": 5.586953604909075, + "learning_rate": 5.924970691676436e-05, + "loss": 4.143787860870361, + "step": 722, + "token_acc": 0.20750663498293112 + }, + { + "epoch": 0.4239226033421284, + "grad_norm": 4.56629087830607, + "learning_rate": 5.9331770222743256e-05, + "loss": 4.133569240570068, + "step": 723, + "token_acc": 0.21038678435699398 + }, + { + "epoch": 0.42450894165933745, + "grad_norm": 4.036103425279676, + "learning_rate": 5.9413833528722154e-05, + "loss": 4.041327476501465, + "step": 724, + "token_acc": 0.21928318147847795 + }, + { + "epoch": 0.42509527997654645, + "grad_norm": 6.767492736506062, + "learning_rate": 5.9495896834701045e-05, + "loss": 4.161279678344727, + "step": 725, + "token_acc": 0.20512148042465697 + }, + { + "epoch": 0.4256816182937555, + "grad_norm": 5.065260523826766, + "learning_rate": 5.957796014067994e-05, + "loss": 4.168001651763916, + "step": 726, + "token_acc": 0.20429929458020712 + }, + { + "epoch": 0.4262679566109645, + "grad_norm": 5.531995663876098, + "learning_rate": 5.966002344665884e-05, + "loss": 4.085506916046143, + "step": 727, + "token_acc": 0.2132195431706141 + }, + { + "epoch": 0.42685429492817356, + "grad_norm": 6.537170283925281, + "learning_rate": 5.9742086752637746e-05, + "loss": 4.111567497253418, + "step": 728, + "token_acc": 0.2124138106830145 + }, + { + "epoch": 0.42744063324538256, + "grad_norm": 3.926897959552154, + "learning_rate": 5.9824150058616644e-05, + "loss": 4.084894180297852, + "step": 729, + "token_acc": 0.21293772447408926 + }, + { + "epoch": 0.4280269715625916, + "grad_norm": 7.690922528933665, + "learning_rate": 5.990621336459554e-05, + "loss": 4.120464324951172, + "step": 730, + "token_acc": 0.21219283546098913 + }, + { + "epoch": 0.4286133098798006, + "grad_norm": 3.8756421542125663, + "learning_rate": 5.998827667057444e-05, + "loss": 4.135234355926514, + "step": 731, + "token_acc": 0.20832390745501286 + }, + { + "epoch": 0.4291996481970097, + "grad_norm": 7.445468777171998, + "learning_rate": 6.007033997655334e-05, + "loss": 4.1082563400268555, + "step": 732, + "token_acc": 0.21236238617953557 + }, + { + "epoch": 0.42978598651421873, + "grad_norm": 3.820678732926357, + "learning_rate": 6.0152403282532236e-05, + "loss": 4.090451717376709, + "step": 733, + "token_acc": 0.21370788934399346 + }, + { + "epoch": 0.43037232483142773, + "grad_norm": 5.710618810201329, + "learning_rate": 6.023446658851113e-05, + "loss": 4.08793830871582, + "step": 734, + "token_acc": 0.21196567528739338 + }, + { + "epoch": 0.4309586631486368, + "grad_norm": 5.392673999942736, + "learning_rate": 6.0316529894490025e-05, + "loss": 4.105639457702637, + "step": 735, + "token_acc": 0.2126338972870157 + }, + { + "epoch": 0.4315450014658458, + "grad_norm": 4.971434822494723, + "learning_rate": 6.039859320046893e-05, + "loss": 4.056774139404297, + "step": 736, + "token_acc": 0.21927300915111567 + }, + { + "epoch": 0.43213133978305485, + "grad_norm": 6.004185320984899, + "learning_rate": 6.048065650644783e-05, + "loss": 4.115375518798828, + "step": 737, + "token_acc": 0.2123379002489912 + }, + { + "epoch": 0.43271767810026385, + "grad_norm": 4.8816387426639976, + "learning_rate": 6.0562719812426726e-05, + "loss": 4.111758708953857, + "step": 738, + "token_acc": 0.2102325605920395 + }, + { + "epoch": 0.4333040164174729, + "grad_norm": 6.426680953167651, + "learning_rate": 6.0644783118405624e-05, + "loss": 4.107548713684082, + "step": 739, + "token_acc": 0.21164426724969454 + }, + { + "epoch": 0.4338903547346819, + "grad_norm": 4.226173696181125, + "learning_rate": 6.072684642438452e-05, + "loss": 4.060413837432861, + "step": 740, + "token_acc": 0.2172739256525178 + }, + { + "epoch": 0.43447669305189096, + "grad_norm": 7.618018312018148, + "learning_rate": 6.080890973036342e-05, + "loss": 4.167675018310547, + "step": 741, + "token_acc": 0.20574997066885672 + }, + { + "epoch": 0.43506303136909996, + "grad_norm": 3.7076965117358864, + "learning_rate": 6.089097303634231e-05, + "loss": 4.0514020919799805, + "step": 742, + "token_acc": 0.21745280754978163 + }, + { + "epoch": 0.435649369686309, + "grad_norm": 6.403849352684368, + "learning_rate": 6.097303634232121e-05, + "loss": 4.10659122467041, + "step": 743, + "token_acc": 0.2127811103285217 + }, + { + "epoch": 0.436235708003518, + "grad_norm": 4.9477706004904825, + "learning_rate": 6.105509964830011e-05, + "loss": 4.063077449798584, + "step": 744, + "token_acc": 0.2155527225439876 + }, + { + "epoch": 0.4368220463207271, + "grad_norm": 5.70651725041729, + "learning_rate": 6.113716295427901e-05, + "loss": 4.0971269607543945, + "step": 745, + "token_acc": 0.21257264007560953 + }, + { + "epoch": 0.4374083846379361, + "grad_norm": 8.201175989736933, + "learning_rate": 6.121922626025791e-05, + "loss": 4.1412858963012695, + "step": 746, + "token_acc": 0.20583142301571436 + }, + { + "epoch": 0.43799472295514513, + "grad_norm": 4.315864901252941, + "learning_rate": 6.130128956623681e-05, + "loss": 4.070844650268555, + "step": 747, + "token_acc": 0.2150485116493961 + }, + { + "epoch": 0.43858106127235413, + "grad_norm": 6.678621435289561, + "learning_rate": 6.13833528722157e-05, + "loss": 4.155675411224365, + "step": 748, + "token_acc": 0.2047260192436092 + }, + { + "epoch": 0.4391673995895632, + "grad_norm": 4.288474693404013, + "learning_rate": 6.14654161781946e-05, + "loss": 4.099294662475586, + "step": 749, + "token_acc": 0.21298414635729773 + }, + { + "epoch": 0.4397537379067722, + "grad_norm": 4.825265843377594, + "learning_rate": 6.15474794841735e-05, + "loss": 4.111228942871094, + "step": 750, + "token_acc": 0.21178295227946528 + }, + { + "epoch": 0.44034007622398125, + "grad_norm": 6.647912642882177, + "learning_rate": 6.16295427901524e-05, + "loss": 4.127750396728516, + "step": 751, + "token_acc": 0.2068447357818744 + }, + { + "epoch": 0.44092641454119025, + "grad_norm": 6.61583689021098, + "learning_rate": 6.17116060961313e-05, + "loss": 4.114223480224609, + "step": 752, + "token_acc": 0.20892537923285154 + }, + { + "epoch": 0.4415127528583993, + "grad_norm": 4.480109992919529, + "learning_rate": 6.17936694021102e-05, + "loss": 4.079311847686768, + "step": 753, + "token_acc": 0.21533014157792182 + }, + { + "epoch": 0.4420990911756083, + "grad_norm": 6.888122259844035, + "learning_rate": 6.18757327080891e-05, + "loss": 4.046022891998291, + "step": 754, + "token_acc": 0.21643093730627871 + }, + { + "epoch": 0.44268542949281736, + "grad_norm": 2.9024707980486264, + "learning_rate": 6.195779601406799e-05, + "loss": 4.097957611083984, + "step": 755, + "token_acc": 0.21023586934017105 + }, + { + "epoch": 0.44327176781002636, + "grad_norm": 7.1683543663645, + "learning_rate": 6.203985932004689e-05, + "loss": 4.111137390136719, + "step": 756, + "token_acc": 0.21366301584842798 + }, + { + "epoch": 0.4438581061272354, + "grad_norm": 5.228513188408095, + "learning_rate": 6.212192262602579e-05, + "loss": 4.120720863342285, + "step": 757, + "token_acc": 0.20936401430820153 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 6.695585981623289, + "learning_rate": 6.220398593200469e-05, + "loss": 4.129217624664307, + "step": 758, + "token_acc": 0.20681253133879324 + }, + { + "epoch": 0.4450307827616535, + "grad_norm": 4.440925726101526, + "learning_rate": 6.228604923798358e-05, + "loss": 4.070681095123291, + "step": 759, + "token_acc": 0.21466587207607432 + }, + { + "epoch": 0.4456171210788625, + "grad_norm": 4.8071648886887886, + "learning_rate": 6.236811254396248e-05, + "loss": 4.104011058807373, + "step": 760, + "token_acc": 0.21046202895250318 + }, + { + "epoch": 0.44620345939607153, + "grad_norm": 5.303969375265496, + "learning_rate": 6.245017584994138e-05, + "loss": 4.077654838562012, + "step": 761, + "token_acc": 0.21338909925718733 + }, + { + "epoch": 0.4467897977132806, + "grad_norm": 4.2044613693246795, + "learning_rate": 6.253223915592028e-05, + "loss": 4.057465553283691, + "step": 762, + "token_acc": 0.214498190518995 + }, + { + "epoch": 0.4473761360304896, + "grad_norm": 5.908054074818977, + "learning_rate": 6.261430246189918e-05, + "loss": 4.061487197875977, + "step": 763, + "token_acc": 0.21570638765708755 + }, + { + "epoch": 0.44796247434769865, + "grad_norm": 6.021553993864116, + "learning_rate": 6.269636576787807e-05, + "loss": 4.042322158813477, + "step": 764, + "token_acc": 0.21750293562730066 + }, + { + "epoch": 0.44854881266490765, + "grad_norm": 3.63104793768578, + "learning_rate": 6.277842907385697e-05, + "loss": 4.083230972290039, + "step": 765, + "token_acc": 0.21296144776764037 + }, + { + "epoch": 0.4491351509821167, + "grad_norm": 7.472090332995207, + "learning_rate": 6.286049237983587e-05, + "loss": 4.041102886199951, + "step": 766, + "token_acc": 0.21816202405949708 + }, + { + "epoch": 0.4497214892993257, + "grad_norm": 4.0288501598641115, + "learning_rate": 6.294255568581477e-05, + "loss": 4.090574264526367, + "step": 767, + "token_acc": 0.2127265497188395 + }, + { + "epoch": 0.45030782761653476, + "grad_norm": 7.074433521578454, + "learning_rate": 6.302461899179367e-05, + "loss": 4.0869951248168945, + "step": 768, + "token_acc": 0.21064017051419362 + }, + { + "epoch": 0.45089416593374376, + "grad_norm": 4.16343708212507, + "learning_rate": 6.310668229777256e-05, + "loss": 4.129717826843262, + "step": 769, + "token_acc": 0.20743499842522964 + }, + { + "epoch": 0.4514805042509528, + "grad_norm": 5.087280517204008, + "learning_rate": 6.318874560375146e-05, + "loss": 4.055534839630127, + "step": 770, + "token_acc": 0.21638147863370388 + }, + { + "epoch": 0.4520668425681618, + "grad_norm": 4.912551123275517, + "learning_rate": 6.327080890973036e-05, + "loss": 4.04734992980957, + "step": 771, + "token_acc": 0.21546405803586352 + }, + { + "epoch": 0.4526531808853709, + "grad_norm": 6.012292623593857, + "learning_rate": 6.335287221570926e-05, + "loss": 4.0179877281188965, + "step": 772, + "token_acc": 0.22023658505645058 + }, + { + "epoch": 0.4532395192025799, + "grad_norm": 3.618287335602584, + "learning_rate": 6.343493552168816e-05, + "loss": 4.069624900817871, + "step": 773, + "token_acc": 0.2154240159494792 + }, + { + "epoch": 0.45382585751978893, + "grad_norm": 6.118985284309547, + "learning_rate": 6.351699882766705e-05, + "loss": 4.104743480682373, + "step": 774, + "token_acc": 0.2114108018788139 + }, + { + "epoch": 0.45441219583699793, + "grad_norm": 4.964537114713395, + "learning_rate": 6.359906213364595e-05, + "loss": 4.118830680847168, + "step": 775, + "token_acc": 0.20860223547533474 + }, + { + "epoch": 0.454998534154207, + "grad_norm": 4.17806497393013, + "learning_rate": 6.368112543962485e-05, + "loss": 4.059308052062988, + "step": 776, + "token_acc": 0.21496380446775845 + }, + { + "epoch": 0.455584872471416, + "grad_norm": 6.644856595094865, + "learning_rate": 6.376318874560375e-05, + "loss": 4.054924011230469, + "step": 777, + "token_acc": 0.21495542103249865 + }, + { + "epoch": 0.45617121078862505, + "grad_norm": 3.240986415940476, + "learning_rate": 6.384525205158265e-05, + "loss": 4.056221008300781, + "step": 778, + "token_acc": 0.21613459645935396 + }, + { + "epoch": 0.45675754910583405, + "grad_norm": 7.305868384031052, + "learning_rate": 6.392731535756154e-05, + "loss": 4.071720123291016, + "step": 779, + "token_acc": 0.21205278553559848 + }, + { + "epoch": 0.4573438874230431, + "grad_norm": 6.1282390778907745, + "learning_rate": 6.400937866354044e-05, + "loss": 4.075984477996826, + "step": 780, + "token_acc": 0.2104517708277205 + }, + { + "epoch": 0.4579302257402521, + "grad_norm": 4.092787147119571, + "learning_rate": 6.409144196951934e-05, + "loss": 4.057436943054199, + "step": 781, + "token_acc": 0.21637315836440124 + }, + { + "epoch": 0.45851656405746116, + "grad_norm": 5.960452762804376, + "learning_rate": 6.417350527549824e-05, + "loss": 4.043334484100342, + "step": 782, + "token_acc": 0.21579216397883813 + }, + { + "epoch": 0.45910290237467016, + "grad_norm": 3.5712552289131394, + "learning_rate": 6.425556858147713e-05, + "loss": 4.0843186378479, + "step": 783, + "token_acc": 0.21234321468166292 + }, + { + "epoch": 0.4596892406918792, + "grad_norm": 5.729685171301163, + "learning_rate": 6.433763188745603e-05, + "loss": 4.095596790313721, + "step": 784, + "token_acc": 0.2111191831874677 + }, + { + "epoch": 0.4602755790090882, + "grad_norm": 3.3307347079350316, + "learning_rate": 6.441969519343493e-05, + "loss": 4.078511714935303, + "step": 785, + "token_acc": 0.21295191160050822 + }, + { + "epoch": 0.4608619173262973, + "grad_norm": 4.240882934666269, + "learning_rate": 6.450175849941383e-05, + "loss": 4.046321868896484, + "step": 786, + "token_acc": 0.21484360771580366 + }, + { + "epoch": 0.4614482556435063, + "grad_norm": 4.699549985134434, + "learning_rate": 6.458382180539273e-05, + "loss": 4.104073524475098, + "step": 787, + "token_acc": 0.2084332714556068 + }, + { + "epoch": 0.46203459396071533, + "grad_norm": 5.481704256863831, + "learning_rate": 6.466588511137162e-05, + "loss": 4.05832576751709, + "step": 788, + "token_acc": 0.2149072259261436 + }, + { + "epoch": 0.46262093227792433, + "grad_norm": 2.817924037726149, + "learning_rate": 6.474794841735052e-05, + "loss": 4.053046703338623, + "step": 789, + "token_acc": 0.21451936812902425 + }, + { + "epoch": 0.4632072705951334, + "grad_norm": 5.691301326157076, + "learning_rate": 6.483001172332942e-05, + "loss": 4.066107273101807, + "step": 790, + "token_acc": 0.21437340442471967 + }, + { + "epoch": 0.46379360891234245, + "grad_norm": 3.378327341464904, + "learning_rate": 6.491207502930832e-05, + "loss": 4.1215620040893555, + "step": 791, + "token_acc": 0.20711647764477853 + }, + { + "epoch": 0.46437994722955145, + "grad_norm": 4.460702793020244, + "learning_rate": 6.499413833528722e-05, + "loss": 4.027914047241211, + "step": 792, + "token_acc": 0.21829832775919733 + }, + { + "epoch": 0.4649662855467605, + "grad_norm": 5.464669142349539, + "learning_rate": 6.507620164126611e-05, + "loss": 4.102588653564453, + "step": 793, + "token_acc": 0.21014650393452708 + }, + { + "epoch": 0.4655526238639695, + "grad_norm": 4.448583527409169, + "learning_rate": 6.515826494724501e-05, + "loss": 4.055959224700928, + "step": 794, + "token_acc": 0.2134151901042458 + }, + { + "epoch": 0.46613896218117856, + "grad_norm": 5.1349201607474715, + "learning_rate": 6.524032825322391e-05, + "loss": 4.034310817718506, + "step": 795, + "token_acc": 0.21668292853346244 + }, + { + "epoch": 0.46672530049838756, + "grad_norm": 4.5210699795669695, + "learning_rate": 6.532239155920281e-05, + "loss": 4.062445163726807, + "step": 796, + "token_acc": 0.21575042959527235 + }, + { + "epoch": 0.4673116388155966, + "grad_norm": 4.811041422453679, + "learning_rate": 6.54044548651817e-05, + "loss": 4.036968231201172, + "step": 797, + "token_acc": 0.21730169991917522 + }, + { + "epoch": 0.4678979771328056, + "grad_norm": 4.68037780882389, + "learning_rate": 6.54865181711606e-05, + "loss": 3.966832399368286, + "step": 798, + "token_acc": 0.22272376882524278 + }, + { + "epoch": 0.4684843154500147, + "grad_norm": 3.832528698351509, + "learning_rate": 6.55685814771395e-05, + "loss": 4.03481388092041, + "step": 799, + "token_acc": 0.21444375256693915 + }, + { + "epoch": 0.4690706537672237, + "grad_norm": 5.085002612796332, + "learning_rate": 6.56506447831184e-05, + "loss": 4.040844440460205, + "step": 800, + "token_acc": 0.21513813918991306 + }, + { + "epoch": 0.46965699208443273, + "grad_norm": 5.159114561565329, + "learning_rate": 6.57327080890973e-05, + "loss": 4.041948318481445, + "step": 801, + "token_acc": 0.21609032413414847 + }, + { + "epoch": 0.47024333040164173, + "grad_norm": 3.5855464618679904, + "learning_rate": 6.58147713950762e-05, + "loss": 4.031923770904541, + "step": 802, + "token_acc": 0.21435230731986216 + }, + { + "epoch": 0.4708296687188508, + "grad_norm": 5.753062004348817, + "learning_rate": 6.58968347010551e-05, + "loss": 4.0522003173828125, + "step": 803, + "token_acc": 0.2155308616660226 + }, + { + "epoch": 0.4714160070360598, + "grad_norm": 3.394091310811631, + "learning_rate": 6.597889800703399e-05, + "loss": 4.077088356018066, + "step": 804, + "token_acc": 0.2103382382542002 + }, + { + "epoch": 0.47200234535326885, + "grad_norm": 5.790928520527392, + "learning_rate": 6.606096131301289e-05, + "loss": 4.04644250869751, + "step": 805, + "token_acc": 0.21362232295260344 + }, + { + "epoch": 0.47258868367047785, + "grad_norm": 2.8794334226316685, + "learning_rate": 6.614302461899179e-05, + "loss": 3.968930959701538, + "step": 806, + "token_acc": 0.2215706449841017 + }, + { + "epoch": 0.4731750219876869, + "grad_norm": 5.777484653725544, + "learning_rate": 6.622508792497069e-05, + "loss": 4.076010704040527, + "step": 807, + "token_acc": 0.2109606115349116 + }, + { + "epoch": 0.4737613603048959, + "grad_norm": 4.027667007149946, + "learning_rate": 6.630715123094958e-05, + "loss": 4.050105571746826, + "step": 808, + "token_acc": 0.2139537168205648 + }, + { + "epoch": 0.47434769862210496, + "grad_norm": 4.694283129384224, + "learning_rate": 6.638921453692848e-05, + "loss": 4.021165370941162, + "step": 809, + "token_acc": 0.2166883293459035 + }, + { + "epoch": 0.47493403693931396, + "grad_norm": 5.634513434650093, + "learning_rate": 6.647127784290738e-05, + "loss": 4.004396438598633, + "step": 810, + "token_acc": 0.21737749348689822 + }, + { + "epoch": 0.475520375256523, + "grad_norm": 4.874597054510726, + "learning_rate": 6.655334114888628e-05, + "loss": 4.058430194854736, + "step": 811, + "token_acc": 0.21351653327688716 + }, + { + "epoch": 0.476106713573732, + "grad_norm": 4.246625045347158, + "learning_rate": 6.663540445486518e-05, + "loss": 4.004892826080322, + "step": 812, + "token_acc": 0.21960231944952863 + }, + { + "epoch": 0.4766930518909411, + "grad_norm": 3.2497735868631965, + "learning_rate": 6.671746776084407e-05, + "loss": 3.9837310314178467, + "step": 813, + "token_acc": 0.21899919476679208 + }, + { + "epoch": 0.4772793902081501, + "grad_norm": 5.341321448805372, + "learning_rate": 6.679953106682297e-05, + "loss": 4.0097270011901855, + "step": 814, + "token_acc": 0.21651922246906585 + }, + { + "epoch": 0.47786572852535913, + "grad_norm": 3.6718191727682497, + "learning_rate": 6.688159437280187e-05, + "loss": 4.033107757568359, + "step": 815, + "token_acc": 0.21488122536297774 + }, + { + "epoch": 0.47845206684256814, + "grad_norm": 7.305765591297953, + "learning_rate": 6.696365767878077e-05, + "loss": 4.046302318572998, + "step": 816, + "token_acc": 0.21270874719641142 + }, + { + "epoch": 0.4790384051597772, + "grad_norm": 2.6863816673479763, + "learning_rate": 6.704572098475967e-05, + "loss": 4.0154900550842285, + "step": 817, + "token_acc": 0.21801591727074568 + }, + { + "epoch": 0.47962474347698625, + "grad_norm": 6.438865388948921, + "learning_rate": 6.712778429073856e-05, + "loss": 4.00985860824585, + "step": 818, + "token_acc": 0.21753190755432908 + }, + { + "epoch": 0.48021108179419525, + "grad_norm": 4.259731133756506, + "learning_rate": 6.720984759671746e-05, + "loss": 4.011630058288574, + "step": 819, + "token_acc": 0.2180763419859866 + }, + { + "epoch": 0.4807974201114043, + "grad_norm": 4.7477933044214415, + "learning_rate": 6.729191090269636e-05, + "loss": 4.006157875061035, + "step": 820, + "token_acc": 0.2164464293936185 + }, + { + "epoch": 0.4813837584286133, + "grad_norm": 3.2981815112000246, + "learning_rate": 6.737397420867526e-05, + "loss": 4.033952713012695, + "step": 821, + "token_acc": 0.214432444369123 + }, + { + "epoch": 0.48197009674582236, + "grad_norm": 4.612507856748187, + "learning_rate": 6.745603751465416e-05, + "loss": 4.019974231719971, + "step": 822, + "token_acc": 0.21820074826408517 + }, + { + "epoch": 0.48255643506303136, + "grad_norm": 3.836725240344396, + "learning_rate": 6.753810082063305e-05, + "loss": 3.980194330215454, + "step": 823, + "token_acc": 0.22201967517880658 + }, + { + "epoch": 0.4831427733802404, + "grad_norm": 4.802984966882959, + "learning_rate": 6.762016412661195e-05, + "loss": 3.9957339763641357, + "step": 824, + "token_acc": 0.21802652458568472 + }, + { + "epoch": 0.4837291116974494, + "grad_norm": 5.152519858861021, + "learning_rate": 6.770222743259085e-05, + "loss": 4.057910442352295, + "step": 825, + "token_acc": 0.21181657164281373 + }, + { + "epoch": 0.4843154500146585, + "grad_norm": 4.433238928229318, + "learning_rate": 6.778429073856975e-05, + "loss": 3.9672060012817383, + "step": 826, + "token_acc": 0.22388096135746582 + }, + { + "epoch": 0.4849017883318675, + "grad_norm": 4.520959270076583, + "learning_rate": 6.786635404454865e-05, + "loss": 3.940217971801758, + "step": 827, + "token_acc": 0.22445812261790385 + }, + { + "epoch": 0.48548812664907653, + "grad_norm": 6.788972715607243, + "learning_rate": 6.794841735052754e-05, + "loss": 4.062726020812988, + "step": 828, + "token_acc": 0.21115039758137993 + }, + { + "epoch": 0.48607446496628554, + "grad_norm": 3.0956771067886115, + "learning_rate": 6.803048065650644e-05, + "loss": 3.9951138496398926, + "step": 829, + "token_acc": 0.2190778666082576 + }, + { + "epoch": 0.4866608032834946, + "grad_norm": 5.696698618129569, + "learning_rate": 6.811254396248534e-05, + "loss": 3.990934371948242, + "step": 830, + "token_acc": 0.21920582528774726 + }, + { + "epoch": 0.4872471416007036, + "grad_norm": 3.3943526120013923, + "learning_rate": 6.819460726846424e-05, + "loss": 3.961142063140869, + "step": 831, + "token_acc": 0.22282085337455385 + }, + { + "epoch": 0.48783347991791265, + "grad_norm": 7.034190513970132, + "learning_rate": 6.827667057444314e-05, + "loss": 4.0496745109558105, + "step": 832, + "token_acc": 0.21266195417128528 + }, + { + "epoch": 0.48841981823512165, + "grad_norm": 3.2203863755841224, + "learning_rate": 6.835873388042203e-05, + "loss": 3.9616713523864746, + "step": 833, + "token_acc": 0.22296148152814085 + }, + { + "epoch": 0.4890061565523307, + "grad_norm": 4.992999536862018, + "learning_rate": 6.844079718640093e-05, + "loss": 3.9896860122680664, + "step": 834, + "token_acc": 0.21847508496621218 + }, + { + "epoch": 0.4895924948695397, + "grad_norm": 5.217503769547477, + "learning_rate": 6.852286049237983e-05, + "loss": 3.9951581954956055, + "step": 835, + "token_acc": 0.2189320836908455 + }, + { + "epoch": 0.49017883318674876, + "grad_norm": 4.802373612412434, + "learning_rate": 6.860492379835873e-05, + "loss": 3.9440159797668457, + "step": 836, + "token_acc": 0.22156611107870328 + }, + { + "epoch": 0.49076517150395776, + "grad_norm": 4.439798899640177, + "learning_rate": 6.868698710433763e-05, + "loss": 4.035882472991943, + "step": 837, + "token_acc": 0.21453252216919874 + }, + { + "epoch": 0.4913515098211668, + "grad_norm": 4.45858539117359, + "learning_rate": 6.876905041031652e-05, + "loss": 3.9638586044311523, + "step": 838, + "token_acc": 0.22277777777777777 + }, + { + "epoch": 0.4919378481383758, + "grad_norm": 3.8991037432482925, + "learning_rate": 6.885111371629542e-05, + "loss": 3.9965476989746094, + "step": 839, + "token_acc": 0.21630768664815053 + }, + { + "epoch": 0.4925241864555849, + "grad_norm": 5.294590930701878, + "learning_rate": 6.893317702227432e-05, + "loss": 3.9517555236816406, + "step": 840, + "token_acc": 0.22350560107807188 + }, + { + "epoch": 0.4931105247727939, + "grad_norm": 3.8762030871395527, + "learning_rate": 6.901524032825322e-05, + "loss": 3.9987030029296875, + "step": 841, + "token_acc": 0.21646050643745843 + }, + { + "epoch": 0.49369686309000294, + "grad_norm": 5.226671133508383, + "learning_rate": 6.909730363423212e-05, + "loss": 3.972357988357544, + "step": 842, + "token_acc": 0.22039892121837037 + }, + { + "epoch": 0.49428320140721194, + "grad_norm": 5.248159644147203, + "learning_rate": 6.917936694021101e-05, + "loss": 4.009156227111816, + "step": 843, + "token_acc": 0.21540408332861163 + }, + { + "epoch": 0.494869539724421, + "grad_norm": 3.0625152638377724, + "learning_rate": 6.926143024618991e-05, + "loss": 3.9520890712738037, + "step": 844, + "token_acc": 0.22420237200367493 + }, + { + "epoch": 0.49545587804163, + "grad_norm": 5.2969751167623995, + "learning_rate": 6.934349355216881e-05, + "loss": 3.9215798377990723, + "step": 845, + "token_acc": 0.22605102943293773 + }, + { + "epoch": 0.49604221635883905, + "grad_norm": 3.893484568704019, + "learning_rate": 6.942555685814771e-05, + "loss": 3.9572455883026123, + "step": 846, + "token_acc": 0.21943391387588967 + }, + { + "epoch": 0.4966285546760481, + "grad_norm": 6.524297086600854, + "learning_rate": 6.95076201641266e-05, + "loss": 4.00451135635376, + "step": 847, + "token_acc": 0.21761363066397826 + }, + { + "epoch": 0.4972148929932571, + "grad_norm": 3.109434153817786, + "learning_rate": 6.95896834701055e-05, + "loss": 3.995995044708252, + "step": 848, + "token_acc": 0.2191871889617463 + }, + { + "epoch": 0.49780123131046616, + "grad_norm": 5.226130412008868, + "learning_rate": 6.96717467760844e-05, + "loss": 4.031238555908203, + "step": 849, + "token_acc": 0.21236272769119485 + }, + { + "epoch": 0.49838756962767516, + "grad_norm": 3.7466784467786693, + "learning_rate": 6.97538100820633e-05, + "loss": 4.011455059051514, + "step": 850, + "token_acc": 0.21532591334330553 + }, + { + "epoch": 0.4989739079448842, + "grad_norm": 7.173364308176404, + "learning_rate": 6.98358733880422e-05, + "loss": 3.9455344676971436, + "step": 851, + "token_acc": 0.22367477435893424 + }, + { + "epoch": 0.4995602462620932, + "grad_norm": 3.3513706818245423, + "learning_rate": 6.99179366940211e-05, + "loss": 4.007432460784912, + "step": 852, + "token_acc": 0.2182759632185081 + }, + { + "epoch": 0.5001465845793023, + "grad_norm": 8.6649703776723, + "learning_rate": 7e-05, + "loss": 3.9926390647888184, + "step": 853, + "token_acc": 0.2181222064266595 + }, + { + "epoch": 0.5007329228965113, + "grad_norm": 4.352675286888637, + "learning_rate": 7.008206330597889e-05, + "loss": 4.01979398727417, + "step": 854, + "token_acc": 0.21553492646597247 + }, + { + "epoch": 0.5013192612137203, + "grad_norm": 5.711007515364585, + "learning_rate": 7.016412661195779e-05, + "loss": 4.0128350257873535, + "step": 855, + "token_acc": 0.21589557635105247 + }, + { + "epoch": 0.5019055995309294, + "grad_norm": 3.656445345419733, + "learning_rate": 7.024618991793669e-05, + "loss": 4.00874137878418, + "step": 856, + "token_acc": 0.21513901765827978 + }, + { + "epoch": 0.5024919378481384, + "grad_norm": 4.057422859025177, + "learning_rate": 7.032825322391559e-05, + "loss": 3.9652767181396484, + "step": 857, + "token_acc": 0.2221678747417772 + }, + { + "epoch": 0.5030782761653474, + "grad_norm": 3.444897759941114, + "learning_rate": 7.041031652989448e-05, + "loss": 3.9911670684814453, + "step": 858, + "token_acc": 0.21638694559265656 + }, + { + "epoch": 0.5036646144825564, + "grad_norm": 5.750083563618263, + "learning_rate": 7.049237983587338e-05, + "loss": 4.0121235847473145, + "step": 859, + "token_acc": 0.21741492002027107 + }, + { + "epoch": 0.5042509527997655, + "grad_norm": 3.4633672135683065, + "learning_rate": 7.057444314185228e-05, + "loss": 3.987497091293335, + "step": 860, + "token_acc": 0.2168935219814372 + }, + { + "epoch": 0.5048372911169745, + "grad_norm": 4.033214313903988, + "learning_rate": 7.065650644783119e-05, + "loss": 3.9413931369781494, + "step": 861, + "token_acc": 0.22270985017071285 + }, + { + "epoch": 0.5054236294341835, + "grad_norm": 4.14091094745713, + "learning_rate": 7.073856975381006e-05, + "loss": 3.9866116046905518, + "step": 862, + "token_acc": 0.21899040242108717 + }, + { + "epoch": 0.5060099677513925, + "grad_norm": 4.440484227663608, + "learning_rate": 7.082063305978897e-05, + "loss": 3.9983103275299072, + "step": 863, + "token_acc": 0.2147575009195523 + }, + { + "epoch": 0.5065963060686016, + "grad_norm": 3.7214046582832365, + "learning_rate": 7.090269636576787e-05, + "loss": 4.019493103027344, + "step": 864, + "token_acc": 0.21327359866638884 + }, + { + "epoch": 0.5071826443858106, + "grad_norm": 4.486378213749062, + "learning_rate": 7.098475967174677e-05, + "loss": 3.9570109844207764, + "step": 865, + "token_acc": 0.2214040674995458 + }, + { + "epoch": 0.5077689827030196, + "grad_norm": 3.8467668693372308, + "learning_rate": 7.106682297772567e-05, + "loss": 3.9893617630004883, + "step": 866, + "token_acc": 0.21912038986823898 + }, + { + "epoch": 0.5083553210202286, + "grad_norm": 6.146675121610778, + "learning_rate": 7.114888628370457e-05, + "loss": 3.934000015258789, + "step": 867, + "token_acc": 0.22261524087744924 + }, + { + "epoch": 0.5089416593374377, + "grad_norm": 3.0193792674239983, + "learning_rate": 7.123094958968346e-05, + "loss": 3.879106283187866, + "step": 868, + "token_acc": 0.22801801036445501 + }, + { + "epoch": 0.5095279976546467, + "grad_norm": 7.929730089888444, + "learning_rate": 7.131301289566236e-05, + "loss": 4.037492275238037, + "step": 869, + "token_acc": 0.2142709155614767 + }, + { + "epoch": 0.5101143359718557, + "grad_norm": 5.21060678523679, + "learning_rate": 7.139507620164126e-05, + "loss": 3.9994630813598633, + "step": 870, + "token_acc": 0.2173769510107253 + }, + { + "epoch": 0.5107006742890647, + "grad_norm": 5.409436630914686, + "learning_rate": 7.147713950762016e-05, + "loss": 4.012307167053223, + "step": 871, + "token_acc": 0.21459939365400182 + }, + { + "epoch": 0.5112870126062738, + "grad_norm": 4.270792580029859, + "learning_rate": 7.155920281359906e-05, + "loss": 3.9235074520111084, + "step": 872, + "token_acc": 0.22711250979461303 + }, + { + "epoch": 0.5118733509234829, + "grad_norm": 3.721081466540487, + "learning_rate": 7.164126611957795e-05, + "loss": 4.00079345703125, + "step": 873, + "token_acc": 0.2172611024664437 + }, + { + "epoch": 0.5124596892406919, + "grad_norm": 5.233746634953643, + "learning_rate": 7.172332942555685e-05, + "loss": 3.932919979095459, + "step": 874, + "token_acc": 0.22385943166569064 + }, + { + "epoch": 0.513046027557901, + "grad_norm": 3.7464398839309943, + "learning_rate": 7.180539273153575e-05, + "loss": 3.9187636375427246, + "step": 875, + "token_acc": 0.22386681119372223 + }, + { + "epoch": 0.51363236587511, + "grad_norm": 4.442836358862631, + "learning_rate": 7.188745603751465e-05, + "loss": 3.9152987003326416, + "step": 876, + "token_acc": 0.2252617148554337 + }, + { + "epoch": 0.514218704192319, + "grad_norm": 3.975531814815956, + "learning_rate": 7.196951934349355e-05, + "loss": 3.9355785846710205, + "step": 877, + "token_acc": 0.22226019232297545 + }, + { + "epoch": 0.514805042509528, + "grad_norm": 4.541750775651291, + "learning_rate": 7.205158264947244e-05, + "loss": 3.9372718334198, + "step": 878, + "token_acc": 0.22283512504183875 + }, + { + "epoch": 0.5153913808267371, + "grad_norm": 3.878191100706564, + "learning_rate": 7.213364595545136e-05, + "loss": 3.9745213985443115, + "step": 879, + "token_acc": 0.2193806413737433 + }, + { + "epoch": 0.5159777191439461, + "grad_norm": 5.779776923028234, + "learning_rate": 7.221570926143023e-05, + "loss": 3.8926379680633545, + "step": 880, + "token_acc": 0.2300731855641999 + }, + { + "epoch": 0.5165640574611551, + "grad_norm": 2.8259193465927885, + "learning_rate": 7.229777256740914e-05, + "loss": 3.958523750305176, + "step": 881, + "token_acc": 0.219617717397754 + }, + { + "epoch": 0.5171503957783641, + "grad_norm": 5.953905079652252, + "learning_rate": 7.237983587338804e-05, + "loss": 3.906073570251465, + "step": 882, + "token_acc": 0.22284724890194899 + }, + { + "epoch": 0.5177367340955732, + "grad_norm": 3.424066460604576, + "learning_rate": 7.246189917936693e-05, + "loss": 3.9637608528137207, + "step": 883, + "token_acc": 0.21793221029060003 + }, + { + "epoch": 0.5183230724127822, + "grad_norm": 4.431494527412378, + "learning_rate": 7.254396248534583e-05, + "loss": 3.9044528007507324, + "step": 884, + "token_acc": 0.22711755670245587 + }, + { + "epoch": 0.5189094107299912, + "grad_norm": 5.0536939877353975, + "learning_rate": 7.262602579132473e-05, + "loss": 3.941391944885254, + "step": 885, + "token_acc": 0.22255097168747523 + }, + { + "epoch": 0.5194957490472002, + "grad_norm": 3.0585335755153644, + "learning_rate": 7.270808909730363e-05, + "loss": 3.953786849975586, + "step": 886, + "token_acc": 0.2189842872142502 + }, + { + "epoch": 0.5200820873644093, + "grad_norm": 3.58658304994689, + "learning_rate": 7.279015240328253e-05, + "loss": 3.9502034187316895, + "step": 887, + "token_acc": 0.22080998769047921 + }, + { + "epoch": 0.5206684256816183, + "grad_norm": 4.022615966478019, + "learning_rate": 7.287221570926142e-05, + "loss": 3.869863510131836, + "step": 888, + "token_acc": 0.23007992897579133 + }, + { + "epoch": 0.5212547639988273, + "grad_norm": 4.933834732727239, + "learning_rate": 7.295427901524032e-05, + "loss": 3.8581924438476562, + "step": 889, + "token_acc": 0.2322468294128856 + }, + { + "epoch": 0.5218411023160363, + "grad_norm": 4.2477553037008855, + "learning_rate": 7.303634232121922e-05, + "loss": 4.047770023345947, + "step": 890, + "token_acc": 0.21254070412383255 + }, + { + "epoch": 0.5224274406332454, + "grad_norm": 2.993079493007284, + "learning_rate": 7.311840562719812e-05, + "loss": 3.9408206939697266, + "step": 891, + "token_acc": 0.22034887392792635 + }, + { + "epoch": 0.5230137789504544, + "grad_norm": 4.310028473311131, + "learning_rate": 7.320046893317702e-05, + "loss": 3.8987066745758057, + "step": 892, + "token_acc": 0.2246339106518992 + }, + { + "epoch": 0.5236001172676634, + "grad_norm": 5.338960084915028, + "learning_rate": 7.328253223915591e-05, + "loss": 3.980449914932251, + "step": 893, + "token_acc": 0.2169134098795536 + }, + { + "epoch": 0.5241864555848724, + "grad_norm": 3.019209666074229, + "learning_rate": 7.336459554513481e-05, + "loss": 3.9784231185913086, + "step": 894, + "token_acc": 0.21574862131759417 + }, + { + "epoch": 0.5247727939020815, + "grad_norm": 4.8939575307265555, + "learning_rate": 7.344665885111371e-05, + "loss": 3.898198127746582, + "step": 895, + "token_acc": 0.2253466074405409 + }, + { + "epoch": 0.5253591322192905, + "grad_norm": 2.8546093667038, + "learning_rate": 7.352872215709262e-05, + "loss": 3.893986940383911, + "step": 896, + "token_acc": 0.22656573742095062 + }, + { + "epoch": 0.5259454705364995, + "grad_norm": 5.857708422466417, + "learning_rate": 7.361078546307152e-05, + "loss": 3.9416933059692383, + "step": 897, + "token_acc": 0.2220271912423012 + }, + { + "epoch": 0.5265318088537085, + "grad_norm": 3.4329769467322953, + "learning_rate": 7.36928487690504e-05, + "loss": 3.989224910736084, + "step": 898, + "token_acc": 0.21665791512214902 + }, + { + "epoch": 0.5271181471709177, + "grad_norm": 5.8285255496060255, + "learning_rate": 7.37749120750293e-05, + "loss": 3.931288719177246, + "step": 899, + "token_acc": 0.22245900400513866 + }, + { + "epoch": 0.5277044854881267, + "grad_norm": 3.8255927671258037, + "learning_rate": 7.38569753810082e-05, + "loss": 3.903797149658203, + "step": 900, + "token_acc": 0.22568931148689367 + }, + { + "epoch": 0.5282908238053357, + "grad_norm": 2.3302054348179198, + "learning_rate": 7.39390386869871e-05, + "loss": 3.8834705352783203, + "step": 901, + "token_acc": 0.2272481622758921 + }, + { + "epoch": 0.5288771621225447, + "grad_norm": 5.917167165962022, + "learning_rate": 7.4021101992966e-05, + "loss": 3.9495420455932617, + "step": 902, + "token_acc": 0.22068242166755178 + }, + { + "epoch": 0.5294635004397538, + "grad_norm": 2.8763906550265412, + "learning_rate": 7.410316529894489e-05, + "loss": 4.001081943511963, + "step": 903, + "token_acc": 0.21539195507272058 + }, + { + "epoch": 0.5300498387569628, + "grad_norm": 5.001622948569647, + "learning_rate": 7.418522860492379e-05, + "loss": 4.000034332275391, + "step": 904, + "token_acc": 0.21476615409950803 + }, + { + "epoch": 0.5306361770741718, + "grad_norm": 3.571959051065895, + "learning_rate": 7.426729191090269e-05, + "loss": 3.98193359375, + "step": 905, + "token_acc": 0.2158800052902232 + }, + { + "epoch": 0.5312225153913809, + "grad_norm": 4.314495024250772, + "learning_rate": 7.434935521688159e-05, + "loss": 3.954977035522461, + "step": 906, + "token_acc": 0.22094877416782938 + }, + { + "epoch": 0.5318088537085899, + "grad_norm": 4.760160344986089, + "learning_rate": 7.443141852286049e-05, + "loss": 3.905460834503174, + "step": 907, + "token_acc": 0.22663310559750688 + }, + { + "epoch": 0.5323951920257989, + "grad_norm": 3.375661926320089, + "learning_rate": 7.451348182883938e-05, + "loss": 3.933032512664795, + "step": 908, + "token_acc": 0.2227987739415911 + }, + { + "epoch": 0.5329815303430079, + "grad_norm": 4.1356343859429545, + "learning_rate": 7.459554513481828e-05, + "loss": 3.887661933898926, + "step": 909, + "token_acc": 0.2270420162551823 + }, + { + "epoch": 0.533567868660217, + "grad_norm": 3.366607829254808, + "learning_rate": 7.467760844079718e-05, + "loss": 3.954080104827881, + "step": 910, + "token_acc": 0.22044739869615237 + }, + { + "epoch": 0.534154206977426, + "grad_norm": 5.549424512471356, + "learning_rate": 7.475967174677608e-05, + "loss": 4.005929470062256, + "step": 911, + "token_acc": 0.21477800310888295 + }, + { + "epoch": 0.534740545294635, + "grad_norm": 2.600919493494448, + "learning_rate": 7.484173505275497e-05, + "loss": 3.960184335708618, + "step": 912, + "token_acc": 0.21650387049878764 + }, + { + "epoch": 0.535326883611844, + "grad_norm": 4.502918341638091, + "learning_rate": 7.492379835873389e-05, + "loss": 3.9495348930358887, + "step": 913, + "token_acc": 0.221415117582077 + }, + { + "epoch": 0.5359132219290531, + "grad_norm": 3.498191326307574, + "learning_rate": 7.500586166471278e-05, + "loss": 3.9135985374450684, + "step": 914, + "token_acc": 0.22394118570726196 + }, + { + "epoch": 0.5364995602462621, + "grad_norm": 3.593785183875704, + "learning_rate": 7.508792497069168e-05, + "loss": 3.905601739883423, + "step": 915, + "token_acc": 0.22396632947065132 + }, + { + "epoch": 0.5370858985634711, + "grad_norm": 5.52076900255028, + "learning_rate": 7.516998827667057e-05, + "loss": 3.892564535140991, + "step": 916, + "token_acc": 0.22547355833449947 + }, + { + "epoch": 0.5376722368806801, + "grad_norm": 3.1674819562606937, + "learning_rate": 7.525205158264946e-05, + "loss": 3.930121421813965, + "step": 917, + "token_acc": 0.22316433710118405 + }, + { + "epoch": 0.5382585751978892, + "grad_norm": 3.588282321328592, + "learning_rate": 7.533411488862836e-05, + "loss": 3.8929686546325684, + "step": 918, + "token_acc": 0.2257181907267007 + }, + { + "epoch": 0.5388449135150982, + "grad_norm": 4.956463348598123, + "learning_rate": 7.541617819460726e-05, + "loss": 3.944598436355591, + "step": 919, + "token_acc": 0.21990550352147428 + }, + { + "epoch": 0.5394312518323072, + "grad_norm": 4.316020753094943, + "learning_rate": 7.549824150058616e-05, + "loss": 3.9181320667266846, + "step": 920, + "token_acc": 0.22260710062759603 + }, + { + "epoch": 0.5400175901495162, + "grad_norm": 3.840725134227517, + "learning_rate": 7.558030480656506e-05, + "loss": 3.9240145683288574, + "step": 921, + "token_acc": 0.2196389186973895 + }, + { + "epoch": 0.5406039284667253, + "grad_norm": 3.3237048216771026, + "learning_rate": 7.566236811254395e-05, + "loss": 3.911754608154297, + "step": 922, + "token_acc": 0.22117295066930895 + }, + { + "epoch": 0.5411902667839343, + "grad_norm": 3.6250707529186417, + "learning_rate": 7.574443141852285e-05, + "loss": 3.8985207080841064, + "step": 923, + "token_acc": 0.22436844635132008 + }, + { + "epoch": 0.5417766051011433, + "grad_norm": 3.7120036571529282, + "learning_rate": 7.582649472450175e-05, + "loss": 3.9529762268066406, + "step": 924, + "token_acc": 0.21985204921583917 + }, + { + "epoch": 0.5423629434183523, + "grad_norm": 5.383160656316933, + "learning_rate": 7.590855803048065e-05, + "loss": 3.9192419052124023, + "step": 925, + "token_acc": 0.22281855239198808 + }, + { + "epoch": 0.5429492817355615, + "grad_norm": 2.0527480363140653, + "learning_rate": 7.599062133645955e-05, + "loss": 3.891284942626953, + "step": 926, + "token_acc": 0.2236781882410655 + }, + { + "epoch": 0.5435356200527705, + "grad_norm": 3.263266165618146, + "learning_rate": 7.607268464243844e-05, + "loss": 3.945679187774658, + "step": 927, + "token_acc": 0.21928222878494386 + }, + { + "epoch": 0.5441219583699795, + "grad_norm": 4.234682571124678, + "learning_rate": 7.615474794841734e-05, + "loss": 3.9352431297302246, + "step": 928, + "token_acc": 0.2190277792083389 + }, + { + "epoch": 0.5447082966871885, + "grad_norm": 2.761705785590396, + "learning_rate": 7.623681125439624e-05, + "loss": 3.8988049030303955, + "step": 929, + "token_acc": 0.22568798797944506 + }, + { + "epoch": 0.5452946350043976, + "grad_norm": 3.9690195712761156, + "learning_rate": 7.631887456037514e-05, + "loss": 3.8889317512512207, + "step": 930, + "token_acc": 0.22487847367103397 + }, + { + "epoch": 0.5458809733216066, + "grad_norm": 2.777011776409667, + "learning_rate": 7.640093786635405e-05, + "loss": 3.9464635848999023, + "step": 931, + "token_acc": 0.219765797981268 + }, + { + "epoch": 0.5464673116388156, + "grad_norm": 3.552419808478299, + "learning_rate": 7.648300117233295e-05, + "loss": 3.8892805576324463, + "step": 932, + "token_acc": 0.22564620538391222 + }, + { + "epoch": 0.5470536499560247, + "grad_norm": 3.1801561150165143, + "learning_rate": 7.656506447831183e-05, + "loss": 3.900843620300293, + "step": 933, + "token_acc": 0.2246993504683916 + }, + { + "epoch": 0.5476399882732337, + "grad_norm": 3.968305321540962, + "learning_rate": 7.664712778429073e-05, + "loss": 3.9216039180755615, + "step": 934, + "token_acc": 0.22265100542415012 + }, + { + "epoch": 0.5482263265904427, + "grad_norm": 2.72321068246262, + "learning_rate": 7.672919109026963e-05, + "loss": 3.8249402046203613, + "step": 935, + "token_acc": 0.23361378695773394 + }, + { + "epoch": 0.5488126649076517, + "grad_norm": 4.338261740074034, + "learning_rate": 7.681125439624853e-05, + "loss": 3.9207706451416016, + "step": 936, + "token_acc": 0.2226716113944028 + }, + { + "epoch": 0.5493990032248608, + "grad_norm": 4.395011774319159, + "learning_rate": 7.689331770222742e-05, + "loss": 3.9280495643615723, + "step": 937, + "token_acc": 0.22203949096793854 + }, + { + "epoch": 0.5499853415420698, + "grad_norm": 3.325537024141444, + "learning_rate": 7.697538100820632e-05, + "loss": 3.904790163040161, + "step": 938, + "token_acc": 0.2250743838050709 + }, + { + "epoch": 0.5505716798592788, + "grad_norm": 3.9823258684187266, + "learning_rate": 7.705744431418522e-05, + "loss": 3.8971245288848877, + "step": 939, + "token_acc": 0.22593879428760585 + }, + { + "epoch": 0.5511580181764878, + "grad_norm": 3.944714517481396, + "learning_rate": 7.713950762016412e-05, + "loss": 3.9638924598693848, + "step": 940, + "token_acc": 0.21506544130728603 + }, + { + "epoch": 0.5517443564936969, + "grad_norm": 3.2877293615052383, + "learning_rate": 7.722157092614302e-05, + "loss": 3.872718572616577, + "step": 941, + "token_acc": 0.22773351209689124 + }, + { + "epoch": 0.5523306948109059, + "grad_norm": 4.8399644203222065, + "learning_rate": 7.730363423212191e-05, + "loss": 3.9278793334960938, + "step": 942, + "token_acc": 0.22157682775712514 + }, + { + "epoch": 0.5529170331281149, + "grad_norm": 2.68554868312657, + "learning_rate": 7.738569753810081e-05, + "loss": 3.8825113773345947, + "step": 943, + "token_acc": 0.22638048113562748 + }, + { + "epoch": 0.5535033714453239, + "grad_norm": 2.736919658190876, + "learning_rate": 7.746776084407971e-05, + "loss": 3.8771533966064453, + "step": 944, + "token_acc": 0.22490796693755644 + }, + { + "epoch": 0.554089709762533, + "grad_norm": 3.279178671179111, + "learning_rate": 7.754982415005861e-05, + "loss": 3.896569013595581, + "step": 945, + "token_acc": 0.22379683409739348 + }, + { + "epoch": 0.554676048079742, + "grad_norm": 3.7235756234502313, + "learning_rate": 7.76318874560375e-05, + "loss": 3.8837943077087402, + "step": 946, + "token_acc": 0.22533231724199762 + }, + { + "epoch": 0.555262386396951, + "grad_norm": 3.1662908855190537, + "learning_rate": 7.77139507620164e-05, + "loss": 3.8931336402893066, + "step": 947, + "token_acc": 0.22274568874573683 + }, + { + "epoch": 0.55584872471416, + "grad_norm": 5.197714669975523, + "learning_rate": 7.779601406799532e-05, + "loss": 3.898205280303955, + "step": 948, + "token_acc": 0.22474386720540732 + }, + { + "epoch": 0.5564350630313691, + "grad_norm": 3.554308146990438, + "learning_rate": 7.787807737397421e-05, + "loss": 3.8957624435424805, + "step": 949, + "token_acc": 0.2242974035314944 + }, + { + "epoch": 0.5570214013485781, + "grad_norm": 3.3562448081353855, + "learning_rate": 7.796014067995311e-05, + "loss": 3.9175848960876465, + "step": 950, + "token_acc": 0.2214336955655339 + }, + { + "epoch": 0.5576077396657871, + "grad_norm": 3.253693877911425, + "learning_rate": 7.8042203985932e-05, + "loss": 3.893742561340332, + "step": 951, + "token_acc": 0.2237078837642612 + }, + { + "epoch": 0.5581940779829961, + "grad_norm": 3.415461906251086, + "learning_rate": 7.81242672919109e-05, + "loss": 3.864027976989746, + "step": 952, + "token_acc": 0.2274428251762737 + }, + { + "epoch": 0.5587804163002053, + "grad_norm": 3.34311488115709, + "learning_rate": 7.820633059788979e-05, + "loss": 3.9224228858947754, + "step": 953, + "token_acc": 0.22049115949680975 + }, + { + "epoch": 0.5593667546174143, + "grad_norm": 3.55119858655416, + "learning_rate": 7.828839390386869e-05, + "loss": 3.8433961868286133, + "step": 954, + "token_acc": 0.22821853817715113 + }, + { + "epoch": 0.5599530929346233, + "grad_norm": 3.6379440858234435, + "learning_rate": 7.837045720984759e-05, + "loss": 3.845787525177002, + "step": 955, + "token_acc": 0.2277142050326816 + }, + { + "epoch": 0.5605394312518323, + "grad_norm": 4.911104202124572, + "learning_rate": 7.845252051582649e-05, + "loss": 3.793720245361328, + "step": 956, + "token_acc": 0.23277526493334635 + }, + { + "epoch": 0.5611257695690414, + "grad_norm": 3.0046046696018176, + "learning_rate": 7.853458382180538e-05, + "loss": 3.8267722129821777, + "step": 957, + "token_acc": 0.23047471819645732 + }, + { + "epoch": 0.5617121078862504, + "grad_norm": 4.197325105947007, + "learning_rate": 7.861664712778428e-05, + "loss": 3.818058967590332, + "step": 958, + "token_acc": 0.23168694696112738 + }, + { + "epoch": 0.5622984462034594, + "grad_norm": 3.4418132419606637, + "learning_rate": 7.869871043376318e-05, + "loss": 3.8434574604034424, + "step": 959, + "token_acc": 0.2277691542187773 + }, + { + "epoch": 0.5628847845206685, + "grad_norm": 4.028887928871811, + "learning_rate": 7.878077373974208e-05, + "loss": 3.855584144592285, + "step": 960, + "token_acc": 0.22577280805302963 + }, + { + "epoch": 0.5634711228378775, + "grad_norm": 3.1334857441013084, + "learning_rate": 7.886283704572098e-05, + "loss": 3.834045886993408, + "step": 961, + "token_acc": 0.2291132213766788 + }, + { + "epoch": 0.5640574611550865, + "grad_norm": 3.4515580992555503, + "learning_rate": 7.894490035169987e-05, + "loss": 3.851886749267578, + "step": 962, + "token_acc": 0.22691975619533153 + }, + { + "epoch": 0.5646437994722955, + "grad_norm": 4.453338764321934, + "learning_rate": 7.902696365767877e-05, + "loss": 3.8006951808929443, + "step": 963, + "token_acc": 0.2355371156799247 + }, + { + "epoch": 0.5652301377895046, + "grad_norm": 4.533303865172233, + "learning_rate": 7.910902696365767e-05, + "loss": 3.811504602432251, + "step": 964, + "token_acc": 0.22935611183030616 + }, + { + "epoch": 0.5658164761067136, + "grad_norm": 3.221479116110373, + "learning_rate": 7.919109026963657e-05, + "loss": 3.838282346725464, + "step": 965, + "token_acc": 0.2277996781040797 + }, + { + "epoch": 0.5664028144239226, + "grad_norm": 5.345395767924592, + "learning_rate": 7.927315357561548e-05, + "loss": 3.852888584136963, + "step": 966, + "token_acc": 0.22853371041270334 + }, + { + "epoch": 0.5669891527411316, + "grad_norm": 2.6754339870035375, + "learning_rate": 7.935521688159438e-05, + "loss": 3.899423122406006, + "step": 967, + "token_acc": 0.220977424646992 + }, + { + "epoch": 0.5675754910583407, + "grad_norm": 8.317987045475705, + "learning_rate": 7.943728018757328e-05, + "loss": 3.886955738067627, + "step": 968, + "token_acc": 0.22348943524855572 + }, + { + "epoch": 0.5681618293755497, + "grad_norm": 4.085085559551292, + "learning_rate": 7.951934349355216e-05, + "loss": 3.8788070678710938, + "step": 969, + "token_acc": 0.22676565486024436 + }, + { + "epoch": 0.5687481676927587, + "grad_norm": 7.174908146785812, + "learning_rate": 7.960140679953106e-05, + "loss": 3.93218994140625, + "step": 970, + "token_acc": 0.21808026150029122 + }, + { + "epoch": 0.5693345060099677, + "grad_norm": 4.421127123008212, + "learning_rate": 7.968347010550996e-05, + "loss": 3.8866190910339355, + "step": 971, + "token_acc": 0.22490683014408314 + }, + { + "epoch": 0.5699208443271768, + "grad_norm": 5.367228969842158, + "learning_rate": 7.976553341148885e-05, + "loss": 3.8557634353637695, + "step": 972, + "token_acc": 0.22785068424902213 + }, + { + "epoch": 0.5705071826443858, + "grad_norm": 3.16810781283421, + "learning_rate": 7.984759671746775e-05, + "loss": 3.9037954807281494, + "step": 973, + "token_acc": 0.22018437157385987 + }, + { + "epoch": 0.5710935209615948, + "grad_norm": 3.8105249774460224, + "learning_rate": 7.992966002344665e-05, + "loss": 3.8523316383361816, + "step": 974, + "token_acc": 0.22604303874422266 + }, + { + "epoch": 0.5716798592788038, + "grad_norm": 3.4622471120927036, + "learning_rate": 8.001172332942555e-05, + "loss": 3.8481078147888184, + "step": 975, + "token_acc": 0.22952883161013612 + }, + { + "epoch": 0.5722661975960129, + "grad_norm": 3.639228196611874, + "learning_rate": 8.009378663540445e-05, + "loss": 3.8009986877441406, + "step": 976, + "token_acc": 0.233648143500601 + }, + { + "epoch": 0.5728525359132219, + "grad_norm": 5.00103382334742, + "learning_rate": 8.017584994138334e-05, + "loss": 3.814574718475342, + "step": 977, + "token_acc": 0.23216061226573795 + }, + { + "epoch": 0.5734388742304309, + "grad_norm": 3.7071355160869612, + "learning_rate": 8.025791324736224e-05, + "loss": 3.8253519535064697, + "step": 978, + "token_acc": 0.2289901702472639 + }, + { + "epoch": 0.5740252125476399, + "grad_norm": 4.481567337595048, + "learning_rate": 8.033997655334114e-05, + "loss": 3.8796417713165283, + "step": 979, + "token_acc": 0.22221097029321965 + }, + { + "epoch": 0.574611550864849, + "grad_norm": 3.237650655814511, + "learning_rate": 8.042203985932004e-05, + "loss": 3.833104133605957, + "step": 980, + "token_acc": 0.22838650627615062 + }, + { + "epoch": 0.575197889182058, + "grad_norm": 3.2423057510132693, + "learning_rate": 8.050410316529894e-05, + "loss": 3.889162063598633, + "step": 981, + "token_acc": 0.22233752924370095 + }, + { + "epoch": 0.575784227499267, + "grad_norm": 3.513320179650478, + "learning_rate": 8.058616647127783e-05, + "loss": 3.827302932739258, + "step": 982, + "token_acc": 0.2291965557167383 + }, + { + "epoch": 0.576370565816476, + "grad_norm": 4.732291375856808, + "learning_rate": 8.066822977725675e-05, + "loss": 3.8552141189575195, + "step": 983, + "token_acc": 0.2254483786422735 + }, + { + "epoch": 0.5769569041336852, + "grad_norm": 2.5648213631552346, + "learning_rate": 8.075029308323564e-05, + "loss": 3.7717976570129395, + "step": 984, + "token_acc": 0.23429037452165366 + }, + { + "epoch": 0.5775432424508942, + "grad_norm": 4.971729409570693, + "learning_rate": 8.083235638921454e-05, + "loss": 3.7867424488067627, + "step": 985, + "token_acc": 0.23247293877745084 + }, + { + "epoch": 0.5781295807681032, + "grad_norm": 3.76378411267868, + "learning_rate": 8.091441969519344e-05, + "loss": 3.799938678741455, + "step": 986, + "token_acc": 0.2325908520973163 + }, + { + "epoch": 0.5787159190853123, + "grad_norm": 4.084654307778991, + "learning_rate": 8.099648300117232e-05, + "loss": 3.883906364440918, + "step": 987, + "token_acc": 0.2216784762400052 + }, + { + "epoch": 0.5793022574025213, + "grad_norm": 2.8510927600248688, + "learning_rate": 8.107854630715122e-05, + "loss": 3.84855318069458, + "step": 988, + "token_acc": 0.22796355722388453 + }, + { + "epoch": 0.5798885957197303, + "grad_norm": 4.9541197346937675, + "learning_rate": 8.116060961313012e-05, + "loss": 3.819871425628662, + "step": 989, + "token_acc": 0.22912656693312966 + }, + { + "epoch": 0.5804749340369393, + "grad_norm": 3.5064190020491766, + "learning_rate": 8.124267291910902e-05, + "loss": 3.8603312969207764, + "step": 990, + "token_acc": 0.22497547606767593 + }, + { + "epoch": 0.5810612723541484, + "grad_norm": 3.8543529276605493, + "learning_rate": 8.132473622508792e-05, + "loss": 3.7963509559631348, + "step": 991, + "token_acc": 0.231695692497939 + }, + { + "epoch": 0.5816476106713574, + "grad_norm": 3.7517608582258704, + "learning_rate": 8.140679953106681e-05, + "loss": 3.840233087539673, + "step": 992, + "token_acc": 0.22612567002614864 + }, + { + "epoch": 0.5822339489885664, + "grad_norm": 4.125403802049692, + "learning_rate": 8.148886283704571e-05, + "loss": 3.871232509613037, + "step": 993, + "token_acc": 0.22211566872113336 + }, + { + "epoch": 0.5828202873057754, + "grad_norm": 2.467875132986388, + "learning_rate": 8.157092614302461e-05, + "loss": 3.798952579498291, + "step": 994, + "token_acc": 0.23011739313272972 + }, + { + "epoch": 0.5834066256229845, + "grad_norm": 4.377110377657691, + "learning_rate": 8.165298944900351e-05, + "loss": 3.821575164794922, + "step": 995, + "token_acc": 0.22830792054299254 + }, + { + "epoch": 0.5839929639401935, + "grad_norm": 2.493508094623644, + "learning_rate": 8.17350527549824e-05, + "loss": 3.821772575378418, + "step": 996, + "token_acc": 0.22838963026013123 + }, + { + "epoch": 0.5845793022574025, + "grad_norm": 4.365756314753711, + "learning_rate": 8.18171160609613e-05, + "loss": 3.812507152557373, + "step": 997, + "token_acc": 0.22803536725578177 + }, + { + "epoch": 0.5851656405746115, + "grad_norm": 3.0145318160243026, + "learning_rate": 8.18991793669402e-05, + "loss": 3.8375725746154785, + "step": 998, + "token_acc": 0.22480492448014727 + }, + { + "epoch": 0.5857519788918206, + "grad_norm": 3.8988707731223124, + "learning_rate": 8.19812426729191e-05, + "loss": 3.805799722671509, + "step": 999, + "token_acc": 0.23037427871729924 + }, + { + "epoch": 0.5863383172090296, + "grad_norm": 3.0808158733267352, + "learning_rate": 8.2063305978898e-05, + "loss": 3.772102117538452, + "step": 1000, + "token_acc": 0.2322652993316163 + }, + { + "epoch": 0.5869246555262386, + "grad_norm": 4.052979513195679, + "learning_rate": 8.214536928487691e-05, + "loss": 3.7995927333831787, + "step": 1001, + "token_acc": 0.2302411922692085 + }, + { + "epoch": 0.5875109938434476, + "grad_norm": 3.3241052941074765, + "learning_rate": 8.222743259085581e-05, + "loss": 3.8492989540100098, + "step": 1002, + "token_acc": 0.22498169750244246 + }, + { + "epoch": 0.5880973321606567, + "grad_norm": 4.493207372726703, + "learning_rate": 8.23094958968347e-05, + "loss": 3.8294248580932617, + "step": 1003, + "token_acc": 0.22555666439618346 + }, + { + "epoch": 0.5886836704778657, + "grad_norm": 3.6574692454369067, + "learning_rate": 8.23915592028136e-05, + "loss": 3.78488826751709, + "step": 1004, + "token_acc": 0.23005784369621488 + }, + { + "epoch": 0.5892700087950747, + "grad_norm": 4.664427598528099, + "learning_rate": 8.247362250879249e-05, + "loss": 3.792283535003662, + "step": 1005, + "token_acc": 0.22856545591744334 + }, + { + "epoch": 0.5898563471122837, + "grad_norm": 3.6045116390437184, + "learning_rate": 8.255568581477139e-05, + "loss": 3.789374828338623, + "step": 1006, + "token_acc": 0.2289365496062212 + }, + { + "epoch": 0.5904426854294929, + "grad_norm": 3.3496260892697896, + "learning_rate": 8.263774912075028e-05, + "loss": 3.7920684814453125, + "step": 1007, + "token_acc": 0.2311466633401423 + }, + { + "epoch": 0.5910290237467019, + "grad_norm": 3.619648182489753, + "learning_rate": 8.271981242672918e-05, + "loss": 3.7120020389556885, + "step": 1008, + "token_acc": 0.2391520909357829 + }, + { + "epoch": 0.5916153620639109, + "grad_norm": 4.3879056276709925, + "learning_rate": 8.280187573270808e-05, + "loss": 3.740029811859131, + "step": 1009, + "token_acc": 0.2370463118774374 + }, + { + "epoch": 0.5922017003811199, + "grad_norm": 4.4660548087970655, + "learning_rate": 8.288393903868698e-05, + "loss": 3.750211715698242, + "step": 1010, + "token_acc": 0.23159138124867243 + }, + { + "epoch": 0.592788038698329, + "grad_norm": 1.907076142560956, + "learning_rate": 8.296600234466588e-05, + "loss": 3.7922568321228027, + "step": 1011, + "token_acc": 0.2287552534633944 + }, + { + "epoch": 0.593374377015538, + "grad_norm": 3.7097748134385293, + "learning_rate": 8.304806565064477e-05, + "loss": 3.7345829010009766, + "step": 1012, + "token_acc": 0.23655176571661116 + }, + { + "epoch": 0.593960715332747, + "grad_norm": 4.163467583377942, + "learning_rate": 8.313012895662367e-05, + "loss": 3.776966094970703, + "step": 1013, + "token_acc": 0.23140533798413615 + }, + { + "epoch": 0.594547053649956, + "grad_norm": 3.260173078907783, + "learning_rate": 8.321219226260257e-05, + "loss": 3.791015863418579, + "step": 1014, + "token_acc": 0.23041021288083502 + }, + { + "epoch": 0.5951333919671651, + "grad_norm": 3.514703885640435, + "learning_rate": 8.329425556858147e-05, + "loss": 3.7367687225341797, + "step": 1015, + "token_acc": 0.2350947352379351 + }, + { + "epoch": 0.5957197302843741, + "grad_norm": 3.4714243345061067, + "learning_rate": 8.337631887456037e-05, + "loss": 3.77022123336792, + "step": 1016, + "token_acc": 0.23148969582020457 + }, + { + "epoch": 0.5963060686015831, + "grad_norm": 2.877015534546585, + "learning_rate": 8.345838218053926e-05, + "loss": 3.7913310527801514, + "step": 1017, + "token_acc": 0.22898245172388473 + }, + { + "epoch": 0.5968924069187922, + "grad_norm": 2.5590966237896695, + "learning_rate": 8.354044548651817e-05, + "loss": 3.755291223526001, + "step": 1018, + "token_acc": 0.2351382315780457 + }, + { + "epoch": 0.5974787452360012, + "grad_norm": 3.9699392669798286, + "learning_rate": 8.362250879249707e-05, + "loss": 3.7486839294433594, + "step": 1019, + "token_acc": 0.23506730561853312 + }, + { + "epoch": 0.5980650835532102, + "grad_norm": 2.998953762342674, + "learning_rate": 8.370457209847597e-05, + "loss": 3.736001968383789, + "step": 1020, + "token_acc": 0.23651671096162474 + }, + { + "epoch": 0.5986514218704192, + "grad_norm": 4.237700727176671, + "learning_rate": 8.378663540445487e-05, + "loss": 3.7611141204833984, + "step": 1021, + "token_acc": 0.23102844855157065 + }, + { + "epoch": 0.5992377601876283, + "grad_norm": 3.5538029873037194, + "learning_rate": 8.386869871043375e-05, + "loss": 3.7258243560791016, + "step": 1022, + "token_acc": 0.23802262653530862 + }, + { + "epoch": 0.5998240985048373, + "grad_norm": 2.3206302334963826, + "learning_rate": 8.395076201641265e-05, + "loss": 3.7062408924102783, + "step": 1023, + "token_acc": 0.23824927140494132 + }, + { + "epoch": 0.6004104368220463, + "grad_norm": 4.5421253566708115, + "learning_rate": 8.403282532239155e-05, + "loss": 3.711066722869873, + "step": 1024, + "token_acc": 0.23627117253824387 + }, + { + "epoch": 0.6009967751392553, + "grad_norm": 2.803887573222275, + "learning_rate": 8.411488862837045e-05, + "loss": 3.706165313720703, + "step": 1025, + "token_acc": 0.23566379868385534 + }, + { + "epoch": 0.6015831134564644, + "grad_norm": 4.259402169832166, + "learning_rate": 8.419695193434935e-05, + "loss": 3.725330352783203, + "step": 1026, + "token_acc": 0.23529890468161507 + }, + { + "epoch": 0.6021694517736734, + "grad_norm": 2.7303569657284914, + "learning_rate": 8.427901524032824e-05, + "loss": 3.7342751026153564, + "step": 1027, + "token_acc": 0.23525420062992553 + }, + { + "epoch": 0.6027557900908824, + "grad_norm": 3.6093176522584356, + "learning_rate": 8.436107854630714e-05, + "loss": 3.72261381149292, + "step": 1028, + "token_acc": 0.23667605387035354 + }, + { + "epoch": 0.6033421284080914, + "grad_norm": 4.047757923738355, + "learning_rate": 8.444314185228604e-05, + "loss": 3.767302989959717, + "step": 1029, + "token_acc": 0.23157634761548143 + }, + { + "epoch": 0.6039284667253005, + "grad_norm": 2.3706404937674423, + "learning_rate": 8.452520515826494e-05, + "loss": 3.712876319885254, + "step": 1030, + "token_acc": 0.23685148705511302 + }, + { + "epoch": 0.6045148050425095, + "grad_norm": 5.332984072327762, + "learning_rate": 8.460726846424384e-05, + "loss": 3.7768425941467285, + "step": 1031, + "token_acc": 0.23090280051227924 + }, + { + "epoch": 0.6051011433597185, + "grad_norm": 2.998308173797864, + "learning_rate": 8.468933177022273e-05, + "loss": 3.7286295890808105, + "step": 1032, + "token_acc": 0.2367908193974169 + }, + { + "epoch": 0.6056874816769275, + "grad_norm": 3.6982111907045265, + "learning_rate": 8.477139507620163e-05, + "loss": 3.7446091175079346, + "step": 1033, + "token_acc": 0.23353397935003387 + }, + { + "epoch": 0.6062738199941367, + "grad_norm": 3.4246905874889504, + "learning_rate": 8.485345838218053e-05, + "loss": 3.7749390602111816, + "step": 1034, + "token_acc": 0.23203850390809422 + }, + { + "epoch": 0.6068601583113457, + "grad_norm": 3.168977790872297, + "learning_rate": 8.493552168815943e-05, + "loss": 3.6910877227783203, + "step": 1035, + "token_acc": 0.23968113780456454 + }, + { + "epoch": 0.6074464966285547, + "grad_norm": 3.19579919696364, + "learning_rate": 8.501758499413834e-05, + "loss": 3.7172834873199463, + "step": 1036, + "token_acc": 0.2358989617591376 + }, + { + "epoch": 0.6080328349457637, + "grad_norm": 2.9278837359029586, + "learning_rate": 8.509964830011724e-05, + "loss": 3.7255663871765137, + "step": 1037, + "token_acc": 0.23685890842445825 + }, + { + "epoch": 0.6086191732629728, + "grad_norm": 2.5079710963209836, + "learning_rate": 8.518171160609613e-05, + "loss": 3.6861560344696045, + "step": 1038, + "token_acc": 0.2407471372647533 + }, + { + "epoch": 0.6092055115801818, + "grad_norm": 3.4824479968247255, + "learning_rate": 8.526377491207503e-05, + "loss": 3.674631118774414, + "step": 1039, + "token_acc": 0.24154742835605927 + }, + { + "epoch": 0.6097918498973908, + "grad_norm": 2.168821207363241, + "learning_rate": 8.534583821805392e-05, + "loss": 3.6727075576782227, + "step": 1040, + "token_acc": 0.2419392059420181 + }, + { + "epoch": 0.6103781882145998, + "grad_norm": 4.029948416394446, + "learning_rate": 8.542790152403281e-05, + "loss": 3.7046327590942383, + "step": 1041, + "token_acc": 0.24004632424777983 + }, + { + "epoch": 0.6109645265318089, + "grad_norm": 3.0809701570578287, + "learning_rate": 8.550996483001171e-05, + "loss": 3.677265167236328, + "step": 1042, + "token_acc": 0.23810717867889383 + }, + { + "epoch": 0.6115508648490179, + "grad_norm": 4.350522018156594, + "learning_rate": 8.559202813599061e-05, + "loss": 3.7118568420410156, + "step": 1043, + "token_acc": 0.23546996590920866 + }, + { + "epoch": 0.6121372031662269, + "grad_norm": 2.1699610013671586, + "learning_rate": 8.567409144196951e-05, + "loss": 3.6330552101135254, + "step": 1044, + "token_acc": 0.2459863761070717 + }, + { + "epoch": 0.612723541483436, + "grad_norm": 3.9420042519746405, + "learning_rate": 8.575615474794841e-05, + "loss": 3.694444417953491, + "step": 1045, + "token_acc": 0.23935295915034402 + }, + { + "epoch": 0.613309879800645, + "grad_norm": 3.3507339284219313, + "learning_rate": 8.58382180539273e-05, + "loss": 3.69856595993042, + "step": 1046, + "token_acc": 0.2388438495083775 + }, + { + "epoch": 0.613896218117854, + "grad_norm": 3.3542631031758607, + "learning_rate": 8.59202813599062e-05, + "loss": 3.709279775619507, + "step": 1047, + "token_acc": 0.23611059995057807 + }, + { + "epoch": 0.614482556435063, + "grad_norm": 3.131451202344225, + "learning_rate": 8.60023446658851e-05, + "loss": 3.6492738723754883, + "step": 1048, + "token_acc": 0.24169178027171928 + }, + { + "epoch": 0.6150688947522721, + "grad_norm": 2.8347374075490013, + "learning_rate": 8.6084407971864e-05, + "loss": 3.6759862899780273, + "step": 1049, + "token_acc": 0.2393384545772403 + }, + { + "epoch": 0.6156552330694811, + "grad_norm": 2.6071825669225253, + "learning_rate": 8.61664712778429e-05, + "loss": 3.7384555339813232, + "step": 1050, + "token_acc": 0.23241476455475205 + }, + { + "epoch": 0.6162415713866901, + "grad_norm": 4.567171547211958, + "learning_rate": 8.62485345838218e-05, + "loss": 3.6436378955841064, + "step": 1051, + "token_acc": 0.2427529904259451 + }, + { + "epoch": 0.6168279097038991, + "grad_norm": 2.487937000712364, + "learning_rate": 8.633059788980069e-05, + "loss": 3.7416181564331055, + "step": 1052, + "token_acc": 0.23178649102429008 + }, + { + "epoch": 0.6174142480211082, + "grad_norm": 4.010611861245166, + "learning_rate": 8.64126611957796e-05, + "loss": 3.6738507747650146, + "step": 1053, + "token_acc": 0.23784842881652812 + }, + { + "epoch": 0.6180005863383172, + "grad_norm": 2.174947451428352, + "learning_rate": 8.64947245017585e-05, + "loss": 3.7118053436279297, + "step": 1054, + "token_acc": 0.23602486034292383 + }, + { + "epoch": 0.6185869246555262, + "grad_norm": 5.193925962315059, + "learning_rate": 8.65767878077374e-05, + "loss": 3.671924114227295, + "step": 1055, + "token_acc": 0.2392359741877525 + }, + { + "epoch": 0.6191732629727352, + "grad_norm": 2.8666862821282466, + "learning_rate": 8.66588511137163e-05, + "loss": 3.7156877517700195, + "step": 1056, + "token_acc": 0.23500763547814602 + }, + { + "epoch": 0.6197596012899443, + "grad_norm": 3.3663052715442903, + "learning_rate": 8.67409144196952e-05, + "loss": 3.6975743770599365, + "step": 1057, + "token_acc": 0.2385550200113011 + }, + { + "epoch": 0.6203459396071533, + "grad_norm": 3.189160920945845, + "learning_rate": 8.682297772567408e-05, + "loss": 3.6914165019989014, + "step": 1058, + "token_acc": 0.2393398475752544 + }, + { + "epoch": 0.6209322779243623, + "grad_norm": 4.869527161252376, + "learning_rate": 8.690504103165298e-05, + "loss": 3.6959986686706543, + "step": 1059, + "token_acc": 0.2385128211952255 + }, + { + "epoch": 0.6215186162415713, + "grad_norm": 2.4614450550082374, + "learning_rate": 8.698710433763188e-05, + "loss": 3.6999077796936035, + "step": 1060, + "token_acc": 0.23591083264844326 + }, + { + "epoch": 0.6221049545587805, + "grad_norm": 4.072375519133194, + "learning_rate": 8.706916764361077e-05, + "loss": 3.691514015197754, + "step": 1061, + "token_acc": 0.23823964664293198 + }, + { + "epoch": 0.6226912928759895, + "grad_norm": 3.625572014663007, + "learning_rate": 8.715123094958967e-05, + "loss": 3.700716257095337, + "step": 1062, + "token_acc": 0.2363547874296083 + }, + { + "epoch": 0.6232776311931985, + "grad_norm": 3.383984250362769, + "learning_rate": 8.723329425556857e-05, + "loss": 3.663546562194824, + "step": 1063, + "token_acc": 0.24092448517124596 + }, + { + "epoch": 0.6238639695104075, + "grad_norm": 4.082003104613677, + "learning_rate": 8.731535756154747e-05, + "loss": 3.667447090148926, + "step": 1064, + "token_acc": 0.24112965448722692 + }, + { + "epoch": 0.6244503078276166, + "grad_norm": 2.80398553725358, + "learning_rate": 8.739742086752637e-05, + "loss": 3.7277731895446777, + "step": 1065, + "token_acc": 0.2339683588444738 + }, + { + "epoch": 0.6250366461448256, + "grad_norm": 4.115443600583568, + "learning_rate": 8.747948417350526e-05, + "loss": 3.7254858016967773, + "step": 1066, + "token_acc": 0.23352642532377152 + }, + { + "epoch": 0.6256229844620346, + "grad_norm": 4.077202625103572, + "learning_rate": 8.756154747948416e-05, + "loss": 3.7083616256713867, + "step": 1067, + "token_acc": 0.23424778190723752 + }, + { + "epoch": 0.6262093227792436, + "grad_norm": 2.354941417764202, + "learning_rate": 8.764361078546306e-05, + "loss": 3.706695079803467, + "step": 1068, + "token_acc": 0.23444976076555024 + }, + { + "epoch": 0.6267956610964527, + "grad_norm": 3.93936008108172, + "learning_rate": 8.772567409144196e-05, + "loss": 3.596928834915161, + "step": 1069, + "token_acc": 0.24960757578491286 + }, + { + "epoch": 0.6273819994136617, + "grad_norm": 3.3987872552203573, + "learning_rate": 8.780773739742086e-05, + "loss": 3.699096441268921, + "step": 1070, + "token_acc": 0.2341964866623292 + }, + { + "epoch": 0.6279683377308707, + "grad_norm": 3.401667475513786, + "learning_rate": 8.788980070339977e-05, + "loss": 3.702594757080078, + "step": 1071, + "token_acc": 0.23767863710057546 + }, + { + "epoch": 0.6285546760480798, + "grad_norm": 2.4172271108047236, + "learning_rate": 8.797186400937867e-05, + "loss": 3.6958084106445312, + "step": 1072, + "token_acc": 0.2372658244104474 + }, + { + "epoch": 0.6291410143652888, + "grad_norm": 4.44721798079579, + "learning_rate": 8.805392731535756e-05, + "loss": 3.689333915710449, + "step": 1073, + "token_acc": 0.2372842050338763 + }, + { + "epoch": 0.6297273526824978, + "grad_norm": 2.557873645925693, + "learning_rate": 8.813599062133646e-05, + "loss": 3.706573009490967, + "step": 1074, + "token_acc": 0.2361011888406514 + }, + { + "epoch": 0.6303136909997068, + "grad_norm": 3.110943105471363, + "learning_rate": 8.821805392731536e-05, + "loss": 3.640707015991211, + "step": 1075, + "token_acc": 0.2426175805319398 + }, + { + "epoch": 0.6309000293169159, + "grad_norm": 2.7776847834192595, + "learning_rate": 8.830011723329424e-05, + "loss": 3.651592493057251, + "step": 1076, + "token_acc": 0.24182716049382716 + }, + { + "epoch": 0.6314863676341249, + "grad_norm": 3.2471767990688467, + "learning_rate": 8.838218053927314e-05, + "loss": 3.645282745361328, + "step": 1077, + "token_acc": 0.2406760135621621 + }, + { + "epoch": 0.6320727059513339, + "grad_norm": 2.153274338713322, + "learning_rate": 8.846424384525204e-05, + "loss": 3.669600486755371, + "step": 1078, + "token_acc": 0.2407547309852696 + }, + { + "epoch": 0.6326590442685429, + "grad_norm": 2.3869554506560204, + "learning_rate": 8.854630715123094e-05, + "loss": 3.6873092651367188, + "step": 1079, + "token_acc": 0.23399029643371635 + }, + { + "epoch": 0.633245382585752, + "grad_norm": 3.970641996102504, + "learning_rate": 8.862837045720984e-05, + "loss": 3.6670117378234863, + "step": 1080, + "token_acc": 0.23933887389433742 + }, + { + "epoch": 0.633831720902961, + "grad_norm": 2.564318014878147, + "learning_rate": 8.871043376318873e-05, + "loss": 3.6191978454589844, + "step": 1081, + "token_acc": 0.24724905611248535 + }, + { + "epoch": 0.63441805922017, + "grad_norm": 2.7879682733894557, + "learning_rate": 8.879249706916763e-05, + "loss": 3.645559310913086, + "step": 1082, + "token_acc": 0.24035654033661447 + }, + { + "epoch": 0.635004397537379, + "grad_norm": 2.9523661166247086, + "learning_rate": 8.887456037514653e-05, + "loss": 3.7270026206970215, + "step": 1083, + "token_acc": 0.23134017627606734 + }, + { + "epoch": 0.6355907358545881, + "grad_norm": 2.1803907468983414, + "learning_rate": 8.895662368112543e-05, + "loss": 3.6534385681152344, + "step": 1084, + "token_acc": 0.2413595957360481 + }, + { + "epoch": 0.6361770741717971, + "grad_norm": 3.888937263374872, + "learning_rate": 8.903868698710433e-05, + "loss": 3.686830997467041, + "step": 1085, + "token_acc": 0.2363456723286188 + }, + { + "epoch": 0.6367634124890061, + "grad_norm": 2.4595163704192484, + "learning_rate": 8.912075029308322e-05, + "loss": 3.6888790130615234, + "step": 1086, + "token_acc": 0.23675515204505942 + }, + { + "epoch": 0.6373497508062151, + "grad_norm": 3.422397678688959, + "learning_rate": 8.920281359906212e-05, + "loss": 3.66855525970459, + "step": 1087, + "token_acc": 0.23799390409926854 + }, + { + "epoch": 0.6379360891234243, + "grad_norm": 3.6720086876870677, + "learning_rate": 8.928487690504103e-05, + "loss": 3.646956443786621, + "step": 1088, + "token_acc": 0.24156942845656554 + }, + { + "epoch": 0.6385224274406333, + "grad_norm": 2.718551158889029, + "learning_rate": 8.936694021101993e-05, + "loss": 3.6580452919006348, + "step": 1089, + "token_acc": 0.24035392798690672 + }, + { + "epoch": 0.6391087657578423, + "grad_norm": 3.3331653852882344, + "learning_rate": 8.944900351699883e-05, + "loss": 3.6371679306030273, + "step": 1090, + "token_acc": 0.2429623213512343 + }, + { + "epoch": 0.6396951040750513, + "grad_norm": 3.066466128395858, + "learning_rate": 8.953106682297773e-05, + "loss": 3.6113815307617188, + "step": 1091, + "token_acc": 0.24449353758664338 + }, + { + "epoch": 0.6402814423922604, + "grad_norm": 2.8253532638045575, + "learning_rate": 8.961313012895663e-05, + "loss": 3.621619701385498, + "step": 1092, + "token_acc": 0.24361911414966894 + }, + { + "epoch": 0.6408677807094694, + "grad_norm": 2.9059126345825614, + "learning_rate": 8.969519343493551e-05, + "loss": 3.697326421737671, + "step": 1093, + "token_acc": 0.23558159300631373 + }, + { + "epoch": 0.6414541190266784, + "grad_norm": 3.1473309626173753, + "learning_rate": 8.977725674091441e-05, + "loss": 3.62583589553833, + "step": 1094, + "token_acc": 0.24409304778020194 + }, + { + "epoch": 0.6420404573438874, + "grad_norm": 3.8133130825219004, + "learning_rate": 8.98593200468933e-05, + "loss": 3.582160472869873, + "step": 1095, + "token_acc": 0.2478891140269089 + }, + { + "epoch": 0.6426267956610965, + "grad_norm": 4.464738856390558, + "learning_rate": 8.99413833528722e-05, + "loss": 3.617539882659912, + "step": 1096, + "token_acc": 0.24480830064532455 + }, + { + "epoch": 0.6432131339783055, + "grad_norm": 2.001651958570203, + "learning_rate": 9.00234466588511e-05, + "loss": 3.6067187786102295, + "step": 1097, + "token_acc": 0.2448461301077064 + }, + { + "epoch": 0.6437994722955145, + "grad_norm": 6.209760739375893, + "learning_rate": 9.010550996483e-05, + "loss": 3.6563830375671387, + "step": 1098, + "token_acc": 0.2384539066365522 + }, + { + "epoch": 0.6443858106127235, + "grad_norm": 3.657729168594278, + "learning_rate": 9.01875732708089e-05, + "loss": 3.690416097640991, + "step": 1099, + "token_acc": 0.2359796253713858 + }, + { + "epoch": 0.6449721489299326, + "grad_norm": 5.177879422812686, + "learning_rate": 9.02696365767878e-05, + "loss": 3.698824405670166, + "step": 1100, + "token_acc": 0.23293829803501798 + }, + { + "epoch": 0.6455584872471416, + "grad_norm": 2.837210865795638, + "learning_rate": 9.03516998827667e-05, + "loss": 3.690340995788574, + "step": 1101, + "token_acc": 0.23765873237674703 + }, + { + "epoch": 0.6461448255643506, + "grad_norm": 3.7143206955545582, + "learning_rate": 9.043376318874559e-05, + "loss": 3.6323437690734863, + "step": 1102, + "token_acc": 0.24088070982582976 + }, + { + "epoch": 0.6467311638815597, + "grad_norm": 3.5384857704439807, + "learning_rate": 9.051582649472449e-05, + "loss": 3.642756938934326, + "step": 1103, + "token_acc": 0.24209850968622398 + }, + { + "epoch": 0.6473175021987687, + "grad_norm": 2.7745112061890373, + "learning_rate": 9.059788980070339e-05, + "loss": 3.628681182861328, + "step": 1104, + "token_acc": 0.24424118976824286 + }, + { + "epoch": 0.6479038405159777, + "grad_norm": 3.2377103860365137, + "learning_rate": 9.06799531066823e-05, + "loss": 3.675654888153076, + "step": 1105, + "token_acc": 0.2371810456067046 + }, + { + "epoch": 0.6484901788331867, + "grad_norm": 2.912580618600177, + "learning_rate": 9.07620164126612e-05, + "loss": 3.6180553436279297, + "step": 1106, + "token_acc": 0.24411638399507998 + }, + { + "epoch": 0.6490765171503958, + "grad_norm": 2.6859285650232483, + "learning_rate": 9.08440797186401e-05, + "loss": 3.6335134506225586, + "step": 1107, + "token_acc": 0.2420051138109348 + }, + { + "epoch": 0.6496628554676048, + "grad_norm": 2.850486126272157, + "learning_rate": 9.0926143024619e-05, + "loss": 3.6664273738861084, + "step": 1108, + "token_acc": 0.23794996331992613 + }, + { + "epoch": 0.6502491937848138, + "grad_norm": 3.4743188561289826, + "learning_rate": 9.100820633059789e-05, + "loss": 3.5565309524536133, + "step": 1109, + "token_acc": 0.25057429285230354 + }, + { + "epoch": 0.6508355321020228, + "grad_norm": 2.6378148939634944, + "learning_rate": 9.109026963657679e-05, + "loss": 3.630723476409912, + "step": 1110, + "token_acc": 0.24331036587451296 + }, + { + "epoch": 0.6514218704192319, + "grad_norm": 3.9544118375263633, + "learning_rate": 9.117233294255567e-05, + "loss": 3.578401565551758, + "step": 1111, + "token_acc": 0.24591891714304487 + }, + { + "epoch": 0.6520082087364409, + "grad_norm": 2.3420689073134304, + "learning_rate": 9.125439624853457e-05, + "loss": 3.6231515407562256, + "step": 1112, + "token_acc": 0.24393922816012004 + }, + { + "epoch": 0.6525945470536499, + "grad_norm": 3.153669656250764, + "learning_rate": 9.133645955451347e-05, + "loss": 3.6007745265960693, + "step": 1113, + "token_acc": 0.2467528801991122 + }, + { + "epoch": 0.6531808853708589, + "grad_norm": 2.3708753999240626, + "learning_rate": 9.141852286049237e-05, + "loss": 3.6407856941223145, + "step": 1114, + "token_acc": 0.24141487229388625 + }, + { + "epoch": 0.653767223688068, + "grad_norm": 3.5101712701699626, + "learning_rate": 9.150058616647127e-05, + "loss": 3.63830828666687, + "step": 1115, + "token_acc": 0.24304251569816454 + }, + { + "epoch": 0.6543535620052771, + "grad_norm": 1.8514542885456984, + "learning_rate": 9.158264947245016e-05, + "loss": 3.625324249267578, + "step": 1116, + "token_acc": 0.24061530865721384 + }, + { + "epoch": 0.6549399003224861, + "grad_norm": 3.880143560761657, + "learning_rate": 9.166471277842906e-05, + "loss": 3.5791852474212646, + "step": 1117, + "token_acc": 0.24861210741352963 + }, + { + "epoch": 0.6555262386396951, + "grad_norm": 2.2666327395386414, + "learning_rate": 9.174677608440796e-05, + "loss": 3.6924595832824707, + "step": 1118, + "token_acc": 0.2344515594142895 + }, + { + "epoch": 0.6561125769569042, + "grad_norm": 4.079671924939505, + "learning_rate": 9.182883939038686e-05, + "loss": 3.633694648742676, + "step": 1119, + "token_acc": 0.24215553762617942 + }, + { + "epoch": 0.6566989152741132, + "grad_norm": 3.0725267431622543, + "learning_rate": 9.191090269636576e-05, + "loss": 3.635021924972534, + "step": 1120, + "token_acc": 0.24189268107183115 + }, + { + "epoch": 0.6572852535913222, + "grad_norm": 2.6338148154108776, + "learning_rate": 9.199296600234465e-05, + "loss": 3.640258312225342, + "step": 1121, + "token_acc": 0.24034322631710145 + }, + { + "epoch": 0.6578715919085312, + "grad_norm": 2.4462303048005842, + "learning_rate": 9.207502930832355e-05, + "loss": 3.671175718307495, + "step": 1122, + "token_acc": 0.23558122381079427 + }, + { + "epoch": 0.6584579302257403, + "grad_norm": 3.551281536640821, + "learning_rate": 9.215709261430246e-05, + "loss": 3.6547622680664062, + "step": 1123, + "token_acc": 0.23981787185155712 + }, + { + "epoch": 0.6590442685429493, + "grad_norm": 2.1500169923634846, + "learning_rate": 9.223915592028136e-05, + "loss": 3.6179089546203613, + "step": 1124, + "token_acc": 0.24364431801260097 + }, + { + "epoch": 0.6596306068601583, + "grad_norm": 2.911642161670254, + "learning_rate": 9.232121922626026e-05, + "loss": 3.5438332557678223, + "step": 1125, + "token_acc": 0.25195284723282996 + }, + { + "epoch": 0.6602169451773673, + "grad_norm": 2.915592239384677, + "learning_rate": 9.240328253223916e-05, + "loss": 3.6207621097564697, + "step": 1126, + "token_acc": 0.24005256078954773 + }, + { + "epoch": 0.6608032834945764, + "grad_norm": 2.662367597552409, + "learning_rate": 9.248534583821806e-05, + "loss": 3.612668752670288, + "step": 1127, + "token_acc": 0.24411036949244172 + }, + { + "epoch": 0.6613896218117854, + "grad_norm": 3.343971525513835, + "learning_rate": 9.256740914419695e-05, + "loss": 3.6547317504882812, + "step": 1128, + "token_acc": 0.23889939802239488 + }, + { + "epoch": 0.6619759601289944, + "grad_norm": 2.482227129861332, + "learning_rate": 9.264947245017584e-05, + "loss": 3.5920207500457764, + "step": 1129, + "token_acc": 0.2469671720250918 + }, + { + "epoch": 0.6625622984462035, + "grad_norm": 2.6387899521644766, + "learning_rate": 9.273153575615474e-05, + "loss": 3.5902719497680664, + "step": 1130, + "token_acc": 0.24591198136038445 + }, + { + "epoch": 0.6631486367634125, + "grad_norm": 2.1154082629265556, + "learning_rate": 9.281359906213363e-05, + "loss": 3.6242785453796387, + "step": 1131, + "token_acc": 0.23954727889839586 + }, + { + "epoch": 0.6637349750806215, + "grad_norm": 3.225756091203333, + "learning_rate": 9.289566236811253e-05, + "loss": 3.6327247619628906, + "step": 1132, + "token_acc": 0.24043628454452406 + }, + { + "epoch": 0.6643213133978305, + "grad_norm": 2.075184478783505, + "learning_rate": 9.297772567409143e-05, + "loss": 3.6500983238220215, + "step": 1133, + "token_acc": 0.23965786529897212 + }, + { + "epoch": 0.6649076517150396, + "grad_norm": 3.555137517946713, + "learning_rate": 9.305978898007033e-05, + "loss": 3.646602153778076, + "step": 1134, + "token_acc": 0.2397526466509015 + }, + { + "epoch": 0.6654939900322486, + "grad_norm": 2.2292875747747587, + "learning_rate": 9.314185228604923e-05, + "loss": 3.5895256996154785, + "step": 1135, + "token_acc": 0.2457210395750263 + }, + { + "epoch": 0.6660803283494576, + "grad_norm": 2.844636266034063, + "learning_rate": 9.322391559202812e-05, + "loss": 3.5618162155151367, + "step": 1136, + "token_acc": 0.25125059986080717 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 2.199168991504178, + "learning_rate": 9.330597889800702e-05, + "loss": 3.610119342803955, + "step": 1137, + "token_acc": 0.24221313259893504 + }, + { + "epoch": 0.6672530049838757, + "grad_norm": 3.1596247675696447, + "learning_rate": 9.338804220398592e-05, + "loss": 3.584787607192993, + "step": 1138, + "token_acc": 0.24636447265413924 + }, + { + "epoch": 0.6678393433010847, + "grad_norm": 3.7970734713321836, + "learning_rate": 9.347010550996482e-05, + "loss": 3.6107027530670166, + "step": 1139, + "token_acc": 0.24291383594160498 + }, + { + "epoch": 0.6684256816182937, + "grad_norm": 2.0825958033192054, + "learning_rate": 9.355216881594373e-05, + "loss": 3.598156690597534, + "step": 1140, + "token_acc": 0.24665940313834517 + }, + { + "epoch": 0.6690120199355027, + "grad_norm": 4.472428704872754, + "learning_rate": 9.363423212192263e-05, + "loss": 3.615342617034912, + "step": 1141, + "token_acc": 0.24178383796221145 + }, + { + "epoch": 0.6695983582527119, + "grad_norm": 2.202871165158378, + "learning_rate": 9.371629542790152e-05, + "loss": 3.5906100273132324, + "step": 1142, + "token_acc": 0.2459319891783013 + }, + { + "epoch": 0.6701846965699209, + "grad_norm": 4.179642944134784, + "learning_rate": 9.379835873388042e-05, + "loss": 3.5967178344726562, + "step": 1143, + "token_acc": 0.24476220145922725 + }, + { + "epoch": 0.6707710348871299, + "grad_norm": 2.7187991648401812, + "learning_rate": 9.388042203985932e-05, + "loss": 3.634884834289551, + "step": 1144, + "token_acc": 0.2402460088137016 + }, + { + "epoch": 0.6713573732043389, + "grad_norm": 3.5855757949469456, + "learning_rate": 9.396248534583822e-05, + "loss": 3.632540702819824, + "step": 1145, + "token_acc": 0.23987249039718053 + }, + { + "epoch": 0.671943711521548, + "grad_norm": 2.483847800891102, + "learning_rate": 9.404454865181712e-05, + "loss": 3.5583815574645996, + "step": 1146, + "token_acc": 0.2491700905568115 + }, + { + "epoch": 0.672530049838757, + "grad_norm": 2.595520530828991, + "learning_rate": 9.4126611957796e-05, + "loss": 3.5689821243286133, + "step": 1147, + "token_acc": 0.24784972327771354 + }, + { + "epoch": 0.673116388155966, + "grad_norm": 2.242352049434097, + "learning_rate": 9.42086752637749e-05, + "loss": 3.6121182441711426, + "step": 1148, + "token_acc": 0.24175938110619294 + }, + { + "epoch": 0.673702726473175, + "grad_norm": 2.653874761361489, + "learning_rate": 9.42907385697538e-05, + "loss": 3.594228982925415, + "step": 1149, + "token_acc": 0.24437320563911386 + }, + { + "epoch": 0.6742890647903841, + "grad_norm": 2.042469062284592, + "learning_rate": 9.43728018757327e-05, + "loss": 3.6022393703460693, + "step": 1150, + "token_acc": 0.2442453080843458 + }, + { + "epoch": 0.6748754031075931, + "grad_norm": 3.5747253738696525, + "learning_rate": 9.445486518171159e-05, + "loss": 3.5794405937194824, + "step": 1151, + "token_acc": 0.2472802502510639 + }, + { + "epoch": 0.6754617414248021, + "grad_norm": 2.7483289849776877, + "learning_rate": 9.453692848769049e-05, + "loss": 3.5764782428741455, + "step": 1152, + "token_acc": 0.24691186749977323 + }, + { + "epoch": 0.6760480797420111, + "grad_norm": 3.105169588977674, + "learning_rate": 9.461899179366939e-05, + "loss": 3.582529067993164, + "step": 1153, + "token_acc": 0.2441149865365197 + }, + { + "epoch": 0.6766344180592202, + "grad_norm": 2.1664284262066325, + "learning_rate": 9.470105509964829e-05, + "loss": 3.6183509826660156, + "step": 1154, + "token_acc": 0.24155947479989962 + }, + { + "epoch": 0.6772207563764292, + "grad_norm": 2.5547271519292507, + "learning_rate": 9.478311840562719e-05, + "loss": 3.5611495971679688, + "step": 1155, + "token_acc": 0.24853782659793192 + }, + { + "epoch": 0.6778070946936382, + "grad_norm": 2.7906522204595188, + "learning_rate": 9.486518171160608e-05, + "loss": 3.6035265922546387, + "step": 1156, + "token_acc": 0.24316116439111599 + }, + { + "epoch": 0.6783934330108473, + "grad_norm": 3.309156250841553, + "learning_rate": 9.494724501758498e-05, + "loss": 3.5210158824920654, + "step": 1157, + "token_acc": 0.25216078986013796 + }, + { + "epoch": 0.6789797713280563, + "grad_norm": 2.746292794685027, + "learning_rate": 9.502930832356389e-05, + "loss": 3.6111483573913574, + "step": 1158, + "token_acc": 0.24146687769403408 + }, + { + "epoch": 0.6795661096452653, + "grad_norm": 1.9844604159645325, + "learning_rate": 9.511137162954279e-05, + "loss": 3.569340944290161, + "step": 1159, + "token_acc": 0.24738920625534566 + }, + { + "epoch": 0.6801524479624743, + "grad_norm": 2.388014953048503, + "learning_rate": 9.519343493552169e-05, + "loss": 3.4791207313537598, + "step": 1160, + "token_acc": 0.2562385535545949 + }, + { + "epoch": 0.6807387862796834, + "grad_norm": 2.516515604662825, + "learning_rate": 9.527549824150059e-05, + "loss": 3.6110122203826904, + "step": 1161, + "token_acc": 0.24173699832236417 + }, + { + "epoch": 0.6813251245968924, + "grad_norm": 4.169876385631033, + "learning_rate": 9.535756154747948e-05, + "loss": 3.5419564247131348, + "step": 1162, + "token_acc": 0.250024398888438 + }, + { + "epoch": 0.6819114629141014, + "grad_norm": 2.2782455342120365, + "learning_rate": 9.543962485345838e-05, + "loss": 3.593749761581421, + "step": 1163, + "token_acc": 0.2437383265828629 + }, + { + "epoch": 0.6824978012313104, + "grad_norm": 3.1015729107772585, + "learning_rate": 9.552168815943727e-05, + "loss": 3.563033103942871, + "step": 1164, + "token_acc": 0.24950859883596782 + }, + { + "epoch": 0.6830841395485195, + "grad_norm": 3.009176898620129, + "learning_rate": 9.560375146541616e-05, + "loss": 3.5905706882476807, + "step": 1165, + "token_acc": 0.24329414523129306 + }, + { + "epoch": 0.6836704778657285, + "grad_norm": 4.217561168197373, + "learning_rate": 9.568581477139506e-05, + "loss": 3.584014415740967, + "step": 1166, + "token_acc": 0.24540054599970068 + }, + { + "epoch": 0.6842568161829375, + "grad_norm": 2.3586906505896343, + "learning_rate": 9.576787807737396e-05, + "loss": 3.6028177738189697, + "step": 1167, + "token_acc": 0.24410240685207032 + }, + { + "epoch": 0.6848431545001465, + "grad_norm": 2.9530525977202484, + "learning_rate": 9.584994138335286e-05, + "loss": 3.603588581085205, + "step": 1168, + "token_acc": 0.244469648096893 + }, + { + "epoch": 0.6854294928173557, + "grad_norm": 2.5844753375797853, + "learning_rate": 9.593200468933176e-05, + "loss": 3.5811610221862793, + "step": 1169, + "token_acc": 0.24398261644751504 + }, + { + "epoch": 0.6860158311345647, + "grad_norm": 1.6056034309786322, + "learning_rate": 9.601406799531065e-05, + "loss": 3.5396506786346436, + "step": 1170, + "token_acc": 0.2499470680885125 + }, + { + "epoch": 0.6866021694517737, + "grad_norm": 3.958588136251107, + "learning_rate": 9.609613130128955e-05, + "loss": 3.576620101928711, + "step": 1171, + "token_acc": 0.2452747775339676 + }, + { + "epoch": 0.6871885077689827, + "grad_norm": 2.023040570597517, + "learning_rate": 9.617819460726845e-05, + "loss": 3.5877130031585693, + "step": 1172, + "token_acc": 0.2438442288680178 + }, + { + "epoch": 0.6877748460861918, + "grad_norm": 3.7897590561604555, + "learning_rate": 9.626025791324735e-05, + "loss": 3.5919458866119385, + "step": 1173, + "token_acc": 0.242657824933687 + }, + { + "epoch": 0.6883611844034008, + "grad_norm": 3.462756709799863, + "learning_rate": 9.634232121922625e-05, + "loss": 3.609170436859131, + "step": 1174, + "token_acc": 0.24336571722595535 + }, + { + "epoch": 0.6889475227206098, + "grad_norm": 2.643834595176728, + "learning_rate": 9.642438452520516e-05, + "loss": 3.6074862480163574, + "step": 1175, + "token_acc": 0.24344418587962727 + }, + { + "epoch": 0.6895338610378188, + "grad_norm": 2.71792640940671, + "learning_rate": 9.650644783118406e-05, + "loss": 3.59952712059021, + "step": 1176, + "token_acc": 0.2432607021947541 + }, + { + "epoch": 0.6901201993550279, + "grad_norm": 3.2931582454606714, + "learning_rate": 9.658851113716295e-05, + "loss": 3.5765795707702637, + "step": 1177, + "token_acc": 0.24243539525224111 + }, + { + "epoch": 0.6907065376722369, + "grad_norm": 2.7547126705212945, + "learning_rate": 9.667057444314185e-05, + "loss": 3.6101436614990234, + "step": 1178, + "token_acc": 0.24072255509428783 + }, + { + "epoch": 0.6912928759894459, + "grad_norm": 2.0813176695133775, + "learning_rate": 9.675263774912075e-05, + "loss": 3.574465751647949, + "step": 1179, + "token_acc": 0.24785099153576673 + }, + { + "epoch": 0.6918792143066549, + "grad_norm": 3.9381562207803946, + "learning_rate": 9.683470105509965e-05, + "loss": 3.568744659423828, + "step": 1180, + "token_acc": 0.24604529245024231 + }, + { + "epoch": 0.692465552623864, + "grad_norm": 2.107174513486288, + "learning_rate": 9.691676436107855e-05, + "loss": 3.619206428527832, + "step": 1181, + "token_acc": 0.24190360623807056 + }, + { + "epoch": 0.693051890941073, + "grad_norm": 2.5158419415880204, + "learning_rate": 9.699882766705743e-05, + "loss": 3.510016441345215, + "step": 1182, + "token_acc": 0.2509512336330931 + }, + { + "epoch": 0.693638229258282, + "grad_norm": 2.223861925486757, + "learning_rate": 9.708089097303633e-05, + "loss": 3.5318374633789062, + "step": 1183, + "token_acc": 0.2515877221324717 + }, + { + "epoch": 0.6942245675754911, + "grad_norm": 2.7141995108551313, + "learning_rate": 9.716295427901523e-05, + "loss": 3.520838737487793, + "step": 1184, + "token_acc": 0.2508289437923046 + }, + { + "epoch": 0.6948109058927001, + "grad_norm": 2.6825105202463475, + "learning_rate": 9.724501758499412e-05, + "loss": 3.5538434982299805, + "step": 1185, + "token_acc": 0.24910541451988158 + }, + { + "epoch": 0.6953972442099091, + "grad_norm": 3.433787000203574, + "learning_rate": 9.732708089097302e-05, + "loss": 3.543549060821533, + "step": 1186, + "token_acc": 0.24789602971463331 + }, + { + "epoch": 0.6959835825271181, + "grad_norm": 2.4260343764528094, + "learning_rate": 9.740914419695192e-05, + "loss": 3.581066608428955, + "step": 1187, + "token_acc": 0.24430200878334418 + }, + { + "epoch": 0.6965699208443272, + "grad_norm": 2.4153502188031033, + "learning_rate": 9.749120750293082e-05, + "loss": 3.5370540618896484, + "step": 1188, + "token_acc": 0.24838737399121358 + }, + { + "epoch": 0.6971562591615362, + "grad_norm": 2.1093712765345507, + "learning_rate": 9.757327080890972e-05, + "loss": 3.5378026962280273, + "step": 1189, + "token_acc": 0.2484754570993364 + }, + { + "epoch": 0.6977425974787452, + "grad_norm": 2.5279875322057714, + "learning_rate": 9.765533411488861e-05, + "loss": 3.596057891845703, + "step": 1190, + "token_acc": 0.24359018913319522 + }, + { + "epoch": 0.6983289357959542, + "grad_norm": 2.6419982404441384, + "learning_rate": 9.773739742086751e-05, + "loss": 3.5669679641723633, + "step": 1191, + "token_acc": 0.24765664711568092 + }, + { + "epoch": 0.6989152741131633, + "grad_norm": 2.7353021714608814, + "learning_rate": 9.781946072684641e-05, + "loss": 3.5290098190307617, + "step": 1192, + "token_acc": 0.25008044809589414 + }, + { + "epoch": 0.6995016124303723, + "grad_norm": 3.274750141238885, + "learning_rate": 9.790152403282532e-05, + "loss": 3.564891815185547, + "step": 1193, + "token_acc": 0.2463891119012119 + }, + { + "epoch": 0.7000879507475813, + "grad_norm": 2.0040008550109776, + "learning_rate": 9.798358733880422e-05, + "loss": 3.515172243118286, + "step": 1194, + "token_acc": 0.2532676471607525 + }, + { + "epoch": 0.7006742890647903, + "grad_norm": 2.4645147825159803, + "learning_rate": 9.806565064478312e-05, + "loss": 3.580850124359131, + "step": 1195, + "token_acc": 0.24458582523602967 + }, + { + "epoch": 0.7012606273819995, + "grad_norm": 2.0921528252891073, + "learning_rate": 9.814771395076202e-05, + "loss": 3.540644884109497, + "step": 1196, + "token_acc": 0.24851149339741 + }, + { + "epoch": 0.7018469656992085, + "grad_norm": 4.074097228077286, + "learning_rate": 9.822977725674091e-05, + "loss": 3.541962146759033, + "step": 1197, + "token_acc": 0.2475354743317081 + }, + { + "epoch": 0.7024333040164175, + "grad_norm": 1.8856338237330241, + "learning_rate": 9.831184056271981e-05, + "loss": 3.609485626220703, + "step": 1198, + "token_acc": 0.24053244661613835 + }, + { + "epoch": 0.7030196423336265, + "grad_norm": 3.806516590019726, + "learning_rate": 9.839390386869871e-05, + "loss": 3.6008718013763428, + "step": 1199, + "token_acc": 0.2434892567240831 + }, + { + "epoch": 0.7036059806508356, + "grad_norm": 2.70317909706124, + "learning_rate": 9.84759671746776e-05, + "loss": 3.606748104095459, + "step": 1200, + "token_acc": 0.24390290750639343 + }, + { + "epoch": 0.7041923189680446, + "grad_norm": 2.4246448090661925, + "learning_rate": 9.855803048065649e-05, + "loss": 3.601593494415283, + "step": 1201, + "token_acc": 0.24274322773434961 + }, + { + "epoch": 0.7047786572852536, + "grad_norm": 2.052612131005801, + "learning_rate": 9.864009378663539e-05, + "loss": 3.578190326690674, + "step": 1202, + "token_acc": 0.24568551631069474 + }, + { + "epoch": 0.7053649956024626, + "grad_norm": 3.038200640478679, + "learning_rate": 9.872215709261429e-05, + "loss": 3.6017799377441406, + "step": 1203, + "token_acc": 0.24196362810809668 + }, + { + "epoch": 0.7059513339196717, + "grad_norm": 2.365673862777938, + "learning_rate": 9.880422039859319e-05, + "loss": 3.5267391204833984, + "step": 1204, + "token_acc": 0.2492665991304715 + }, + { + "epoch": 0.7065376722368807, + "grad_norm": 2.8864967649728133, + "learning_rate": 9.888628370457208e-05, + "loss": 3.5487489700317383, + "step": 1205, + "token_acc": 0.24962579640712307 + }, + { + "epoch": 0.7071240105540897, + "grad_norm": 2.481756996527984, + "learning_rate": 9.896834701055098e-05, + "loss": 3.571739673614502, + "step": 1206, + "token_acc": 0.24509623120077187 + }, + { + "epoch": 0.7077103488712987, + "grad_norm": 2.738989124914833, + "learning_rate": 9.905041031652988e-05, + "loss": 3.5909862518310547, + "step": 1207, + "token_acc": 0.2429637688360732 + }, + { + "epoch": 0.7082966871885078, + "grad_norm": 2.751069773899159, + "learning_rate": 9.913247362250878e-05, + "loss": 3.5597996711730957, + "step": 1208, + "token_acc": 0.24779391355807048 + }, + { + "epoch": 0.7088830255057168, + "grad_norm": 2.2558497451786232, + "learning_rate": 9.921453692848768e-05, + "loss": 3.5285754203796387, + "step": 1209, + "token_acc": 0.25005577131731854 + }, + { + "epoch": 0.7094693638229258, + "grad_norm": 1.6738495355304766, + "learning_rate": 9.929660023446659e-05, + "loss": 3.5502572059631348, + "step": 1210, + "token_acc": 0.2475436245892749 + }, + { + "epoch": 0.7100557021401348, + "grad_norm": 3.750214077876057, + "learning_rate": 9.937866354044549e-05, + "loss": 3.5298142433166504, + "step": 1211, + "token_acc": 0.25007025761124124 + }, + { + "epoch": 0.7106420404573439, + "grad_norm": 1.7417864911260688, + "learning_rate": 9.946072684642438e-05, + "loss": 3.5526413917541504, + "step": 1212, + "token_acc": 0.24815282722713525 + }, + { + "epoch": 0.7112283787745529, + "grad_norm": 3.0864405750735724, + "learning_rate": 9.954279015240328e-05, + "loss": 3.5368363857269287, + "step": 1213, + "token_acc": 0.24839768682345328 + }, + { + "epoch": 0.7118147170917619, + "grad_norm": 1.9335035028660956, + "learning_rate": 9.962485345838218e-05, + "loss": 3.5323128700256348, + "step": 1214, + "token_acc": 0.24854425712614744 + }, + { + "epoch": 0.712401055408971, + "grad_norm": 2.188916205146948, + "learning_rate": 9.970691676436108e-05, + "loss": 3.5288867950439453, + "step": 1215, + "token_acc": 0.25078088536243853 + }, + { + "epoch": 0.71298739372618, + "grad_norm": 1.6033803284467862, + "learning_rate": 9.978898007033998e-05, + "loss": 3.5163066387176514, + "step": 1216, + "token_acc": 0.25171774399877617 + }, + { + "epoch": 0.713573732043389, + "grad_norm": 2.6779715310208396, + "learning_rate": 9.987104337631887e-05, + "loss": 3.5640363693237305, + "step": 1217, + "token_acc": 0.24574018157603114 + }, + { + "epoch": 0.714160070360598, + "grad_norm": 3.5850998682601025, + "learning_rate": 9.995310668229776e-05, + "loss": 3.5692007541656494, + "step": 1218, + "token_acc": 0.24389504019414532 + }, + { + "epoch": 0.7147464086778071, + "grad_norm": 1.859973620386237, + "learning_rate": 0.00010003516998827666, + "loss": 3.522157669067383, + "step": 1219, + "token_acc": 0.250383317604168 + }, + { + "epoch": 0.7153327469950161, + "grad_norm": 3.590657150733063, + "learning_rate": 0.00010011723329425555, + "loss": 3.574082374572754, + "step": 1220, + "token_acc": 0.24495054157438034 + }, + { + "epoch": 0.7159190853122251, + "grad_norm": 2.3371002353368784, + "learning_rate": 0.00010019929660023445, + "loss": 3.554076671600342, + "step": 1221, + "token_acc": 0.2478048411878794 + }, + { + "epoch": 0.7165054236294341, + "grad_norm": 3.037711524732583, + "learning_rate": 0.00010028135990621335, + "loss": 3.5288844108581543, + "step": 1222, + "token_acc": 0.2503178489245674 + }, + { + "epoch": 0.7170917619466433, + "grad_norm": 2.8460803608924876, + "learning_rate": 0.00010036342321219225, + "loss": 3.5708634853363037, + "step": 1223, + "token_acc": 0.24549793853317206 + }, + { + "epoch": 0.7176781002638523, + "grad_norm": 2.2697022290538014, + "learning_rate": 0.00010044548651817115, + "loss": 3.488515853881836, + "step": 1224, + "token_acc": 0.25668942657576643 + }, + { + "epoch": 0.7182644385810613, + "grad_norm": 3.241206908521797, + "learning_rate": 0.00010052754982415004, + "loss": 3.5712051391601562, + "step": 1225, + "token_acc": 0.2463248381610347 + }, + { + "epoch": 0.7188507768982703, + "grad_norm": 2.104504358295846, + "learning_rate": 0.00010060961313012894, + "loss": 3.576154947280884, + "step": 1226, + "token_acc": 0.24413529456441935 + }, + { + "epoch": 0.7194371152154794, + "grad_norm": 2.314494942554422, + "learning_rate": 0.00010069167643610784, + "loss": 3.5392565727233887, + "step": 1227, + "token_acc": 0.2488117619189226 + }, + { + "epoch": 0.7200234535326884, + "grad_norm": 2.6589998889116955, + "learning_rate": 0.00010077373974208675, + "loss": 3.5736331939697266, + "step": 1228, + "token_acc": 0.24295954142088783 + }, + { + "epoch": 0.7206097918498974, + "grad_norm": 2.2610709655514136, + "learning_rate": 0.00010085580304806565, + "loss": 3.4810173511505127, + "step": 1229, + "token_acc": 0.2567681452246103 + }, + { + "epoch": 0.7211961301671064, + "grad_norm": 2.671917040299621, + "learning_rate": 0.00010093786635404455, + "loss": 3.540220260620117, + "step": 1230, + "token_acc": 0.24977325624471464 + }, + { + "epoch": 0.7217824684843155, + "grad_norm": 1.9517242286861856, + "learning_rate": 0.00010101992966002345, + "loss": 3.545020341873169, + "step": 1231, + "token_acc": 0.24789976231456437 + }, + { + "epoch": 0.7223688068015245, + "grad_norm": 2.845469277029176, + "learning_rate": 0.00010110199296600234, + "loss": 3.535029649734497, + "step": 1232, + "token_acc": 0.2502838728670108 + }, + { + "epoch": 0.7229551451187335, + "grad_norm": 2.592958351738877, + "learning_rate": 0.00010118405627198124, + "loss": 3.5664520263671875, + "step": 1233, + "token_acc": 0.24465025906735752 + }, + { + "epoch": 0.7235414834359425, + "grad_norm": 2.9008662370789438, + "learning_rate": 0.00010126611957796014, + "loss": 3.492316246032715, + "step": 1234, + "token_acc": 0.25446014527447575 + }, + { + "epoch": 0.7241278217531516, + "grad_norm": 1.8629647765058137, + "learning_rate": 0.00010134818288393904, + "loss": 3.496278762817383, + "step": 1235, + "token_acc": 0.25511106742677414 + }, + { + "epoch": 0.7247141600703606, + "grad_norm": 2.959362238274771, + "learning_rate": 0.00010143024618991792, + "loss": 3.5446105003356934, + "step": 1236, + "token_acc": 0.24830599630218944 + }, + { + "epoch": 0.7253004983875696, + "grad_norm": 2.527224324267823, + "learning_rate": 0.00010151230949589682, + "loss": 3.5205023288726807, + "step": 1237, + "token_acc": 0.25118066852075355 + }, + { + "epoch": 0.7258868367047786, + "grad_norm": 2.0121949201708347, + "learning_rate": 0.00010159437280187572, + "loss": 3.5169951915740967, + "step": 1238, + "token_acc": 0.24935938748678818 + }, + { + "epoch": 0.7264731750219877, + "grad_norm": 2.0104404347405613, + "learning_rate": 0.00010167643610785462, + "loss": 3.5548434257507324, + "step": 1239, + "token_acc": 0.24538883708051543 + }, + { + "epoch": 0.7270595133391967, + "grad_norm": 2.7040633366424096, + "learning_rate": 0.00010175849941383351, + "loss": 3.4885833263397217, + "step": 1240, + "token_acc": 0.2543689936249167 + }, + { + "epoch": 0.7276458516564057, + "grad_norm": 2.0061427786516686, + "learning_rate": 0.00010184056271981241, + "loss": 3.511047124862671, + "step": 1241, + "token_acc": 0.2537486138566125 + }, + { + "epoch": 0.7282321899736148, + "grad_norm": 2.536271553883254, + "learning_rate": 0.00010192262602579131, + "loss": 3.493194818496704, + "step": 1242, + "token_acc": 0.2547939780250363 + }, + { + "epoch": 0.7288185282908238, + "grad_norm": 1.8317891904897208, + "learning_rate": 0.00010200468933177021, + "loss": 3.5765206813812256, + "step": 1243, + "token_acc": 0.2457818989766784 + }, + { + "epoch": 0.7294048666080328, + "grad_norm": 2.38526486413872, + "learning_rate": 0.0001020867526377491, + "loss": 3.48268985748291, + "step": 1244, + "token_acc": 0.2563839294679975 + }, + { + "epoch": 0.7299912049252418, + "grad_norm": 2.5772036161189735, + "learning_rate": 0.00010216881594372802, + "loss": 3.5784783363342285, + "step": 1245, + "token_acc": 0.24209169017860563 + }, + { + "epoch": 0.7305775432424509, + "grad_norm": 2.6043678173199907, + "learning_rate": 0.00010225087924970692, + "loss": 3.5291495323181152, + "step": 1246, + "token_acc": 0.2499946937216114 + }, + { + "epoch": 0.73116388155966, + "grad_norm": 3.219606353142494, + "learning_rate": 0.00010233294255568581, + "loss": 3.5300350189208984, + "step": 1247, + "token_acc": 0.24908366150415862 + }, + { + "epoch": 0.731750219876869, + "grad_norm": 2.1215338027371056, + "learning_rate": 0.00010241500586166471, + "loss": 3.512089252471924, + "step": 1248, + "token_acc": 0.25182376873553713 + }, + { + "epoch": 0.732336558194078, + "grad_norm": 2.446167725847885, + "learning_rate": 0.00010249706916764361, + "loss": 3.5400424003601074, + "step": 1249, + "token_acc": 0.24788967674290488 + }, + { + "epoch": 0.7329228965112871, + "grad_norm": 2.809960705904554, + "learning_rate": 0.00010257913247362251, + "loss": 3.5229735374450684, + "step": 1250, + "token_acc": 0.25057285909356913 + }, + { + "epoch": 0.7335092348284961, + "grad_norm": 2.557896557668373, + "learning_rate": 0.0001026611957796014, + "loss": 3.512618064880371, + "step": 1251, + "token_acc": 0.2539301181279022 + }, + { + "epoch": 0.7340955731457051, + "grad_norm": 2.455850532467235, + "learning_rate": 0.0001027432590855803, + "loss": 3.4953341484069824, + "step": 1252, + "token_acc": 0.2543974456328771 + }, + { + "epoch": 0.7346819114629141, + "grad_norm": 2.979672897445438, + "learning_rate": 0.00010282532239155919, + "loss": 3.5181024074554443, + "step": 1253, + "token_acc": 0.25044222554294737 + }, + { + "epoch": 0.7352682497801232, + "grad_norm": 1.5263870848523355, + "learning_rate": 0.00010290738569753809, + "loss": 3.5820112228393555, + "step": 1254, + "token_acc": 0.24260758260436963 + }, + { + "epoch": 0.7358545880973322, + "grad_norm": 3.5919981618115844, + "learning_rate": 0.00010298944900351698, + "loss": 3.552605152130127, + "step": 1255, + "token_acc": 0.24636127520880016 + }, + { + "epoch": 0.7364409264145412, + "grad_norm": 1.6240463326800108, + "learning_rate": 0.00010307151230949588, + "loss": 3.510108470916748, + "step": 1256, + "token_acc": 0.2505355545849712 + }, + { + "epoch": 0.7370272647317502, + "grad_norm": 3.6007783652019163, + "learning_rate": 0.00010315357561547478, + "loss": 3.64194393157959, + "step": 1257, + "token_acc": 0.23846942527069082 + }, + { + "epoch": 0.7376136030489593, + "grad_norm": 2.282954206775683, + "learning_rate": 0.00010323563892145368, + "loss": 3.595787525177002, + "step": 1258, + "token_acc": 0.24153297223824038 + }, + { + "epoch": 0.7381999413661683, + "grad_norm": 2.5223354393119553, + "learning_rate": 0.00010331770222743258, + "loss": 3.5291693210601807, + "step": 1259, + "token_acc": 0.248165227976422 + }, + { + "epoch": 0.7387862796833773, + "grad_norm": 2.1372816804294863, + "learning_rate": 0.00010339976553341147, + "loss": 3.5437755584716797, + "step": 1260, + "token_acc": 0.2475048028426746 + }, + { + "epoch": 0.7393726180005863, + "grad_norm": 1.8932817110190208, + "learning_rate": 0.00010348182883939037, + "loss": 3.5661864280700684, + "step": 1261, + "token_acc": 0.24571944221632627 + }, + { + "epoch": 0.7399589563177954, + "grad_norm": 2.391256696665423, + "learning_rate": 0.00010356389214536927, + "loss": 3.504255771636963, + "step": 1262, + "token_acc": 0.25250944679662424 + }, + { + "epoch": 0.7405452946350044, + "grad_norm": 2.685601519172893, + "learning_rate": 0.00010364595545134818, + "loss": 3.5313851833343506, + "step": 1263, + "token_acc": 0.2481709251006596 + }, + { + "epoch": 0.7411316329522134, + "grad_norm": 2.2388382671648754, + "learning_rate": 0.00010372801875732708, + "loss": 3.486743450164795, + "step": 1264, + "token_acc": 0.25470351116615525 + }, + { + "epoch": 0.7417179712694224, + "grad_norm": 1.9576954399106532, + "learning_rate": 0.00010381008206330598, + "loss": 3.574458599090576, + "step": 1265, + "token_acc": 0.24379003402082675 + }, + { + "epoch": 0.7423043095866315, + "grad_norm": 2.6172956782483006, + "learning_rate": 0.00010389214536928487, + "loss": 3.4823784828186035, + "step": 1266, + "token_acc": 0.2542963539779342 + }, + { + "epoch": 0.7428906479038405, + "grad_norm": 2.192499468194125, + "learning_rate": 0.00010397420867526377, + "loss": 3.4702978134155273, + "step": 1267, + "token_acc": 0.256539926007245 + }, + { + "epoch": 0.7434769862210495, + "grad_norm": 1.9839557932065688, + "learning_rate": 0.00010405627198124267, + "loss": 3.4872279167175293, + "step": 1268, + "token_acc": 0.25513607184506226 + }, + { + "epoch": 0.7440633245382586, + "grad_norm": 1.7868150723998975, + "learning_rate": 0.00010413833528722157, + "loss": 3.53373646736145, + "step": 1269, + "token_acc": 0.24947466697518492 + }, + { + "epoch": 0.7446496628554676, + "grad_norm": 2.6476805529637137, + "learning_rate": 0.00010422039859320047, + "loss": 3.5297203063964844, + "step": 1270, + "token_acc": 0.24858181780590738 + }, + { + "epoch": 0.7452360011726766, + "grad_norm": 2.361338562750735, + "learning_rate": 0.00010430246189917935, + "loss": 3.4865264892578125, + "step": 1271, + "token_acc": 0.2543234734315481 + }, + { + "epoch": 0.7458223394898856, + "grad_norm": 2.265009490380844, + "learning_rate": 0.00010438452520515825, + "loss": 3.482454299926758, + "step": 1272, + "token_acc": 0.25472569876367696 + }, + { + "epoch": 0.7464086778070947, + "grad_norm": 1.6720529335710348, + "learning_rate": 0.00010446658851113715, + "loss": 3.4993371963500977, + "step": 1273, + "token_acc": 0.2512804262247779 + }, + { + "epoch": 0.7469950161243037, + "grad_norm": 1.972450313656642, + "learning_rate": 0.00010454865181711605, + "loss": 3.448000431060791, + "step": 1274, + "token_acc": 0.2572244917744068 + }, + { + "epoch": 0.7475813544415127, + "grad_norm": 2.3841494510760484, + "learning_rate": 0.00010463071512309494, + "loss": 3.489889144897461, + "step": 1275, + "token_acc": 0.2525078088629619 + }, + { + "epoch": 0.7481676927587217, + "grad_norm": 2.9279797394009432, + "learning_rate": 0.00010471277842907384, + "loss": 3.549703359603882, + "step": 1276, + "token_acc": 0.24451048547155743 + }, + { + "epoch": 0.7487540310759309, + "grad_norm": 2.019867216389879, + "learning_rate": 0.00010479484173505274, + "loss": 3.46688175201416, + "step": 1277, + "token_acc": 0.2562691099311121 + }, + { + "epoch": 0.7493403693931399, + "grad_norm": 2.730985763023011, + "learning_rate": 0.00010487690504103164, + "loss": 3.519805431365967, + "step": 1278, + "token_acc": 0.2495800415377097 + }, + { + "epoch": 0.7499267077103489, + "grad_norm": 2.082638619791961, + "learning_rate": 0.00010495896834701054, + "loss": 3.5153207778930664, + "step": 1279, + "token_acc": 0.249756050579269 + }, + { + "epoch": 0.7505130460275579, + "grad_norm": 2.8405793172197704, + "learning_rate": 0.00010504103165298945, + "loss": 3.492912769317627, + "step": 1280, + "token_acc": 0.25220687313662454 + }, + { + "epoch": 0.751099384344767, + "grad_norm": 1.730288990487118, + "learning_rate": 0.00010512309495896834, + "loss": 3.481194496154785, + "step": 1281, + "token_acc": 0.25366853717801674 + }, + { + "epoch": 0.751685722661976, + "grad_norm": 3.2119553012188846, + "learning_rate": 0.00010520515826494724, + "loss": 3.449909210205078, + "step": 1282, + "token_acc": 0.2580981878989935 + }, + { + "epoch": 0.752272060979185, + "grad_norm": 2.742712747612491, + "learning_rate": 0.00010528722157092614, + "loss": 3.5121009349823, + "step": 1283, + "token_acc": 0.2505381258869112 + }, + { + "epoch": 0.752858399296394, + "grad_norm": 1.4545011188575085, + "learning_rate": 0.00010536928487690504, + "loss": 3.5439231395721436, + "step": 1284, + "token_acc": 0.245371196083449 + }, + { + "epoch": 0.7534447376136031, + "grad_norm": 2.41673697456245, + "learning_rate": 0.00010545134818288394, + "loss": 3.5207419395446777, + "step": 1285, + "token_acc": 0.24917407613587292 + }, + { + "epoch": 0.7540310759308121, + "grad_norm": 1.9507894036337186, + "learning_rate": 0.00010553341148886283, + "loss": 3.4737367630004883, + "step": 1286, + "token_acc": 0.25532937930478916 + }, + { + "epoch": 0.7546174142480211, + "grad_norm": 2.9967319340906027, + "learning_rate": 0.00010561547479484173, + "loss": 3.564824104309082, + "step": 1287, + "token_acc": 0.24293510864775616 + }, + { + "epoch": 0.7552037525652301, + "grad_norm": 2.300008195888897, + "learning_rate": 0.00010569753810082063, + "loss": 3.5064244270324707, + "step": 1288, + "token_acc": 0.25193001798112635 + }, + { + "epoch": 0.7557900908824392, + "grad_norm": 2.3517556004978397, + "learning_rate": 0.00010577960140679951, + "loss": 3.590050220489502, + "step": 1289, + "token_acc": 0.24096708928711602 + }, + { + "epoch": 0.7563764291996482, + "grad_norm": 2.6320760163602963, + "learning_rate": 0.00010586166471277841, + "loss": 3.470578670501709, + "step": 1290, + "token_acc": 0.2527637593075482 + }, + { + "epoch": 0.7569627675168572, + "grad_norm": 2.0623161093793674, + "learning_rate": 0.00010594372801875731, + "loss": 3.515133857727051, + "step": 1291, + "token_acc": 0.25027538689158396 + }, + { + "epoch": 0.7575491058340662, + "grad_norm": 2.5105947765665224, + "learning_rate": 0.00010602579132473621, + "loss": 3.5093860626220703, + "step": 1292, + "token_acc": 0.2482950010882972 + }, + { + "epoch": 0.7581354441512753, + "grad_norm": 1.967381747787795, + "learning_rate": 0.00010610785463071511, + "loss": 3.512925624847412, + "step": 1293, + "token_acc": 0.25259228865269817 + }, + { + "epoch": 0.7587217824684843, + "grad_norm": 2.9659572022441885, + "learning_rate": 0.000106189917936694, + "loss": 3.5080997943878174, + "step": 1294, + "token_acc": 0.24911178235243958 + }, + { + "epoch": 0.7593081207856933, + "grad_norm": 1.6005001325087476, + "learning_rate": 0.0001062719812426729, + "loss": 3.5016462802886963, + "step": 1295, + "token_acc": 0.251606945532458 + }, + { + "epoch": 0.7598944591029023, + "grad_norm": 2.6865929254540344, + "learning_rate": 0.0001063540445486518, + "loss": 3.4618418216705322, + "step": 1296, + "token_acc": 0.2538365834173684 + }, + { + "epoch": 0.7604807974201114, + "grad_norm": 2.88349206228666, + "learning_rate": 0.00010643610785463071, + "loss": 3.511580467224121, + "step": 1297, + "token_acc": 0.2507736209649972 + }, + { + "epoch": 0.7610671357373204, + "grad_norm": 2.3547615757394693, + "learning_rate": 0.00010651817116060961, + "loss": 3.526371479034424, + "step": 1298, + "token_acc": 0.24630399750480844 + }, + { + "epoch": 0.7616534740545294, + "grad_norm": 2.0520291843334606, + "learning_rate": 0.00010660023446658851, + "loss": 3.4733638763427734, + "step": 1299, + "token_acc": 0.25465289876473635 + }, + { + "epoch": 0.7622398123717385, + "grad_norm": 2.245402735569946, + "learning_rate": 0.0001066822977725674, + "loss": 3.4583024978637695, + "step": 1300, + "token_acc": 0.25607082760856476 + }, + { + "epoch": 0.7628261506889475, + "grad_norm": 3.0051181804502227, + "learning_rate": 0.0001067643610785463, + "loss": 3.5066702365875244, + "step": 1301, + "token_acc": 0.25065920915185136 + }, + { + "epoch": 0.7634124890061565, + "grad_norm": 1.2258156928439565, + "learning_rate": 0.0001068464243845252, + "loss": 3.463474750518799, + "step": 1302, + "token_acc": 0.2557656844363171 + }, + { + "epoch": 0.7639988273233655, + "grad_norm": 5.055052158392281, + "learning_rate": 0.0001069284876905041, + "loss": 3.4673821926116943, + "step": 1303, + "token_acc": 0.2538427733245611 + }, + { + "epoch": 0.7645851656405747, + "grad_norm": 3.14728517943582, + "learning_rate": 0.000107010550996483, + "loss": 3.562363624572754, + "step": 1304, + "token_acc": 0.24510205244858516 + }, + { + "epoch": 0.7651715039577837, + "grad_norm": 3.8605077443426206, + "learning_rate": 0.0001070926143024619, + "loss": 3.5847511291503906, + "step": 1305, + "token_acc": 0.24290127884575974 + }, + { + "epoch": 0.7657578422749927, + "grad_norm": 2.6920499855816162, + "learning_rate": 0.0001071746776084408, + "loss": 3.5618789196014404, + "step": 1306, + "token_acc": 0.2441602246625102 + }, + { + "epoch": 0.7663441805922017, + "grad_norm": 2.662048176619308, + "learning_rate": 0.00010725674091441968, + "loss": 3.5402517318725586, + "step": 1307, + "token_acc": 0.24658553926790266 + }, + { + "epoch": 0.7669305189094108, + "grad_norm": 1.8288703937541881, + "learning_rate": 0.00010733880422039858, + "loss": 3.4651176929473877, + "step": 1308, + "token_acc": 0.25594564882115206 + }, + { + "epoch": 0.7675168572266198, + "grad_norm": 2.242627630469795, + "learning_rate": 0.00010742086752637747, + "loss": 3.4793124198913574, + "step": 1309, + "token_acc": 0.25246749472596547 + }, + { + "epoch": 0.7681031955438288, + "grad_norm": 2.4204574822065843, + "learning_rate": 0.00010750293083235637, + "loss": 3.460197925567627, + "step": 1310, + "token_acc": 0.25845337102998533 + }, + { + "epoch": 0.7686895338610378, + "grad_norm": 1.83129764013251, + "learning_rate": 0.00010758499413833527, + "loss": 3.5346317291259766, + "step": 1311, + "token_acc": 0.2473667714075756 + }, + { + "epoch": 0.7692758721782469, + "grad_norm": 2.851110336501319, + "learning_rate": 0.00010766705744431417, + "loss": 3.516423225402832, + "step": 1312, + "token_acc": 0.2480323152873366 + }, + { + "epoch": 0.7698622104954559, + "grad_norm": 2.2898225486202115, + "learning_rate": 0.00010774912075029307, + "loss": 3.529078245162964, + "step": 1313, + "token_acc": 0.2473062822840706 + }, + { + "epoch": 0.7704485488126649, + "grad_norm": 3.1516583731822956, + "learning_rate": 0.00010783118405627196, + "loss": 3.5145180225372314, + "step": 1314, + "token_acc": 0.25032044722584257 + }, + { + "epoch": 0.7710348871298739, + "grad_norm": 1.6654665695307302, + "learning_rate": 0.00010791324736225088, + "loss": 3.485283851623535, + "step": 1315, + "token_acc": 0.25288256284238286 + }, + { + "epoch": 0.771621225447083, + "grad_norm": 3.0481150432100215, + "learning_rate": 0.00010799531066822977, + "loss": 3.5010576248168945, + "step": 1316, + "token_acc": 0.25212017007830106 + }, + { + "epoch": 0.772207563764292, + "grad_norm": 1.5440237008993152, + "learning_rate": 0.00010807737397420867, + "loss": 3.4440574645996094, + "step": 1317, + "token_acc": 0.25964460090945285 + }, + { + "epoch": 0.772793902081501, + "grad_norm": 3.1774850389882334, + "learning_rate": 0.00010815943728018757, + "loss": 3.516622304916382, + "step": 1318, + "token_acc": 0.24847255943011126 + }, + { + "epoch": 0.77338024039871, + "grad_norm": 2.0792547371686556, + "learning_rate": 0.00010824150058616647, + "loss": 3.536007881164551, + "step": 1319, + "token_acc": 0.2448501407950348 + }, + { + "epoch": 0.7739665787159191, + "grad_norm": 2.0371222955932624, + "learning_rate": 0.00010832356389214537, + "loss": 3.487272262573242, + "step": 1320, + "token_acc": 0.2544468735887014 + }, + { + "epoch": 0.7745529170331281, + "grad_norm": 2.546753999628975, + "learning_rate": 0.00010840562719812426, + "loss": 3.4719176292419434, + "step": 1321, + "token_acc": 0.25392249930669625 + }, + { + "epoch": 0.7751392553503371, + "grad_norm": 1.5637097394076207, + "learning_rate": 0.00010848769050410316, + "loss": 3.514017105102539, + "step": 1322, + "token_acc": 0.25017965163670686 + }, + { + "epoch": 0.7757255936675461, + "grad_norm": 3.269707113769341, + "learning_rate": 0.00010856975381008206, + "loss": 3.516347885131836, + "step": 1323, + "token_acc": 0.2499133973001952 + }, + { + "epoch": 0.7763119319847552, + "grad_norm": 1.9912346894159882, + "learning_rate": 0.00010865181711606094, + "loss": 3.504991054534912, + "step": 1324, + "token_acc": 0.2508131817151413 + }, + { + "epoch": 0.7768982703019642, + "grad_norm": 2.879831577433987, + "learning_rate": 0.00010873388042203984, + "loss": 3.4906930923461914, + "step": 1325, + "token_acc": 0.2510815590951514 + }, + { + "epoch": 0.7774846086191732, + "grad_norm": 2.212997090047063, + "learning_rate": 0.00010881594372801874, + "loss": 3.5146164894104004, + "step": 1326, + "token_acc": 0.25096581792162925 + }, + { + "epoch": 0.7780709469363823, + "grad_norm": 3.2422513766347083, + "learning_rate": 0.00010889800703399764, + "loss": 3.5304179191589355, + "step": 1327, + "token_acc": 0.24632720651299037 + }, + { + "epoch": 0.7786572852535913, + "grad_norm": 1.5576904445351991, + "learning_rate": 0.00010898007033997654, + "loss": 3.4751577377319336, + "step": 1328, + "token_acc": 0.25465443905260954 + }, + { + "epoch": 0.7792436235708003, + "grad_norm": 3.757390382238241, + "learning_rate": 0.00010906213364595543, + "loss": 3.4894869327545166, + "step": 1329, + "token_acc": 0.2518094614367059 + }, + { + "epoch": 0.7798299618880093, + "grad_norm": 2.5143653018251144, + "learning_rate": 0.00010914419695193433, + "loss": 3.5633764266967773, + "step": 1330, + "token_acc": 0.24517024313638253 + }, + { + "epoch": 0.7804163002052185, + "grad_norm": 3.4313682505203853, + "learning_rate": 0.00010922626025791323, + "loss": 3.4894797801971436, + "step": 1331, + "token_acc": 0.2524260865448846 + }, + { + "epoch": 0.7810026385224275, + "grad_norm": 2.8838704178580414, + "learning_rate": 0.00010930832356389214, + "loss": 3.5439133644104004, + "step": 1332, + "token_acc": 0.24663373669879737 + }, + { + "epoch": 0.7815889768396365, + "grad_norm": 2.168133642666341, + "learning_rate": 0.00010939038686987104, + "loss": 3.531949996948242, + "step": 1333, + "token_acc": 0.24533899388762437 + }, + { + "epoch": 0.7821753151568455, + "grad_norm": 2.1749043186467176, + "learning_rate": 0.00010947245017584994, + "loss": 3.56597638130188, + "step": 1334, + "token_acc": 0.2438011519423874 + }, + { + "epoch": 0.7827616534740546, + "grad_norm": 1.7071748806849016, + "learning_rate": 0.00010955451348182884, + "loss": 3.503934860229492, + "step": 1335, + "token_acc": 0.25062254652816973 + }, + { + "epoch": 0.7833479917912636, + "grad_norm": 3.3967387622971317, + "learning_rate": 0.00010963657678780773, + "loss": 3.473453998565674, + "step": 1336, + "token_acc": 0.25408249577072345 + }, + { + "epoch": 0.7839343301084726, + "grad_norm": 1.7258696464588632, + "learning_rate": 0.00010971864009378663, + "loss": 3.460827112197876, + "step": 1337, + "token_acc": 0.2563349481631643 + }, + { + "epoch": 0.7845206684256816, + "grad_norm": 2.045523738336018, + "learning_rate": 0.00010980070339976553, + "loss": 3.514976978302002, + "step": 1338, + "token_acc": 0.2498473075677025 + }, + { + "epoch": 0.7851070067428907, + "grad_norm": 2.065586944123851, + "learning_rate": 0.00010988276670574443, + "loss": 3.51725435256958, + "step": 1339, + "token_acc": 0.2490453479740014 + }, + { + "epoch": 0.7856933450600997, + "grad_norm": 2.0088446653241667, + "learning_rate": 0.00010996483001172333, + "loss": 3.5098557472229004, + "step": 1340, + "token_acc": 0.25026360665981073 + }, + { + "epoch": 0.7862796833773087, + "grad_norm": 2.529519584850874, + "learning_rate": 0.00011004689331770222, + "loss": 3.5220985412597656, + "step": 1341, + "token_acc": 0.24702599696920294 + }, + { + "epoch": 0.7868660216945177, + "grad_norm": 1.5336792351690491, + "learning_rate": 0.00011012895662368111, + "loss": 3.5066158771514893, + "step": 1342, + "token_acc": 0.25002165925155817 + }, + { + "epoch": 0.7874523600117268, + "grad_norm": 2.291017187126893, + "learning_rate": 0.00011021101992966, + "loss": 3.4693357944488525, + "step": 1343, + "token_acc": 0.2536555165551124 + }, + { + "epoch": 0.7880386983289358, + "grad_norm": 1.6694639154637716, + "learning_rate": 0.0001102930832356389, + "loss": 3.5315287113189697, + "step": 1344, + "token_acc": 0.24749664619352696 + }, + { + "epoch": 0.7886250366461448, + "grad_norm": 2.749098491784195, + "learning_rate": 0.0001103751465416178, + "loss": 3.4832682609558105, + "step": 1345, + "token_acc": 0.25314299747292707 + }, + { + "epoch": 0.7892113749633538, + "grad_norm": 1.476909349305623, + "learning_rate": 0.0001104572098475967, + "loss": 3.4796810150146484, + "step": 1346, + "token_acc": 0.25257131566084634 + }, + { + "epoch": 0.7897977132805629, + "grad_norm": 2.1944216006067476, + "learning_rate": 0.0001105392731535756, + "loss": 3.4824490547180176, + "step": 1347, + "token_acc": 0.2553413363755422 + }, + { + "epoch": 0.7903840515977719, + "grad_norm": 2.3354412549144605, + "learning_rate": 0.0001106213364595545, + "loss": 3.5235204696655273, + "step": 1348, + "token_acc": 0.24800021316848303 + }, + { + "epoch": 0.7909703899149809, + "grad_norm": 2.4777474562265667, + "learning_rate": 0.0001107033997655334, + "loss": 3.512127161026001, + "step": 1349, + "token_acc": 0.24980902650607453 + }, + { + "epoch": 0.7915567282321899, + "grad_norm": 1.649031003375605, + "learning_rate": 0.0001107854630715123, + "loss": 3.447403907775879, + "step": 1350, + "token_acc": 0.2584091274638497 + }, + { + "epoch": 0.792143066549399, + "grad_norm": 1.9134891836115946, + "learning_rate": 0.0001108675263774912, + "loss": 3.5353763103485107, + "step": 1351, + "token_acc": 0.24636798126226497 + }, + { + "epoch": 0.792729404866608, + "grad_norm": 2.08475671079445, + "learning_rate": 0.0001109495896834701, + "loss": 3.4992029666900635, + "step": 1352, + "token_acc": 0.252505169595067 + }, + { + "epoch": 0.793315743183817, + "grad_norm": 1.610110042459501, + "learning_rate": 0.000111031652989449, + "loss": 3.5161097049713135, + "step": 1353, + "token_acc": 0.24626238304541184 + }, + { + "epoch": 0.7939020815010261, + "grad_norm": 1.6731768057426692, + "learning_rate": 0.0001111137162954279, + "loss": 3.547534227371216, + "step": 1354, + "token_acc": 0.2443312503001809 + }, + { + "epoch": 0.7944884198182351, + "grad_norm": 1.7656188632511818, + "learning_rate": 0.0001111957796014068, + "loss": 3.4768528938293457, + "step": 1355, + "token_acc": 0.2538136648540949 + }, + { + "epoch": 0.7950747581354441, + "grad_norm": 2.124341571882291, + "learning_rate": 0.0001112778429073857, + "loss": 3.4576823711395264, + "step": 1356, + "token_acc": 0.25689171758054175 + }, + { + "epoch": 0.7956610964526531, + "grad_norm": 2.149388011481822, + "learning_rate": 0.00011135990621336459, + "loss": 3.389029026031494, + "step": 1357, + "token_acc": 0.26545436315807525 + }, + { + "epoch": 0.7962474347698623, + "grad_norm": 1.485384161533022, + "learning_rate": 0.00011144196951934349, + "loss": 3.54351806640625, + "step": 1358, + "token_acc": 0.2442869471977404 + }, + { + "epoch": 0.7968337730870713, + "grad_norm": 2.688752375578216, + "learning_rate": 0.00011152403282532239, + "loss": 3.477694034576416, + "step": 1359, + "token_acc": 0.2528829115941105 + }, + { + "epoch": 0.7974201114042803, + "grad_norm": 1.8182793551601826, + "learning_rate": 0.00011160609613130127, + "loss": 3.479078769683838, + "step": 1360, + "token_acc": 0.25203874311201074 + }, + { + "epoch": 0.7980064497214893, + "grad_norm": 2.5228575256990453, + "learning_rate": 0.00011168815943728017, + "loss": 3.463512897491455, + "step": 1361, + "token_acc": 0.2555741655901898 + }, + { + "epoch": 0.7985927880386984, + "grad_norm": 1.4337638118674019, + "learning_rate": 0.00011177022274325907, + "loss": 3.407137632369995, + "step": 1362, + "token_acc": 0.2643104351813438 + }, + { + "epoch": 0.7991791263559074, + "grad_norm": 2.924363927645141, + "learning_rate": 0.00011185228604923797, + "loss": 3.5044167041778564, + "step": 1363, + "token_acc": 0.2503583506483293 + }, + { + "epoch": 0.7997654646731164, + "grad_norm": 2.154290283771355, + "learning_rate": 0.00011193434935521686, + "loss": 3.461982250213623, + "step": 1364, + "token_acc": 0.25475857299540217 + }, + { + "epoch": 0.8003518029903254, + "grad_norm": 1.910232496974231, + "learning_rate": 0.00011201641266119576, + "loss": 3.489426374435425, + "step": 1365, + "token_acc": 0.25238179678558187 + }, + { + "epoch": 0.8009381413075345, + "grad_norm": 1.9702064608666856, + "learning_rate": 0.00011209847596717466, + "loss": 3.4879424571990967, + "step": 1366, + "token_acc": 0.25162392813080314 + }, + { + "epoch": 0.8015244796247435, + "grad_norm": 1.8080870784131426, + "learning_rate": 0.00011218053927315357, + "loss": 3.480198860168457, + "step": 1367, + "token_acc": 0.25261769774978554 + }, + { + "epoch": 0.8021108179419525, + "grad_norm": 1.5444889050467996, + "learning_rate": 0.00011226260257913247, + "loss": 3.478196620941162, + "step": 1368, + "token_acc": 0.252042683815058 + }, + { + "epoch": 0.8026971562591615, + "grad_norm": 2.1629539813191063, + "learning_rate": 0.00011234466588511137, + "loss": 3.440249443054199, + "step": 1369, + "token_acc": 0.2579075062149088 + }, + { + "epoch": 0.8032834945763706, + "grad_norm": 1.9236920704974576, + "learning_rate": 0.00011242672919109027, + "loss": 3.450601100921631, + "step": 1370, + "token_acc": 0.2549119170984456 + }, + { + "epoch": 0.8038698328935796, + "grad_norm": 2.2304314135180916, + "learning_rate": 0.00011250879249706916, + "loss": 3.479851722717285, + "step": 1371, + "token_acc": 0.25291415633254827 + }, + { + "epoch": 0.8044561712107886, + "grad_norm": 1.8960556850224077, + "learning_rate": 0.00011259085580304806, + "loss": 3.482255697250366, + "step": 1372, + "token_acc": 0.25490154063725384 + }, + { + "epoch": 0.8050425095279976, + "grad_norm": 1.2038320410597012, + "learning_rate": 0.00011267291910902696, + "loss": 3.466371536254883, + "step": 1373, + "token_acc": 0.25450082472776164 + }, + { + "epoch": 0.8056288478452067, + "grad_norm": 2.020094947377214, + "learning_rate": 0.00011275498241500586, + "loss": 3.4769277572631836, + "step": 1374, + "token_acc": 0.25208877681229874 + }, + { + "epoch": 0.8062151861624157, + "grad_norm": 1.8740998109647056, + "learning_rate": 0.00011283704572098476, + "loss": 3.4642021656036377, + "step": 1375, + "token_acc": 0.2553745858197606 + }, + { + "epoch": 0.8068015244796247, + "grad_norm": 2.0622125512504192, + "learning_rate": 0.00011291910902696365, + "loss": 3.454598903656006, + "step": 1376, + "token_acc": 0.2547711134886444 + }, + { + "epoch": 0.8073878627968337, + "grad_norm": 2.2548132477222937, + "learning_rate": 0.00011300117233294255, + "loss": 3.486370325088501, + "step": 1377, + "token_acc": 0.2518087686810464 + }, + { + "epoch": 0.8079742011140428, + "grad_norm": 1.613118107044528, + "learning_rate": 0.00011308323563892144, + "loss": 3.4848580360412598, + "step": 1378, + "token_acc": 0.251870957049559 + }, + { + "epoch": 0.8085605394312518, + "grad_norm": 1.9422231512314676, + "learning_rate": 0.00011316529894490033, + "loss": 3.45316481590271, + "step": 1379, + "token_acc": 0.25464443122106684 + }, + { + "epoch": 0.8091468777484608, + "grad_norm": 1.6653469606209894, + "learning_rate": 0.00011324736225087923, + "loss": 3.4937076568603516, + "step": 1380, + "token_acc": 0.25082990679999695 + }, + { + "epoch": 0.8097332160656698, + "grad_norm": 2.124939943643208, + "learning_rate": 0.00011332942555685813, + "loss": 3.477816581726074, + "step": 1381, + "token_acc": 0.2506954641812961 + }, + { + "epoch": 0.810319554382879, + "grad_norm": 1.6321763000035545, + "learning_rate": 0.00011341148886283703, + "loss": 3.4749388694763184, + "step": 1382, + "token_acc": 0.25319382531514967 + }, + { + "epoch": 0.810905892700088, + "grad_norm": 2.119510104123133, + "learning_rate": 0.00011349355216881593, + "loss": 3.4837558269500732, + "step": 1383, + "token_acc": 0.24929537149974343 + }, + { + "epoch": 0.811492231017297, + "grad_norm": 2.068022797366922, + "learning_rate": 0.00011357561547479482, + "loss": 3.4494569301605225, + "step": 1384, + "token_acc": 0.2564743190620416 + }, + { + "epoch": 0.8120785693345061, + "grad_norm": 1.6422919960846851, + "learning_rate": 0.00011365767878077373, + "loss": 3.4373133182525635, + "step": 1385, + "token_acc": 0.25709290748449715 + }, + { + "epoch": 0.8126649076517151, + "grad_norm": 2.2372045104091556, + "learning_rate": 0.00011373974208675263, + "loss": 3.445688486099243, + "step": 1386, + "token_acc": 0.25671052800164 + }, + { + "epoch": 0.8132512459689241, + "grad_norm": 1.2201084013861518, + "learning_rate": 0.00011382180539273153, + "loss": 3.430572509765625, + "step": 1387, + "token_acc": 0.2585926991047822 + }, + { + "epoch": 0.8138375842861331, + "grad_norm": 2.324996247286695, + "learning_rate": 0.00011390386869871043, + "loss": 3.5182652473449707, + "step": 1388, + "token_acc": 0.24654964844480623 + }, + { + "epoch": 0.8144239226033422, + "grad_norm": 2.675608736844797, + "learning_rate": 0.00011398593200468933, + "loss": 3.4582936763763428, + "step": 1389, + "token_acc": 0.25326306483248584 + }, + { + "epoch": 0.8150102609205512, + "grad_norm": 1.869600242399582, + "learning_rate": 0.00011406799531066822, + "loss": 3.442856788635254, + "step": 1390, + "token_acc": 0.25580843940124287 + }, + { + "epoch": 0.8155965992377602, + "grad_norm": 1.7032887151046605, + "learning_rate": 0.00011415005861664712, + "loss": 3.4262657165527344, + "step": 1391, + "token_acc": 0.2573925909123715 + }, + { + "epoch": 0.8161829375549692, + "grad_norm": 2.9015546454073955, + "learning_rate": 0.00011423212192262602, + "loss": 3.483952045440674, + "step": 1392, + "token_acc": 0.2538033246745454 + }, + { + "epoch": 0.8167692758721783, + "grad_norm": 2.1028304392623722, + "learning_rate": 0.00011431418522860492, + "loss": 3.4491629600524902, + "step": 1393, + "token_acc": 0.25597029030297147 + }, + { + "epoch": 0.8173556141893873, + "grad_norm": 1.8078791600454742, + "learning_rate": 0.00011439624853458382, + "loss": 3.4457709789276123, + "step": 1394, + "token_acc": 0.25515805362930527 + }, + { + "epoch": 0.8179419525065963, + "grad_norm": 3.169047467189759, + "learning_rate": 0.00011447831184056271, + "loss": 3.4571824073791504, + "step": 1395, + "token_acc": 0.25467445547946094 + }, + { + "epoch": 0.8185282908238053, + "grad_norm": 1.5951260690361297, + "learning_rate": 0.0001145603751465416, + "loss": 3.4346394538879395, + "step": 1396, + "token_acc": 0.25597249799332517 + }, + { + "epoch": 0.8191146291410144, + "grad_norm": 4.385116591485995, + "learning_rate": 0.0001146424384525205, + "loss": 3.5010201930999756, + "step": 1397, + "token_acc": 0.25109525731530313 + }, + { + "epoch": 0.8197009674582234, + "grad_norm": 3.343843702761358, + "learning_rate": 0.0001147245017584994, + "loss": 3.554399251937866, + "step": 1398, + "token_acc": 0.24304291691943938 + }, + { + "epoch": 0.8202873057754324, + "grad_norm": 2.404543363456073, + "learning_rate": 0.00011480656506447829, + "loss": 3.514979362487793, + "step": 1399, + "token_acc": 0.24826293292219118 + }, + { + "epoch": 0.8208736440926414, + "grad_norm": 2.2479958394061224, + "learning_rate": 0.00011488862837045719, + "loss": 3.4540510177612305, + "step": 1400, + "token_acc": 0.25484034491498686 + }, + { + "epoch": 0.8214599824098505, + "grad_norm": 1.6229950669512192, + "learning_rate": 0.00011497069167643609, + "loss": 3.5001330375671387, + "step": 1401, + "token_acc": 0.2508741647961068 + }, + { + "epoch": 0.8220463207270595, + "grad_norm": 2.1256985540553814, + "learning_rate": 0.000115052754982415, + "loss": 3.4449234008789062, + "step": 1402, + "token_acc": 0.25856909825760804 + }, + { + "epoch": 0.8226326590442685, + "grad_norm": 2.1685634983470177, + "learning_rate": 0.0001151348182883939, + "loss": 3.4522035121917725, + "step": 1403, + "token_acc": 0.25680800440533214 + }, + { + "epoch": 0.8232189973614775, + "grad_norm": 2.2222628223915164, + "learning_rate": 0.0001152168815943728, + "loss": 3.4078269004821777, + "step": 1404, + "token_acc": 0.26125508009863146 + }, + { + "epoch": 0.8238053356786866, + "grad_norm": 2.2540913456394476, + "learning_rate": 0.0001152989449003517, + "loss": 3.421428680419922, + "step": 1405, + "token_acc": 0.25714753536351054 + }, + { + "epoch": 0.8243916739958956, + "grad_norm": 1.5758790277310466, + "learning_rate": 0.00011538100820633059, + "loss": 3.4481430053710938, + "step": 1406, + "token_acc": 0.25467517413292573 + }, + { + "epoch": 0.8249780123131046, + "grad_norm": 2.1963270728808055, + "learning_rate": 0.00011546307151230949, + "loss": 3.4928040504455566, + "step": 1407, + "token_acc": 0.2503780031677821 + }, + { + "epoch": 0.8255643506303136, + "grad_norm": 1.3170446656811312, + "learning_rate": 0.00011554513481828839, + "loss": 3.3883275985717773, + "step": 1408, + "token_acc": 0.2641605875769006 + }, + { + "epoch": 0.8261506889475227, + "grad_norm": 2.9419033942044566, + "learning_rate": 0.00011562719812426729, + "loss": 3.5186703205108643, + "step": 1409, + "token_acc": 0.24717833560563532 + }, + { + "epoch": 0.8267370272647317, + "grad_norm": 1.1692380459741931, + "learning_rate": 0.00011570926143024618, + "loss": 3.437136650085449, + "step": 1410, + "token_acc": 0.2561185338518797 + }, + { + "epoch": 0.8273233655819408, + "grad_norm": 2.6918183369605857, + "learning_rate": 0.00011579132473622508, + "loss": 3.488126277923584, + "step": 1411, + "token_acc": 0.25136150234741783 + }, + { + "epoch": 0.8279097038991499, + "grad_norm": 1.9497520367799692, + "learning_rate": 0.00011587338804220398, + "loss": 3.4991955757141113, + "step": 1412, + "token_acc": 0.24891923899654736 + }, + { + "epoch": 0.8284960422163589, + "grad_norm": 1.8313543979831282, + "learning_rate": 0.00011595545134818286, + "loss": 3.428920269012451, + "step": 1413, + "token_acc": 0.2576608239518813 + }, + { + "epoch": 0.8290823805335679, + "grad_norm": 1.957978553494661, + "learning_rate": 0.00011603751465416176, + "loss": 3.5101370811462402, + "step": 1414, + "token_acc": 0.2488521455789091 + }, + { + "epoch": 0.8296687188507769, + "grad_norm": 1.6542488213501336, + "learning_rate": 0.00011611957796014066, + "loss": 3.4838385581970215, + "step": 1415, + "token_acc": 0.2526956193234586 + }, + { + "epoch": 0.830255057167986, + "grad_norm": 2.321389631130462, + "learning_rate": 0.00011620164126611956, + "loss": 3.5034406185150146, + "step": 1416, + "token_acc": 0.25020156356658035 + }, + { + "epoch": 0.830841395485195, + "grad_norm": 1.6499007840372506, + "learning_rate": 0.00011628370457209846, + "loss": 3.4260053634643555, + "step": 1417, + "token_acc": 0.25813857349740915 + }, + { + "epoch": 0.831427733802404, + "grad_norm": 1.9767241748547444, + "learning_rate": 0.00011636576787807735, + "loss": 3.4941108226776123, + "step": 1418, + "token_acc": 0.2509137140864369 + }, + { + "epoch": 0.832014072119613, + "grad_norm": 2.245413634651665, + "learning_rate": 0.00011644783118405625, + "loss": 3.472487688064575, + "step": 1419, + "token_acc": 0.25300140973377916 + }, + { + "epoch": 0.8326004104368221, + "grad_norm": 1.8269350997122824, + "learning_rate": 0.00011652989449003516, + "loss": 3.445422410964966, + "step": 1420, + "token_acc": 0.25644839238173256 + }, + { + "epoch": 0.8331867487540311, + "grad_norm": 2.0756480585037935, + "learning_rate": 0.00011661195779601406, + "loss": 3.5002686977386475, + "step": 1421, + "token_acc": 0.24849583237358538 + }, + { + "epoch": 0.8337730870712401, + "grad_norm": 1.5245351710215633, + "learning_rate": 0.00011669402110199296, + "loss": 3.466574192047119, + "step": 1422, + "token_acc": 0.2529519734068948 + }, + { + "epoch": 0.8343594253884491, + "grad_norm": 1.3507138759851334, + "learning_rate": 0.00011677608440797186, + "loss": 3.396169662475586, + "step": 1423, + "token_acc": 0.2617191860760136 + }, + { + "epoch": 0.8349457637056582, + "grad_norm": 1.794485469643318, + "learning_rate": 0.00011685814771395076, + "loss": 3.3983442783355713, + "step": 1424, + "token_acc": 0.26067963124957944 + }, + { + "epoch": 0.8355321020228672, + "grad_norm": 1.903084282425927, + "learning_rate": 0.00011694021101992965, + "loss": 3.402369260787964, + "step": 1425, + "token_acc": 0.2619126957680773 + }, + { + "epoch": 0.8361184403400762, + "grad_norm": 1.704019101817661, + "learning_rate": 0.00011702227432590855, + "loss": 3.439713478088379, + "step": 1426, + "token_acc": 0.2553901137673177 + }, + { + "epoch": 0.8367047786572852, + "grad_norm": 1.8834001389273514, + "learning_rate": 0.00011710433763188745, + "loss": 3.462554693222046, + "step": 1427, + "token_acc": 0.2548908457448736 + }, + { + "epoch": 0.8372911169744943, + "grad_norm": 2.1597924429599407, + "learning_rate": 0.00011718640093786635, + "loss": 3.486043930053711, + "step": 1428, + "token_acc": 0.25108148487482884 + }, + { + "epoch": 0.8378774552917033, + "grad_norm": 1.49570010610963, + "learning_rate": 0.00011726846424384525, + "loss": 3.4443469047546387, + "step": 1429, + "token_acc": 0.25529090812374455 + }, + { + "epoch": 0.8384637936089123, + "grad_norm": 1.8904824609825497, + "learning_rate": 0.00011735052754982414, + "loss": 3.4574596881866455, + "step": 1430, + "token_acc": 0.25480277381931005 + }, + { + "epoch": 0.8390501319261213, + "grad_norm": 2.065599763445802, + "learning_rate": 0.00011743259085580303, + "loss": 3.4586968421936035, + "step": 1431, + "token_acc": 0.2553851937653681 + }, + { + "epoch": 0.8396364702433304, + "grad_norm": 1.6000344979178496, + "learning_rate": 0.00011751465416178193, + "loss": 3.4332659244537354, + "step": 1432, + "token_acc": 0.25718306524798445 + }, + { + "epoch": 0.8402228085605394, + "grad_norm": 1.9424360611777232, + "learning_rate": 0.00011759671746776082, + "loss": 3.477078914642334, + "step": 1433, + "token_acc": 0.2522593807939057 + }, + { + "epoch": 0.8408091468777484, + "grad_norm": 2.8275531600655217, + "learning_rate": 0.00011767878077373972, + "loss": 3.437272310256958, + "step": 1434, + "token_acc": 0.255872663143887 + }, + { + "epoch": 0.8413954851949574, + "grad_norm": 1.1009720617380683, + "learning_rate": 0.00011776084407971862, + "loss": 3.435957431793213, + "step": 1435, + "token_acc": 0.25730451720619607 + }, + { + "epoch": 0.8419818235121665, + "grad_norm": 3.0925689339273075, + "learning_rate": 0.00011784290738569752, + "loss": 3.5108790397644043, + "step": 1436, + "token_acc": 0.24727710349502982 + }, + { + "epoch": 0.8425681618293756, + "grad_norm": 1.954305740798933, + "learning_rate": 0.00011792497069167643, + "loss": 3.539501667022705, + "step": 1437, + "token_acc": 0.24504596823820554 + }, + { + "epoch": 0.8431545001465846, + "grad_norm": 1.6259211904974364, + "learning_rate": 0.00011800703399765533, + "loss": 3.5022828578948975, + "step": 1438, + "token_acc": 0.25097712463826016 + }, + { + "epoch": 0.8437408384637937, + "grad_norm": 1.7919078644937987, + "learning_rate": 0.00011808909730363423, + "loss": 3.45670485496521, + "step": 1439, + "token_acc": 0.2525527806454029 + }, + { + "epoch": 0.8443271767810027, + "grad_norm": 1.582170295473844, + "learning_rate": 0.00011817116060961312, + "loss": 3.463327407836914, + "step": 1440, + "token_acc": 0.25437266820800986 + }, + { + "epoch": 0.8449135150982117, + "grad_norm": 1.7006151646335155, + "learning_rate": 0.00011825322391559202, + "loss": 3.472337007522583, + "step": 1441, + "token_acc": 0.2535334405645203 + }, + { + "epoch": 0.8454998534154207, + "grad_norm": 1.7882102151699137, + "learning_rate": 0.00011833528722157092, + "loss": 3.44679594039917, + "step": 1442, + "token_acc": 0.2544991389016804 + }, + { + "epoch": 0.8460861917326298, + "grad_norm": 1.0512086685683577, + "learning_rate": 0.00011841735052754982, + "loss": 3.489717960357666, + "step": 1443, + "token_acc": 0.249919418956959 + }, + { + "epoch": 0.8466725300498388, + "grad_norm": 2.4652123281569622, + "learning_rate": 0.00011849941383352872, + "loss": 3.4744672775268555, + "step": 1444, + "token_acc": 0.251611797333115 + }, + { + "epoch": 0.8472588683670478, + "grad_norm": 1.2425158461126327, + "learning_rate": 0.00011858147713950761, + "loss": 3.456843852996826, + "step": 1445, + "token_acc": 0.25381877851345913 + }, + { + "epoch": 0.8478452066842568, + "grad_norm": 1.8970534216319244, + "learning_rate": 0.00011866354044548651, + "loss": 3.4840238094329834, + "step": 1446, + "token_acc": 0.2508379800353551 + }, + { + "epoch": 0.8484315450014659, + "grad_norm": 1.7764065929036983, + "learning_rate": 0.00011874560375146541, + "loss": 3.432133197784424, + "step": 1447, + "token_acc": 0.25989832985834965 + }, + { + "epoch": 0.8490178833186749, + "grad_norm": 1.2333360229939117, + "learning_rate": 0.00011882766705744431, + "loss": 3.4992733001708984, + "step": 1448, + "token_acc": 0.24982165373711102 + }, + { + "epoch": 0.8496042216358839, + "grad_norm": 1.7075093549146882, + "learning_rate": 0.00011890973036342319, + "loss": 3.4612910747528076, + "step": 1449, + "token_acc": 0.25407074323933276 + }, + { + "epoch": 0.8501905599530929, + "grad_norm": 1.4091663420826739, + "learning_rate": 0.00011899179366940209, + "loss": 3.4539175033569336, + "step": 1450, + "token_acc": 0.25379739359817316 + }, + { + "epoch": 0.850776898270302, + "grad_norm": 1.5223086875245324, + "learning_rate": 0.00011907385697538099, + "loss": 3.4142401218414307, + "step": 1451, + "token_acc": 0.26066212774803804 + }, + { + "epoch": 0.851363236587511, + "grad_norm": 1.4204352725613607, + "learning_rate": 0.00011915592028135989, + "loss": 3.456674575805664, + "step": 1452, + "token_acc": 0.2546225943641574 + }, + { + "epoch": 0.85194957490472, + "grad_norm": 2.1928904110604446, + "learning_rate": 0.00011923798358733878, + "loss": 3.472041606903076, + "step": 1453, + "token_acc": 0.2532037830375691 + }, + { + "epoch": 0.852535913221929, + "grad_norm": 1.4519761038592274, + "learning_rate": 0.00011932004689331768, + "loss": 3.4731338024139404, + "step": 1454, + "token_acc": 0.25183974466223086 + }, + { + "epoch": 0.8531222515391381, + "grad_norm": 2.2473210362642058, + "learning_rate": 0.0001194021101992966, + "loss": 3.4378409385681152, + "step": 1455, + "token_acc": 0.2577174198302897 + }, + { + "epoch": 0.8537085898563471, + "grad_norm": 1.4044904593766538, + "learning_rate": 0.00011948417350527549, + "loss": 3.369058132171631, + "step": 1456, + "token_acc": 0.26429449036372127 + }, + { + "epoch": 0.8542949281735561, + "grad_norm": 2.0249574315585197, + "learning_rate": 0.00011956623681125439, + "loss": 3.4121270179748535, + "step": 1457, + "token_acc": 0.25990162810534456 + }, + { + "epoch": 0.8548812664907651, + "grad_norm": 1.5081759271690975, + "learning_rate": 0.00011964830011723329, + "loss": 3.4754323959350586, + "step": 1458, + "token_acc": 0.25273340890354495 + }, + { + "epoch": 0.8554676048079742, + "grad_norm": 2.366400906599703, + "learning_rate": 0.00011973036342321219, + "loss": 3.3916494846343994, + "step": 1459, + "token_acc": 0.26229431526763375 + }, + { + "epoch": 0.8560539431251832, + "grad_norm": 1.499165918510265, + "learning_rate": 0.00011981242672919108, + "loss": 3.4688644409179688, + "step": 1460, + "token_acc": 0.2537529285858907 + }, + { + "epoch": 0.8566402814423922, + "grad_norm": 1.437435472100298, + "learning_rate": 0.00011989449003516998, + "loss": 3.4200077056884766, + "step": 1461, + "token_acc": 0.25759275043739627 + }, + { + "epoch": 0.8572266197596012, + "grad_norm": 1.6170195219990595, + "learning_rate": 0.00011997655334114888, + "loss": 3.465322494506836, + "step": 1462, + "token_acc": 0.25385671657556313 + }, + { + "epoch": 0.8578129580768104, + "grad_norm": 1.7116062493677813, + "learning_rate": 0.00012005861664712778, + "loss": 3.461017608642578, + "step": 1463, + "token_acc": 0.25443857965451055 + }, + { + "epoch": 0.8583992963940194, + "grad_norm": 1.1841054062320047, + "learning_rate": 0.00012014067995310668, + "loss": 3.4397692680358887, + "step": 1464, + "token_acc": 0.25731477052440777 + }, + { + "epoch": 0.8589856347112284, + "grad_norm": 1.8306737616180009, + "learning_rate": 0.00012022274325908557, + "loss": 3.419232130050659, + "step": 1465, + "token_acc": 0.2580833851247957 + }, + { + "epoch": 0.8595719730284375, + "grad_norm": 1.5784342683509562, + "learning_rate": 0.00012030480656506447, + "loss": 3.412738561630249, + "step": 1466, + "token_acc": 0.258783507000491 + }, + { + "epoch": 0.8601583113456465, + "grad_norm": 1.8577545998297527, + "learning_rate": 0.00012038686987104336, + "loss": 3.452714443206787, + "step": 1467, + "token_acc": 0.25439143406279147 + }, + { + "epoch": 0.8607446496628555, + "grad_norm": 1.537325896969621, + "learning_rate": 0.00012046893317702225, + "loss": 3.4643139839172363, + "step": 1468, + "token_acc": 0.25153157149798255 + }, + { + "epoch": 0.8613309879800645, + "grad_norm": 1.2333943346809917, + "learning_rate": 0.00012055099648300115, + "loss": 3.427891731262207, + "step": 1469, + "token_acc": 0.2589686509703271 + }, + { + "epoch": 0.8619173262972736, + "grad_norm": 2.1060774891726655, + "learning_rate": 0.00012063305978898005, + "loss": 3.399210214614868, + "step": 1470, + "token_acc": 0.2619854032298451 + }, + { + "epoch": 0.8625036646144826, + "grad_norm": 1.1850622123484198, + "learning_rate": 0.00012071512309495895, + "loss": 3.4835376739501953, + "step": 1471, + "token_acc": 0.25012865841954607 + }, + { + "epoch": 0.8630900029316916, + "grad_norm": 1.45820854118914, + "learning_rate": 0.00012079718640093786, + "loss": 3.4814066886901855, + "step": 1472, + "token_acc": 0.2520966040701522 + }, + { + "epoch": 0.8636763412489006, + "grad_norm": 2.152330476321324, + "learning_rate": 0.00012087924970691676, + "loss": 3.423205852508545, + "step": 1473, + "token_acc": 0.2606073596578971 + }, + { + "epoch": 0.8642626795661097, + "grad_norm": 1.008599675486724, + "learning_rate": 0.00012096131301289566, + "loss": 3.47617244720459, + "step": 1474, + "token_acc": 0.25345913410062515 + }, + { + "epoch": 0.8648490178833187, + "grad_norm": 2.793285852707622, + "learning_rate": 0.00012104337631887455, + "loss": 3.4529669284820557, + "step": 1475, + "token_acc": 0.2525538251617051 + }, + { + "epoch": 0.8654353562005277, + "grad_norm": 1.6071907101284855, + "learning_rate": 0.00012112543962485345, + "loss": 3.445446491241455, + "step": 1476, + "token_acc": 0.2558612804078507 + }, + { + "epoch": 0.8660216945177367, + "grad_norm": 2.59028463843677, + "learning_rate": 0.00012120750293083235, + "loss": 3.4565694332122803, + "step": 1477, + "token_acc": 0.25320589776755403 + }, + { + "epoch": 0.8666080328349458, + "grad_norm": 2.0575528482623975, + "learning_rate": 0.00012128956623681125, + "loss": 3.5098876953125, + "step": 1478, + "token_acc": 0.24792793753164727 + }, + { + "epoch": 0.8671943711521548, + "grad_norm": 1.5940516012889903, + "learning_rate": 0.00012137162954279015, + "loss": 3.4364118576049805, + "step": 1479, + "token_acc": 0.25552153319701204 + }, + { + "epoch": 0.8677807094693638, + "grad_norm": 1.6677439125124707, + "learning_rate": 0.00012145369284876904, + "loss": 3.463893413543701, + "step": 1480, + "token_acc": 0.25354651927885247 + }, + { + "epoch": 0.8683670477865728, + "grad_norm": 1.5513655219486746, + "learning_rate": 0.00012153575615474794, + "loss": 3.441990375518799, + "step": 1481, + "token_acc": 0.2558676514278864 + }, + { + "epoch": 0.8689533861037819, + "grad_norm": 1.6522320320110375, + "learning_rate": 0.00012161781946072684, + "loss": 3.3729634284973145, + "step": 1482, + "token_acc": 0.26538235807306126 + }, + { + "epoch": 0.8695397244209909, + "grad_norm": 1.2768603566985224, + "learning_rate": 0.00012169988276670574, + "loss": 3.4236669540405273, + "step": 1483, + "token_acc": 0.2578060330846578 + }, + { + "epoch": 0.8701260627381999, + "grad_norm": 2.2431656393440926, + "learning_rate": 0.00012178194607268462, + "loss": 3.442612648010254, + "step": 1484, + "token_acc": 0.25558748739071957 + }, + { + "epoch": 0.8707124010554089, + "grad_norm": 1.2946558345907428, + "learning_rate": 0.00012186400937866352, + "loss": 3.462536334991455, + "step": 1485, + "token_acc": 0.2524354752036435 + }, + { + "epoch": 0.871298739372618, + "grad_norm": 2.448818162980202, + "learning_rate": 0.00012194607268464242, + "loss": 3.468039035797119, + "step": 1486, + "token_acc": 0.2510686407311558 + }, + { + "epoch": 0.871885077689827, + "grad_norm": 1.55103730874604, + "learning_rate": 0.00012202813599062132, + "loss": 3.427125930786133, + "step": 1487, + "token_acc": 0.2583153468673006 + }, + { + "epoch": 0.872471416007036, + "grad_norm": 2.0842968900205596, + "learning_rate": 0.00012211019929660023, + "loss": 3.464477062225342, + "step": 1488, + "token_acc": 0.2532736097888346 + }, + { + "epoch": 0.873057754324245, + "grad_norm": 1.4843798066973837, + "learning_rate": 0.0001221922626025791, + "loss": 3.455716133117676, + "step": 1489, + "token_acc": 0.25366416257218083 + }, + { + "epoch": 0.8736440926414542, + "grad_norm": 1.2739777809175559, + "learning_rate": 0.00012227432590855802, + "loss": 3.447477102279663, + "step": 1490, + "token_acc": 0.25466691019656085 + }, + { + "epoch": 0.8742304309586632, + "grad_norm": 2.1859177736222946, + "learning_rate": 0.0001223563892145369, + "loss": 3.428298234939575, + "step": 1491, + "token_acc": 0.2575932657356036 + }, + { + "epoch": 0.8748167692758722, + "grad_norm": 1.9309164066343492, + "learning_rate": 0.00012243845252051582, + "loss": 3.4118127822875977, + "step": 1492, + "token_acc": 0.25880211788438445 + }, + { + "epoch": 0.8754031075930812, + "grad_norm": 1.2159960008627448, + "learning_rate": 0.0001225205158264947, + "loss": 3.426647663116455, + "step": 1493, + "token_acc": 0.25611590928287864 + }, + { + "epoch": 0.8759894459102903, + "grad_norm": 2.3467847192363904, + "learning_rate": 0.00012260257913247362, + "loss": 3.4199390411376953, + "step": 1494, + "token_acc": 0.25861233952347307 + }, + { + "epoch": 0.8765757842274993, + "grad_norm": 1.0311038411610973, + "learning_rate": 0.0001226846424384525, + "loss": 3.458796977996826, + "step": 1495, + "token_acc": 0.2529837523836246 + }, + { + "epoch": 0.8771621225447083, + "grad_norm": 1.7572641175282862, + "learning_rate": 0.0001227667057444314, + "loss": 3.4372024536132812, + "step": 1496, + "token_acc": 0.2582377987201153 + }, + { + "epoch": 0.8777484608619174, + "grad_norm": 1.581679540309632, + "learning_rate": 0.00012284876905041032, + "loss": 3.519561767578125, + "step": 1497, + "token_acc": 0.24561182711335208 + }, + { + "epoch": 0.8783347991791264, + "grad_norm": 1.3016882985742975, + "learning_rate": 0.0001229308323563892, + "loss": 3.4367005825042725, + "step": 1498, + "token_acc": 0.2563187825812227 + }, + { + "epoch": 0.8789211374963354, + "grad_norm": 1.691039645160426, + "learning_rate": 0.00012301289566236812, + "loss": 3.4573256969451904, + "step": 1499, + "token_acc": 0.25297267672796836 + }, + { + "epoch": 0.8795074758135444, + "grad_norm": 1.4327149142596116, + "learning_rate": 0.000123094958968347, + "loss": 3.4507312774658203, + "step": 1500, + "token_acc": 0.25454048554512887 + }, + { + "epoch": 0.8800938141307535, + "grad_norm": 1.7851981679790219, + "learning_rate": 0.00012317702227432591, + "loss": 3.429867744445801, + "step": 1501, + "token_acc": 0.25758230363777757 + }, + { + "epoch": 0.8806801524479625, + "grad_norm": 1.226875374192547, + "learning_rate": 0.0001232590855803048, + "loss": 3.4147696495056152, + "step": 1502, + "token_acc": 0.2602180228377484 + }, + { + "epoch": 0.8812664907651715, + "grad_norm": 1.5263218588430374, + "learning_rate": 0.00012334114888628368, + "loss": 3.4408297538757324, + "step": 1503, + "token_acc": 0.25580881408197353 + }, + { + "epoch": 0.8818528290823805, + "grad_norm": 1.3675672997456356, + "learning_rate": 0.0001234232121922626, + "loss": 3.4493942260742188, + "step": 1504, + "token_acc": 0.25357806068979283 + }, + { + "epoch": 0.8824391673995896, + "grad_norm": 1.382906136927589, + "learning_rate": 0.00012350527549824148, + "loss": 3.385406494140625, + "step": 1505, + "token_acc": 0.26275747131181376 + }, + { + "epoch": 0.8830255057167986, + "grad_norm": 1.815356813818446, + "learning_rate": 0.0001235873388042204, + "loss": 3.377808094024658, + "step": 1506, + "token_acc": 0.26319879584950184 + }, + { + "epoch": 0.8836118440340076, + "grad_norm": 1.9836318665184827, + "learning_rate": 0.00012366940211019928, + "loss": 3.4552505016326904, + "step": 1507, + "token_acc": 0.2530039210205941 + }, + { + "epoch": 0.8841981823512166, + "grad_norm": 1.3542132771549567, + "learning_rate": 0.0001237514654161782, + "loss": 3.4395880699157715, + "step": 1508, + "token_acc": 0.2572736526247707 + }, + { + "epoch": 0.8847845206684257, + "grad_norm": 1.3766723702014485, + "learning_rate": 0.00012383352872215707, + "loss": 3.393324375152588, + "step": 1509, + "token_acc": 0.26035008212780275 + }, + { + "epoch": 0.8853708589856347, + "grad_norm": 1.804991285081376, + "learning_rate": 0.00012391559202813598, + "loss": 3.3616652488708496, + "step": 1510, + "token_acc": 0.26688124386218387 + }, + { + "epoch": 0.8859571973028437, + "grad_norm": 1.6895761177182043, + "learning_rate": 0.00012399765533411487, + "loss": 3.453643560409546, + "step": 1511, + "token_acc": 0.2555476507628966 + }, + { + "epoch": 0.8865435356200527, + "grad_norm": 1.8597568453152975, + "learning_rate": 0.00012407971864009378, + "loss": 3.4125051498413086, + "step": 1512, + "token_acc": 0.25941851051624215 + }, + { + "epoch": 0.8871298739372618, + "grad_norm": 1.4951639880153045, + "learning_rate": 0.00012416178194607266, + "loss": 3.403019905090332, + "step": 1513, + "token_acc": 0.2605627876397107 + }, + { + "epoch": 0.8877162122544708, + "grad_norm": 1.8128308016909087, + "learning_rate": 0.00012424384525205157, + "loss": 3.475231170654297, + "step": 1514, + "token_acc": 0.2504313669362213 + }, + { + "epoch": 0.8883025505716798, + "grad_norm": 1.8037971469348852, + "learning_rate": 0.00012432590855803049, + "loss": 3.465027332305908, + "step": 1515, + "token_acc": 0.2539705320851488 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 2.0962125523400004, + "learning_rate": 0.00012440797186400937, + "loss": 3.4204673767089844, + "step": 1516, + "token_acc": 0.25520376159457114 + }, + { + "epoch": 0.889475227206098, + "grad_norm": 1.2899251765543887, + "learning_rate": 0.00012449003516998828, + "loss": 3.4191970825195312, + "step": 1517, + "token_acc": 0.2562839687232858 + }, + { + "epoch": 0.890061565523307, + "grad_norm": 2.727379528242444, + "learning_rate": 0.00012457209847596717, + "loss": 3.4421167373657227, + "step": 1518, + "token_acc": 0.25599884549283025 + }, + { + "epoch": 0.890647903840516, + "grad_norm": 1.145673622375147, + "learning_rate": 0.00012465416178194608, + "loss": 3.4065799713134766, + "step": 1519, + "token_acc": 0.2602408549072592 + }, + { + "epoch": 0.891234242157725, + "grad_norm": 2.666349245087391, + "learning_rate": 0.00012473622508792496, + "loss": 3.409372329711914, + "step": 1520, + "token_acc": 0.25798298177689954 + }, + { + "epoch": 0.8918205804749341, + "grad_norm": 1.9670408669208799, + "learning_rate": 0.00012481828839390385, + "loss": 3.4458847045898438, + "step": 1521, + "token_acc": 0.25505430719388605 + }, + { + "epoch": 0.8924069187921431, + "grad_norm": 2.1251656124526446, + "learning_rate": 0.00012490035169988276, + "loss": 3.456653594970703, + "step": 1522, + "token_acc": 0.2542231314741246 + }, + { + "epoch": 0.8929932571093521, + "grad_norm": 1.9416610422185518, + "learning_rate": 0.00012498241500586164, + "loss": 3.452975034713745, + "step": 1523, + "token_acc": 0.2530405918692655 + }, + { + "epoch": 0.8935795954265612, + "grad_norm": 1.7195799586848999, + "learning_rate": 0.00012506447831184055, + "loss": 3.4389374256134033, + "step": 1524, + "token_acc": 0.25461891478923604 + }, + { + "epoch": 0.8941659337437702, + "grad_norm": 1.7943957290731274, + "learning_rate": 0.00012514654161781944, + "loss": 3.422852039337158, + "step": 1525, + "token_acc": 0.25631443902591355 + }, + { + "epoch": 0.8947522720609792, + "grad_norm": 1.7734554839471535, + "learning_rate": 0.00012522860492379835, + "loss": 3.4251604080200195, + "step": 1526, + "token_acc": 0.2581400232535508 + }, + { + "epoch": 0.8953386103781882, + "grad_norm": 1.6463671389067425, + "learning_rate": 0.00012531066822977724, + "loss": 3.3679966926574707, + "step": 1527, + "token_acc": 0.2642591788999079 + }, + { + "epoch": 0.8959249486953973, + "grad_norm": 1.6201143698248306, + "learning_rate": 0.00012539273153575615, + "loss": 3.473146438598633, + "step": 1528, + "token_acc": 0.2533326823853475 + }, + { + "epoch": 0.8965112870126063, + "grad_norm": 1.0734999066798894, + "learning_rate": 0.00012547479484173503, + "loss": 3.3924050331115723, + "step": 1529, + "token_acc": 0.2626420143983436 + }, + { + "epoch": 0.8970976253298153, + "grad_norm": 2.1257549894392036, + "learning_rate": 0.00012555685814771394, + "loss": 3.367523193359375, + "step": 1530, + "token_acc": 0.2644689777951465 + }, + { + "epoch": 0.8976839636470243, + "grad_norm": 1.6466681516304258, + "learning_rate": 0.00012563892145369283, + "loss": 3.440213203430176, + "step": 1531, + "token_acc": 0.25518867177394233 + }, + { + "epoch": 0.8982703019642334, + "grad_norm": 1.8194296395151108, + "learning_rate": 0.00012572098475967174, + "loss": 3.4583487510681152, + "step": 1532, + "token_acc": 0.25308281920893794 + }, + { + "epoch": 0.8988566402814424, + "grad_norm": 1.0345156604330248, + "learning_rate": 0.00012580304806565065, + "loss": 3.3876595497131348, + "step": 1533, + "token_acc": 0.2609349987307475 + }, + { + "epoch": 0.8994429785986514, + "grad_norm": 1.753055775056012, + "learning_rate": 0.00012588511137162953, + "loss": 3.40232253074646, + "step": 1534, + "token_acc": 0.26097320121224504 + }, + { + "epoch": 0.9000293169158604, + "grad_norm": 1.234639541558314, + "learning_rate": 0.00012596717467760845, + "loss": 3.4030961990356445, + "step": 1535, + "token_acc": 0.260651337141889 + }, + { + "epoch": 0.9006156552330695, + "grad_norm": 1.2991405981535866, + "learning_rate": 0.00012604923798358733, + "loss": 3.4115095138549805, + "step": 1536, + "token_acc": 0.258784043724546 + }, + { + "epoch": 0.9012019935502785, + "grad_norm": 2.163376676254576, + "learning_rate": 0.00012613130128956624, + "loss": 3.3936753273010254, + "step": 1537, + "token_acc": 0.2604160744981795 + }, + { + "epoch": 0.9017883318674875, + "grad_norm": 1.4498963812030876, + "learning_rate": 0.00012621336459554513, + "loss": 3.4051949977874756, + "step": 1538, + "token_acc": 0.25826653638282265 + }, + { + "epoch": 0.9023746701846965, + "grad_norm": 1.6528266548629278, + "learning_rate": 0.000126295427901524, + "loss": 3.4513165950775146, + "step": 1539, + "token_acc": 0.2534301643579233 + }, + { + "epoch": 0.9029610085019056, + "grad_norm": 1.095087526042891, + "learning_rate": 0.00012637749120750292, + "loss": 3.401042938232422, + "step": 1540, + "token_acc": 0.2597976234823662 + }, + { + "epoch": 0.9035473468191146, + "grad_norm": 1.784380241259064, + "learning_rate": 0.0001264595545134818, + "loss": 3.3909621238708496, + "step": 1541, + "token_acc": 0.2607756376887038 + }, + { + "epoch": 0.9041336851363236, + "grad_norm": 1.041432635979975, + "learning_rate": 0.00012654161781946072, + "loss": 3.3980960845947266, + "step": 1542, + "token_acc": 0.2607974358354876 + }, + { + "epoch": 0.9047200234535326, + "grad_norm": 1.4305055170568384, + "learning_rate": 0.0001266236811254396, + "loss": 3.3993449211120605, + "step": 1543, + "token_acc": 0.26053646026548816 + }, + { + "epoch": 0.9053063617707418, + "grad_norm": 1.518294467064156, + "learning_rate": 0.00012670574443141851, + "loss": 3.4432902336120605, + "step": 1544, + "token_acc": 0.2555351647275333 + }, + { + "epoch": 0.9058927000879508, + "grad_norm": 1.8622711267314034, + "learning_rate": 0.0001267878077373974, + "loss": 3.3634085655212402, + "step": 1545, + "token_acc": 0.26421013424543993 + }, + { + "epoch": 0.9064790384051598, + "grad_norm": 1.1560263965212023, + "learning_rate": 0.0001268698710433763, + "loss": 3.3357577323913574, + "step": 1546, + "token_acc": 0.2676453819840365 + }, + { + "epoch": 0.9070653767223688, + "grad_norm": 1.8559841530718768, + "learning_rate": 0.0001269519343493552, + "loss": 3.372191905975342, + "step": 1547, + "token_acc": 0.2636687439618655 + }, + { + "epoch": 0.9076517150395779, + "grad_norm": 1.585022739453405, + "learning_rate": 0.0001270339976553341, + "loss": 3.4497013092041016, + "step": 1548, + "token_acc": 0.2519981107331991 + }, + { + "epoch": 0.9082380533567869, + "grad_norm": 1.5410059864668335, + "learning_rate": 0.00012711606096131302, + "loss": 3.3897926807403564, + "step": 1549, + "token_acc": 0.2595405541876409 + }, + { + "epoch": 0.9088243916739959, + "grad_norm": 1.555896756624451, + "learning_rate": 0.0001271981242672919, + "loss": 3.37626576423645, + "step": 1550, + "token_acc": 0.26381750633806184 + }, + { + "epoch": 0.909410729991205, + "grad_norm": 1.1968189040705774, + "learning_rate": 0.00012728018757327081, + "loss": 3.408867120742798, + "step": 1551, + "token_acc": 0.25903616085785086 + }, + { + "epoch": 0.909997068308414, + "grad_norm": 1.8814723521216314, + "learning_rate": 0.0001273622508792497, + "loss": 3.4293951988220215, + "step": 1552, + "token_acc": 0.25559339727278657 + }, + { + "epoch": 0.910583406625623, + "grad_norm": 1.4785780763266745, + "learning_rate": 0.0001274443141852286, + "loss": 3.3968286514282227, + "step": 1553, + "token_acc": 0.26147151372644983 + }, + { + "epoch": 0.911169744942832, + "grad_norm": 1.612523874928874, + "learning_rate": 0.0001275263774912075, + "loss": 3.4345650672912598, + "step": 1554, + "token_acc": 0.2570978914235786 + }, + { + "epoch": 0.9117560832600411, + "grad_norm": 1.3980844447729044, + "learning_rate": 0.00012760844079718638, + "loss": 3.449166774749756, + "step": 1555, + "token_acc": 0.25410012193574943 + }, + { + "epoch": 0.9123424215772501, + "grad_norm": 1.8388607441425078, + "learning_rate": 0.0001276905041031653, + "loss": 3.457688808441162, + "step": 1556, + "token_acc": 0.25203113041053954 + }, + { + "epoch": 0.9129287598944591, + "grad_norm": 1.6041631915136352, + "learning_rate": 0.00012777256740914417, + "loss": 3.476592540740967, + "step": 1557, + "token_acc": 0.2504371493497323 + }, + { + "epoch": 0.9135150982116681, + "grad_norm": 1.4907487624017328, + "learning_rate": 0.00012785463071512309, + "loss": 3.432101249694824, + "step": 1558, + "token_acc": 0.2568024282362133 + }, + { + "epoch": 0.9141014365288772, + "grad_norm": 1.425227318655967, + "learning_rate": 0.00012793669402110197, + "loss": 3.4399032592773438, + "step": 1559, + "token_acc": 0.25403551632718696 + }, + { + "epoch": 0.9146877748460862, + "grad_norm": 2.5193843486930687, + "learning_rate": 0.00012801875732708088, + "loss": 3.424142837524414, + "step": 1560, + "token_acc": 0.2573708425917021 + }, + { + "epoch": 0.9152741131632952, + "grad_norm": 1.0234863893872745, + "learning_rate": 0.00012810082063305977, + "loss": 3.3329296112060547, + "step": 1561, + "token_acc": 0.26969947600676325 + }, + { + "epoch": 0.9158604514805042, + "grad_norm": 2.590742252634702, + "learning_rate": 0.00012818288393903868, + "loss": 3.4161808490753174, + "step": 1562, + "token_acc": 0.25874583625873 + }, + { + "epoch": 0.9164467897977133, + "grad_norm": 1.4663309937114128, + "learning_rate": 0.00012826494724501756, + "loss": 3.389841318130493, + "step": 1563, + "token_acc": 0.2628703652633431 + }, + { + "epoch": 0.9170331281149223, + "grad_norm": 1.750696503375946, + "learning_rate": 0.00012834701055099647, + "loss": 3.3607654571533203, + "step": 1564, + "token_acc": 0.26589156477583176 + }, + { + "epoch": 0.9176194664321313, + "grad_norm": 1.8751730302893155, + "learning_rate": 0.00012842907385697536, + "loss": 3.435530185699463, + "step": 1565, + "token_acc": 0.2553990435409742 + }, + { + "epoch": 0.9182058047493403, + "grad_norm": 1.817345647313248, + "learning_rate": 0.00012851113716295427, + "loss": 3.391427755355835, + "step": 1566, + "token_acc": 0.26175877976152573 + }, + { + "epoch": 0.9187921430665494, + "grad_norm": 1.3867129127005364, + "learning_rate": 0.00012859320046893318, + "loss": 3.449869155883789, + "step": 1567, + "token_acc": 0.25318217760889145 + }, + { + "epoch": 0.9193784813837584, + "grad_norm": 1.2924012128996152, + "learning_rate": 0.00012867526377491207, + "loss": 3.3871898651123047, + "step": 1568, + "token_acc": 0.2636144438922677 + }, + { + "epoch": 0.9199648197009674, + "grad_norm": 1.1307441521273516, + "learning_rate": 0.00012875732708089098, + "loss": 3.3623342514038086, + "step": 1569, + "token_acc": 0.2632221423362006 + }, + { + "epoch": 0.9205511580181764, + "grad_norm": 1.372860410810995, + "learning_rate": 0.00012883939038686986, + "loss": 3.380150556564331, + "step": 1570, + "token_acc": 0.2638815008797414 + }, + { + "epoch": 0.9211374963353856, + "grad_norm": 1.642020094143525, + "learning_rate": 0.00012892145369284877, + "loss": 3.3960001468658447, + "step": 1571, + "token_acc": 0.26034435104947407 + }, + { + "epoch": 0.9217238346525946, + "grad_norm": 1.383253037315965, + "learning_rate": 0.00012900351699882766, + "loss": 3.4062883853912354, + "step": 1572, + "token_acc": 0.25979498985314337 + }, + { + "epoch": 0.9223101729698036, + "grad_norm": 1.7401216627480691, + "learning_rate": 0.00012908558030480654, + "loss": 3.3856728076934814, + "step": 1573, + "token_acc": 0.2611058193617885 + }, + { + "epoch": 0.9228965112870126, + "grad_norm": 1.2925508874163962, + "learning_rate": 0.00012916764361078545, + "loss": 3.4103055000305176, + "step": 1574, + "token_acc": 0.2571754531899058 + }, + { + "epoch": 0.9234828496042217, + "grad_norm": 1.6468940858529026, + "learning_rate": 0.00012924970691676434, + "loss": 3.3652539253234863, + "step": 1575, + "token_acc": 0.26435405458635236 + }, + { + "epoch": 0.9240691879214307, + "grad_norm": 1.2925208290653964, + "learning_rate": 0.00012933177022274325, + "loss": 3.396791934967041, + "step": 1576, + "token_acc": 0.26056945186423286 + }, + { + "epoch": 0.9246555262386397, + "grad_norm": 1.7004359844038002, + "learning_rate": 0.00012941383352872213, + "loss": 3.3742918968200684, + "step": 1577, + "token_acc": 0.26298866358142775 + }, + { + "epoch": 0.9252418645558487, + "grad_norm": 1.0928600623173799, + "learning_rate": 0.00012949589683470105, + "loss": 3.4830098152160645, + "step": 1578, + "token_acc": 0.2503633240850839 + }, + { + "epoch": 0.9258282028730578, + "grad_norm": 1.966378935257831, + "learning_rate": 0.00012957796014067993, + "loss": 3.372520923614502, + "step": 1579, + "token_acc": 0.2631443999055629 + }, + { + "epoch": 0.9264145411902668, + "grad_norm": 1.754489044553854, + "learning_rate": 0.00012966002344665884, + "loss": 3.4189040660858154, + "step": 1580, + "token_acc": 0.25668263298260513 + }, + { + "epoch": 0.9270008795074758, + "grad_norm": 0.990692161242356, + "learning_rate": 0.00012974208675263773, + "loss": 3.3704352378845215, + "step": 1581, + "token_acc": 0.26270092660201366 + }, + { + "epoch": 0.9275872178246849, + "grad_norm": 1.2790744314627125, + "learning_rate": 0.00012982415005861664, + "loss": 3.4020233154296875, + "step": 1582, + "token_acc": 0.25990416632827096 + }, + { + "epoch": 0.9281735561418939, + "grad_norm": 1.0448719491627247, + "learning_rate": 0.00012990621336459552, + "loss": 3.387636184692383, + "step": 1583, + "token_acc": 0.26169775327445033 + }, + { + "epoch": 0.9287598944591029, + "grad_norm": 1.7311121151879958, + "learning_rate": 0.00012998827667057443, + "loss": 3.4150383472442627, + "step": 1584, + "token_acc": 0.2595987543142981 + }, + { + "epoch": 0.9293462327763119, + "grad_norm": 1.1357092911466424, + "learning_rate": 0.00013007033997655335, + "loss": 3.406409740447998, + "step": 1585, + "token_acc": 0.25788372093023254 + }, + { + "epoch": 0.929932571093521, + "grad_norm": 1.3576007877852292, + "learning_rate": 0.00013015240328253223, + "loss": 3.4089412689208984, + "step": 1586, + "token_acc": 0.2586472832305491 + }, + { + "epoch": 0.93051890941073, + "grad_norm": 1.1590308635175053, + "learning_rate": 0.00013023446658851114, + "loss": 3.391888380050659, + "step": 1587, + "token_acc": 0.2597921600398853 + }, + { + "epoch": 0.931105247727939, + "grad_norm": 1.592440322825381, + "learning_rate": 0.00013031652989449003, + "loss": 3.443631172180176, + "step": 1588, + "token_acc": 0.2542345051903475 + }, + { + "epoch": 0.931691586045148, + "grad_norm": 1.5514572272748557, + "learning_rate": 0.00013039859320046894, + "loss": 3.418938636779785, + "step": 1589, + "token_acc": 0.25787297351673605 + }, + { + "epoch": 0.9322779243623571, + "grad_norm": 1.1885770848576238, + "learning_rate": 0.00013048065650644782, + "loss": 3.3787739276885986, + "step": 1590, + "token_acc": 0.26254568120925137 + }, + { + "epoch": 0.9328642626795661, + "grad_norm": 1.9901442438009058, + "learning_rate": 0.0001305627198124267, + "loss": 3.397382974624634, + "step": 1591, + "token_acc": 0.25973850812001104 + }, + { + "epoch": 0.9334506009967751, + "grad_norm": 1.129930689222301, + "learning_rate": 0.00013064478311840562, + "loss": 3.411898136138916, + "step": 1592, + "token_acc": 0.25577946728836987 + }, + { + "epoch": 0.9340369393139841, + "grad_norm": 2.031584292920118, + "learning_rate": 0.0001307268464243845, + "loss": 3.3807005882263184, + "step": 1593, + "token_acc": 0.2636437293824874 + }, + { + "epoch": 0.9346232776311932, + "grad_norm": 1.296994906816686, + "learning_rate": 0.0001308089097303634, + "loss": 3.3560585975646973, + "step": 1594, + "token_acc": 0.26376548294062657 + }, + { + "epoch": 0.9352096159484022, + "grad_norm": 1.9775410954257926, + "learning_rate": 0.0001308909730363423, + "loss": 3.4179506301879883, + "step": 1595, + "token_acc": 0.2577368503843605 + }, + { + "epoch": 0.9357959542656112, + "grad_norm": 1.4133324628897241, + "learning_rate": 0.0001309730363423212, + "loss": 3.35154128074646, + "step": 1596, + "token_acc": 0.26564876771821744 + }, + { + "epoch": 0.9363822925828202, + "grad_norm": 1.3139799506616736, + "learning_rate": 0.0001310550996483001, + "loss": 3.42692494392395, + "step": 1597, + "token_acc": 0.2573869194855602 + }, + { + "epoch": 0.9369686309000294, + "grad_norm": 1.4344248979779852, + "learning_rate": 0.000131137162954279, + "loss": 3.365551710128784, + "step": 1598, + "token_acc": 0.26263915593543485 + }, + { + "epoch": 0.9375549692172384, + "grad_norm": 1.6553291202539733, + "learning_rate": 0.0001312192262602579, + "loss": 3.3806092739105225, + "step": 1599, + "token_acc": 0.26291846868492713 + }, + { + "epoch": 0.9381413075344474, + "grad_norm": 1.7767331012761172, + "learning_rate": 0.0001313012895662368, + "loss": 3.4010696411132812, + "step": 1600, + "token_acc": 0.2596315790860905 + }, + { + "epoch": 0.9387276458516564, + "grad_norm": 1.3415274981476004, + "learning_rate": 0.00013138335287221569, + "loss": 3.404432773590088, + "step": 1601, + "token_acc": 0.2579591140199862 + }, + { + "epoch": 0.9393139841688655, + "grad_norm": 1.1834821769725241, + "learning_rate": 0.0001314654161781946, + "loss": 3.3415231704711914, + "step": 1602, + "token_acc": 0.2676237422766809 + }, + { + "epoch": 0.9399003224860745, + "grad_norm": 1.5921073153485024, + "learning_rate": 0.0001315474794841735, + "loss": 3.4134697914123535, + "step": 1603, + "token_acc": 0.25610784594487374 + }, + { + "epoch": 0.9404866608032835, + "grad_norm": 1.5553226010270753, + "learning_rate": 0.0001316295427901524, + "loss": 3.394374370574951, + "step": 1604, + "token_acc": 0.2591971115757757 + }, + { + "epoch": 0.9410729991204925, + "grad_norm": 1.4485197695516605, + "learning_rate": 0.0001317116060961313, + "loss": 3.4404358863830566, + "step": 1605, + "token_acc": 0.2536410677267907 + }, + { + "epoch": 0.9416593374377016, + "grad_norm": 1.915942734401626, + "learning_rate": 0.0001317936694021102, + "loss": 3.429225444793701, + "step": 1606, + "token_acc": 0.2555212106073852 + }, + { + "epoch": 0.9422456757549106, + "grad_norm": 0.8992380986124461, + "learning_rate": 0.0001318757327080891, + "loss": 3.359910011291504, + "step": 1607, + "token_acc": 0.2632898670632604 + }, + { + "epoch": 0.9428320140721196, + "grad_norm": 1.8996140048995376, + "learning_rate": 0.00013195779601406799, + "loss": 3.373720407485962, + "step": 1608, + "token_acc": 0.2626686152532947 + }, + { + "epoch": 0.9434183523893287, + "grad_norm": 1.3420597263068255, + "learning_rate": 0.00013203985932004687, + "loss": 3.406137466430664, + "step": 1609, + "token_acc": 0.2581210578500289 + }, + { + "epoch": 0.9440046907065377, + "grad_norm": 1.7061286596703917, + "learning_rate": 0.00013212192262602578, + "loss": 3.4146323204040527, + "step": 1610, + "token_acc": 0.2584042792823355 + }, + { + "epoch": 0.9445910290237467, + "grad_norm": 1.5283232374270648, + "learning_rate": 0.00013220398593200467, + "loss": 3.3759312629699707, + "step": 1611, + "token_acc": 0.26065202289192496 + }, + { + "epoch": 0.9451773673409557, + "grad_norm": 1.2291675172064562, + "learning_rate": 0.00013228604923798358, + "loss": 3.4050798416137695, + "step": 1612, + "token_acc": 0.2602949260294926 + }, + { + "epoch": 0.9457637056581648, + "grad_norm": 1.4991455514857643, + "learning_rate": 0.00013236811254396246, + "loss": 3.41379451751709, + "step": 1613, + "token_acc": 0.2582484413381142 + }, + { + "epoch": 0.9463500439753738, + "grad_norm": 1.3134137672134356, + "learning_rate": 0.00013245017584994137, + "loss": 3.345651149749756, + "step": 1614, + "token_acc": 0.26608187134502925 + }, + { + "epoch": 0.9469363822925828, + "grad_norm": 1.6124270851088944, + "learning_rate": 0.00013253223915592026, + "loss": 3.3834378719329834, + "step": 1615, + "token_acc": 0.26219965839908627 + }, + { + "epoch": 0.9475227206097918, + "grad_norm": 1.194257244938607, + "learning_rate": 0.00013261430246189917, + "loss": 3.372851610183716, + "step": 1616, + "token_acc": 0.2638059266018058 + }, + { + "epoch": 0.9481090589270009, + "grad_norm": 2.0996889776797163, + "learning_rate": 0.00013269636576787805, + "loss": 3.3949944972991943, + "step": 1617, + "token_acc": 0.259176367652237 + }, + { + "epoch": 0.9486953972442099, + "grad_norm": 1.0273019701280062, + "learning_rate": 0.00013277842907385697, + "loss": 3.407728672027588, + "step": 1618, + "token_acc": 0.25823909701602543 + }, + { + "epoch": 0.9492817355614189, + "grad_norm": 2.0780664285476567, + "learning_rate": 0.00013286049237983588, + "loss": 3.3907909393310547, + "step": 1619, + "token_acc": 0.25960532572515455 + }, + { + "epoch": 0.9498680738786279, + "grad_norm": 1.2504846098075775, + "learning_rate": 0.00013294255568581476, + "loss": 3.437098503112793, + "step": 1620, + "token_acc": 0.25519283218052286 + }, + { + "epoch": 0.950454412195837, + "grad_norm": 1.9071060161508944, + "learning_rate": 0.00013302461899179367, + "loss": 3.4237794876098633, + "step": 1621, + "token_acc": 0.25517933646542623 + }, + { + "epoch": 0.951040750513046, + "grad_norm": 1.8396084325295134, + "learning_rate": 0.00013310668229777256, + "loss": 3.3670296669006348, + "step": 1622, + "token_acc": 0.26320507894688694 + }, + { + "epoch": 0.951627088830255, + "grad_norm": 1.0676503991102975, + "learning_rate": 0.00013318874560375147, + "loss": 3.38714599609375, + "step": 1623, + "token_acc": 0.2601433215321077 + }, + { + "epoch": 0.952213427147464, + "grad_norm": 2.0407141237803064, + "learning_rate": 0.00013327080890973035, + "loss": 3.409733533859253, + "step": 1624, + "token_acc": 0.2583709690198875 + }, + { + "epoch": 0.9527997654646732, + "grad_norm": 1.264426496633966, + "learning_rate": 0.00013335287221570926, + "loss": 3.43257999420166, + "step": 1625, + "token_acc": 0.2538907178867286 + }, + { + "epoch": 0.9533861037818822, + "grad_norm": 1.4797886208242987, + "learning_rate": 0.00013343493552168815, + "loss": 3.385241746902466, + "step": 1626, + "token_acc": 0.2614268809979844 + }, + { + "epoch": 0.9539724420990912, + "grad_norm": 1.3960493925540565, + "learning_rate": 0.00013351699882766703, + "loss": 3.2887959480285645, + "step": 1627, + "token_acc": 0.27447114224883845 + }, + { + "epoch": 0.9545587804163002, + "grad_norm": 1.4895935491554844, + "learning_rate": 0.00013359906213364594, + "loss": 3.3930814266204834, + "step": 1628, + "token_acc": 0.2612280786407819 + }, + { + "epoch": 0.9551451187335093, + "grad_norm": 1.4175952400545995, + "learning_rate": 0.00013368112543962483, + "loss": 3.36912202835083, + "step": 1629, + "token_acc": 0.2625094397307488 + }, + { + "epoch": 0.9557314570507183, + "grad_norm": 1.2617666193871946, + "learning_rate": 0.00013376318874560374, + "loss": 3.4219913482666016, + "step": 1630, + "token_acc": 0.2569656660075499 + }, + { + "epoch": 0.9563177953679273, + "grad_norm": 1.8788393830980479, + "learning_rate": 0.00013384525205158263, + "loss": 3.369431495666504, + "step": 1631, + "token_acc": 0.26476876188272297 + }, + { + "epoch": 0.9569041336851363, + "grad_norm": 1.0057128508774238, + "learning_rate": 0.00013392731535756154, + "loss": 3.3941421508789062, + "step": 1632, + "token_acc": 0.2592805942916721 + }, + { + "epoch": 0.9574904720023454, + "grad_norm": 1.7114515017058227, + "learning_rate": 0.00013400937866354042, + "loss": 3.402280807495117, + "step": 1633, + "token_acc": 0.2589737730040732 + }, + { + "epoch": 0.9580768103195544, + "grad_norm": 1.1344373084871577, + "learning_rate": 0.00013409144196951933, + "loss": 3.433426856994629, + "step": 1634, + "token_acc": 0.25518585193216453 + }, + { + "epoch": 0.9586631486367634, + "grad_norm": 1.5262664679220905, + "learning_rate": 0.00013417350527549822, + "loss": 3.4048283100128174, + "step": 1635, + "token_acc": 0.26010747854088473 + }, + { + "epoch": 0.9592494869539725, + "grad_norm": 0.9996598798954925, + "learning_rate": 0.00013425556858147713, + "loss": 3.394796848297119, + "step": 1636, + "token_acc": 0.25863476439735955 + }, + { + "epoch": 0.9598358252711815, + "grad_norm": 1.9688280286815467, + "learning_rate": 0.00013433763188745604, + "loss": 3.3769078254699707, + "step": 1637, + "token_acc": 0.2644607856331523 + }, + { + "epoch": 0.9604221635883905, + "grad_norm": 1.1903855394501328, + "learning_rate": 0.00013441969519343492, + "loss": 3.3755381107330322, + "step": 1638, + "token_acc": 0.26153934257244305 + }, + { + "epoch": 0.9610085019055995, + "grad_norm": 1.7702026222573801, + "learning_rate": 0.00013450175849941384, + "loss": 3.4392783641815186, + "step": 1639, + "token_acc": 0.25585094474208986 + }, + { + "epoch": 0.9615948402228086, + "grad_norm": 1.535314503535627, + "learning_rate": 0.00013458382180539272, + "loss": 3.3950421810150146, + "step": 1640, + "token_acc": 0.25879764463493116 + }, + { + "epoch": 0.9621811785400176, + "grad_norm": 1.2740440641686512, + "learning_rate": 0.00013466588511137163, + "loss": 3.394374370574951, + "step": 1641, + "token_acc": 0.259294906070652 + }, + { + "epoch": 0.9627675168572266, + "grad_norm": 1.4624663316763096, + "learning_rate": 0.00013474794841735052, + "loss": 3.4390034675598145, + "step": 1642, + "token_acc": 0.2529298126047277 + }, + { + "epoch": 0.9633538551744356, + "grad_norm": 1.2802065447321582, + "learning_rate": 0.00013483001172332943, + "loss": 3.462399482727051, + "step": 1643, + "token_acc": 0.2524917553658991 + }, + { + "epoch": 0.9639401934916447, + "grad_norm": 1.3950324543904118, + "learning_rate": 0.0001349120750293083, + "loss": 3.4082980155944824, + "step": 1644, + "token_acc": 0.25700195497256734 + }, + { + "epoch": 0.9645265318088537, + "grad_norm": 1.4842984150979937, + "learning_rate": 0.0001349941383352872, + "loss": 3.358534812927246, + "step": 1645, + "token_acc": 0.26316448791792696 + }, + { + "epoch": 0.9651128701260627, + "grad_norm": 1.1198920698197914, + "learning_rate": 0.0001350762016412661, + "loss": 3.398050546646118, + "step": 1646, + "token_acc": 0.2582846053413669 + }, + { + "epoch": 0.9656992084432717, + "grad_norm": 2.095747294086805, + "learning_rate": 0.000135158264947245, + "loss": 3.326256275177002, + "step": 1647, + "token_acc": 0.2681122718159755 + }, + { + "epoch": 0.9662855467604808, + "grad_norm": 1.025051416285317, + "learning_rate": 0.0001352403282532239, + "loss": 3.3809971809387207, + "step": 1648, + "token_acc": 0.2628815846161387 + }, + { + "epoch": 0.9668718850776898, + "grad_norm": 1.9218514699987144, + "learning_rate": 0.0001353223915592028, + "loss": 3.411264419555664, + "step": 1649, + "token_acc": 0.2566667960310992 + }, + { + "epoch": 0.9674582233948988, + "grad_norm": 1.165383499297446, + "learning_rate": 0.0001354044548651817, + "loss": 3.3521289825439453, + "step": 1650, + "token_acc": 0.26499625262181675 + }, + { + "epoch": 0.9680445617121078, + "grad_norm": 1.5748825558118935, + "learning_rate": 0.00013548651817116059, + "loss": 3.4237351417541504, + "step": 1651, + "token_acc": 0.2564794899567264 + }, + { + "epoch": 0.968630900029317, + "grad_norm": 1.3060848671013605, + "learning_rate": 0.0001355685814771395, + "loss": 3.355648994445801, + "step": 1652, + "token_acc": 0.2655677844329881 + }, + { + "epoch": 0.969217238346526, + "grad_norm": 1.3552628428289912, + "learning_rate": 0.00013565064478311838, + "loss": 3.405456066131592, + "step": 1653, + "token_acc": 0.2572387563335105 + }, + { + "epoch": 0.969803576663735, + "grad_norm": 1.134270799255079, + "learning_rate": 0.0001357327080890973, + "loss": 3.3528923988342285, + "step": 1654, + "token_acc": 0.26592354630201565 + }, + { + "epoch": 0.970389914980944, + "grad_norm": 1.494721916776184, + "learning_rate": 0.0001358147713950762, + "loss": 3.3605828285217285, + "step": 1655, + "token_acc": 0.2634616459168713 + }, + { + "epoch": 0.9709762532981531, + "grad_norm": 1.3226596718785637, + "learning_rate": 0.0001358968347010551, + "loss": 3.3690621852874756, + "step": 1656, + "token_acc": 0.2617710714675368 + }, + { + "epoch": 0.9715625916153621, + "grad_norm": 0.9200399239416597, + "learning_rate": 0.000135978898007034, + "loss": 3.343637704849243, + "step": 1657, + "token_acc": 0.26588609575466776 + }, + { + "epoch": 0.9721489299325711, + "grad_norm": 1.1197051847172135, + "learning_rate": 0.00013606096131301288, + "loss": 3.391432523727417, + "step": 1658, + "token_acc": 0.25887319724506297 + }, + { + "epoch": 0.9727352682497801, + "grad_norm": 2.0036792000164367, + "learning_rate": 0.0001361430246189918, + "loss": 3.365065574645996, + "step": 1659, + "token_acc": 0.26424418452794507 + }, + { + "epoch": 0.9733216065669892, + "grad_norm": 1.0211775899498772, + "learning_rate": 0.00013622508792497068, + "loss": 3.3939638137817383, + "step": 1660, + "token_acc": 0.25828521596659754 + }, + { + "epoch": 0.9739079448841982, + "grad_norm": 2.5699032730336535, + "learning_rate": 0.0001363071512309496, + "loss": 3.3959455490112305, + "step": 1661, + "token_acc": 0.25860752409792936 + }, + { + "epoch": 0.9744942832014072, + "grad_norm": 1.567678565561149, + "learning_rate": 0.00013638921453692848, + "loss": 3.3992919921875, + "step": 1662, + "token_acc": 0.2567820255257223 + }, + { + "epoch": 0.9750806215186162, + "grad_norm": 1.7020167352253, + "learning_rate": 0.00013647127784290736, + "loss": 3.416536331176758, + "step": 1663, + "token_acc": 0.2571967740450754 + }, + { + "epoch": 0.9756669598358253, + "grad_norm": 1.3874585451852086, + "learning_rate": 0.00013655334114888627, + "loss": 3.4685044288635254, + "step": 1664, + "token_acc": 0.25255281221394177 + }, + { + "epoch": 0.9762532981530343, + "grad_norm": 1.2932234411064543, + "learning_rate": 0.00013663540445486516, + "loss": 3.390638828277588, + "step": 1665, + "token_acc": 0.25879916648331025 + }, + { + "epoch": 0.9768396364702433, + "grad_norm": 1.578632599646592, + "learning_rate": 0.00013671746776084407, + "loss": 3.426136016845703, + "step": 1666, + "token_acc": 0.25617224063394983 + }, + { + "epoch": 0.9774259747874524, + "grad_norm": 1.084237595221012, + "learning_rate": 0.00013679953106682295, + "loss": 3.3864731788635254, + "step": 1667, + "token_acc": 0.26110061679323754 + }, + { + "epoch": 0.9780123131046614, + "grad_norm": 1.5392218472996468, + "learning_rate": 0.00013688159437280186, + "loss": 3.3718767166137695, + "step": 1668, + "token_acc": 0.26235911023378927 + }, + { + "epoch": 0.9785986514218704, + "grad_norm": 1.051083613356463, + "learning_rate": 0.00013696365767878075, + "loss": 3.41086745262146, + "step": 1669, + "token_acc": 0.25717329787150495 + }, + { + "epoch": 0.9791849897390794, + "grad_norm": 1.258071476706584, + "learning_rate": 0.00013704572098475966, + "loss": 3.4070990085601807, + "step": 1670, + "token_acc": 0.25760990383732907 + }, + { + "epoch": 0.9797713280562885, + "grad_norm": 1.1168723465295087, + "learning_rate": 0.00013712778429073854, + "loss": 3.3141238689422607, + "step": 1671, + "token_acc": 0.2704851045252109 + }, + { + "epoch": 0.9803576663734975, + "grad_norm": 1.1693316513329723, + "learning_rate": 0.00013720984759671746, + "loss": 3.415781021118164, + "step": 1672, + "token_acc": 0.2555888666814716 + }, + { + "epoch": 0.9809440046907065, + "grad_norm": 1.4059780599864335, + "learning_rate": 0.00013729191090269637, + "loss": 3.3873393535614014, + "step": 1673, + "token_acc": 0.25857956596105264 + }, + { + "epoch": 0.9815303430079155, + "grad_norm": 1.4082246000859102, + "learning_rate": 0.00013737397420867525, + "loss": 3.3875980377197266, + "step": 1674, + "token_acc": 0.2622277439142757 + }, + { + "epoch": 0.9821166813251246, + "grad_norm": 1.542758133702623, + "learning_rate": 0.00013745603751465416, + "loss": 3.439617872238159, + "step": 1675, + "token_acc": 0.2543997878116516 + }, + { + "epoch": 0.9827030196423336, + "grad_norm": 0.786107459017105, + "learning_rate": 0.00013753810082063305, + "loss": 3.347785472869873, + "step": 1676, + "token_acc": 0.265063819436443 + }, + { + "epoch": 0.9832893579595426, + "grad_norm": 1.460477706564235, + "learning_rate": 0.00013762016412661196, + "loss": 3.4017410278320312, + "step": 1677, + "token_acc": 0.2571645640275574 + }, + { + "epoch": 0.9838756962767516, + "grad_norm": 1.3495346252343778, + "learning_rate": 0.00013770222743259084, + "loss": 3.396604537963867, + "step": 1678, + "token_acc": 0.2600650718296865 + }, + { + "epoch": 0.9844620345939608, + "grad_norm": 1.3278307049506606, + "learning_rate": 0.00013778429073856976, + "loss": 3.3793435096740723, + "step": 1679, + "token_acc": 0.2606423385094418 + }, + { + "epoch": 0.9850483729111698, + "grad_norm": 1.234983240994458, + "learning_rate": 0.00013786635404454864, + "loss": 3.3579092025756836, + "step": 1680, + "token_acc": 0.2639951965749491 + }, + { + "epoch": 0.9856347112283788, + "grad_norm": 1.473129436132707, + "learning_rate": 0.00013794841735052752, + "loss": 3.3994297981262207, + "step": 1681, + "token_acc": 0.2599592628216668 + }, + { + "epoch": 0.9862210495455878, + "grad_norm": 1.4775422083214274, + "learning_rate": 0.00013803048065650644, + "loss": 3.3531432151794434, + "step": 1682, + "token_acc": 0.2638493869295917 + }, + { + "epoch": 0.9868073878627969, + "grad_norm": 0.9724421922025391, + "learning_rate": 0.00013811254396248532, + "loss": 3.403754711151123, + "step": 1683, + "token_acc": 0.2595373937791143 + }, + { + "epoch": 0.9873937261800059, + "grad_norm": 1.3203948903903946, + "learning_rate": 0.00013819460726846423, + "loss": 3.3758339881896973, + "step": 1684, + "token_acc": 0.261963516966489 + }, + { + "epoch": 0.9879800644972149, + "grad_norm": 1.6077139309364612, + "learning_rate": 0.00013827667057444312, + "loss": 3.3747713565826416, + "step": 1685, + "token_acc": 0.26089251878471087 + }, + { + "epoch": 0.9885664028144239, + "grad_norm": 1.1762764058122244, + "learning_rate": 0.00013835873388042203, + "loss": 3.365751266479492, + "step": 1686, + "token_acc": 0.2632811895506101 + }, + { + "epoch": 0.989152741131633, + "grad_norm": 1.0317129729236414, + "learning_rate": 0.0001384407971864009, + "loss": 3.3322577476501465, + "step": 1687, + "token_acc": 0.26728618969697127 + }, + { + "epoch": 0.989739079448842, + "grad_norm": 1.4496924096116623, + "learning_rate": 0.00013852286049237982, + "loss": 3.387169361114502, + "step": 1688, + "token_acc": 0.2596358654350455 + }, + { + "epoch": 0.990325417766051, + "grad_norm": 0.9412790832358663, + "learning_rate": 0.00013860492379835874, + "loss": 3.386284828186035, + "step": 1689, + "token_acc": 0.2614359772586368 + }, + { + "epoch": 0.99091175608326, + "grad_norm": 1.3581972488065917, + "learning_rate": 0.00013868698710433762, + "loss": 3.3547544479370117, + "step": 1690, + "token_acc": 0.2654190764195818 + }, + { + "epoch": 0.9914980944004691, + "grad_norm": 1.2899137442832138, + "learning_rate": 0.00013876905041031653, + "loss": 3.3697428703308105, + "step": 1691, + "token_acc": 0.26258819335968214 + }, + { + "epoch": 0.9920844327176781, + "grad_norm": 1.4137471218165154, + "learning_rate": 0.00013885111371629542, + "loss": 3.412522792816162, + "step": 1692, + "token_acc": 0.2563867588814132 + }, + { + "epoch": 0.9926707710348871, + "grad_norm": 1.067623006279297, + "learning_rate": 0.00013893317702227433, + "loss": 3.3776886463165283, + "step": 1693, + "token_acc": 0.2613984545381115 + }, + { + "epoch": 0.9932571093520962, + "grad_norm": 1.2379028749060677, + "learning_rate": 0.0001390152403282532, + "loss": 3.3797287940979004, + "step": 1694, + "token_acc": 0.2597831009542593 + }, + { + "epoch": 0.9938434476693052, + "grad_norm": 1.1923467832475487, + "learning_rate": 0.00013909730363423212, + "loss": 3.3570010662078857, + "step": 1695, + "token_acc": 0.2651219117945139 + }, + { + "epoch": 0.9944297859865142, + "grad_norm": 1.1936438759807122, + "learning_rate": 0.000139179366940211, + "loss": 3.32651424407959, + "step": 1696, + "token_acc": 0.26843226723487323 + }, + { + "epoch": 0.9950161243037232, + "grad_norm": 1.3047963527016206, + "learning_rate": 0.00013926143024618992, + "loss": 3.3679392337799072, + "step": 1697, + "token_acc": 0.26188865350305496 + }, + { + "epoch": 0.9956024626209323, + "grad_norm": 1.3365674570676664, + "learning_rate": 0.0001393434935521688, + "loss": 3.3872809410095215, + "step": 1698, + "token_acc": 0.25937124676036155 + }, + { + "epoch": 0.9961888009381413, + "grad_norm": 1.0251022863370607, + "learning_rate": 0.0001394255568581477, + "loss": 3.382136821746826, + "step": 1699, + "token_acc": 0.2603615297453439 + }, + { + "epoch": 0.9967751392553503, + "grad_norm": 1.455271015040403, + "learning_rate": 0.0001395076201641266, + "loss": 3.325009822845459, + "step": 1700, + "token_acc": 0.26742330153529054 + }, + { + "epoch": 0.9973614775725593, + "grad_norm": 1.124126346477058, + "learning_rate": 0.00013958968347010548, + "loss": 3.3369157314300537, + "step": 1701, + "token_acc": 0.26697835093050737 + }, + { + "epoch": 0.9979478158897684, + "grad_norm": 1.2281047611347757, + "learning_rate": 0.0001396717467760844, + "loss": 3.333434820175171, + "step": 1702, + "token_acc": 0.26504732668201586 + }, + { + "epoch": 0.9985341542069774, + "grad_norm": 1.3783098049339384, + "learning_rate": 0.00013975381008206328, + "loss": 3.3815784454345703, + "step": 1703, + "token_acc": 0.2627994321347063 + }, + { + "epoch": 0.9991204925241864, + "grad_norm": 0.9796998919694639, + "learning_rate": 0.0001398358733880422, + "loss": 3.3336775302886963, + "step": 1704, + "token_acc": 0.2677462755050459 + }, + { + "epoch": 0.9997068308413954, + "grad_norm": 1.1024303780641105, + "learning_rate": 0.00013991793669402108, + "loss": 3.3751683235168457, + "step": 1705, + "token_acc": 0.26120042872454446 + }, + { + "epoch": 1.0, + "grad_norm": 1.52881410902098, + "learning_rate": 0.00014, + "loss": 3.2945046424865723, + "step": 1706, + "token_acc": 0.27328458828082036 + }, + { + "epoch": 1.0, + "eval_loss": 3.344470739364624, + "eval_runtime": 21.8347, + "eval_samples_per_second": 11.724, + "eval_steps_per_second": 1.466, + "eval_token_acc": 0.26507068718743504, + "step": 1706 + }, + { + "epoch": 1.0005863383172091, + "grad_norm": 1.4307452499022542, + "learning_rate": 0.00013999999967122216, + "loss": 3.364198684692383, + "step": 1707, + "token_acc": 0.26313845212054365 + }, + { + "epoch": 1.001172676634418, + "grad_norm": 1.6717730599079252, + "learning_rate": 0.00013999999868488864, + "loss": 3.320115089416504, + "step": 1708, + "token_acc": 0.2681035332216579 + }, + { + "epoch": 1.0017590149516271, + "grad_norm": 1.005714895009179, + "learning_rate": 0.00013999999704099948, + "loss": 3.3462278842926025, + "step": 1709, + "token_acc": 0.2650901594541878 + }, + { + "epoch": 1.0023453532688362, + "grad_norm": 1.3132159576443299, + "learning_rate": 0.00013999999473955467, + "loss": 3.3265597820281982, + "step": 1710, + "token_acc": 0.26728518113281075 + }, + { + "epoch": 1.0029316915860451, + "grad_norm": 0.8945381419190027, + "learning_rate": 0.00013999999178055422, + "loss": 3.343167781829834, + "step": 1711, + "token_acc": 0.2654776076621743 + }, + { + "epoch": 1.0035180299032542, + "grad_norm": 1.5733901367013092, + "learning_rate": 0.0001399999881639982, + "loss": 3.439504384994507, + "step": 1712, + "token_acc": 0.2557064935064935 + }, + { + "epoch": 1.0041043682204631, + "grad_norm": 1.083829742141001, + "learning_rate": 0.00013999998388988658, + "loss": 3.3416497707366943, + "step": 1713, + "token_acc": 0.26527284615830293 + }, + { + "epoch": 1.0046907065376722, + "grad_norm": 1.0571573769099725, + "learning_rate": 0.00013999997895821945, + "loss": 3.3300538063049316, + "step": 1714, + "token_acc": 0.2649017793838182 + }, + { + "epoch": 1.0052770448548813, + "grad_norm": 0.9639593657514132, + "learning_rate": 0.00013999997336899687, + "loss": 3.297475576400757, + "step": 1715, + "token_acc": 0.2716643580541125 + }, + { + "epoch": 1.0058633831720902, + "grad_norm": 1.2355659873631477, + "learning_rate": 0.00013999996712221884, + "loss": 3.3221805095672607, + "step": 1716, + "token_acc": 0.26723839841203223 + }, + { + "epoch": 1.0064497214892993, + "grad_norm": 2.010721436296605, + "learning_rate": 0.00013999996021788545, + "loss": 3.3547515869140625, + "step": 1717, + "token_acc": 0.26365168331581257 + }, + { + "epoch": 1.0070360598065085, + "grad_norm": 0.7836491043342813, + "learning_rate": 0.00013999995265599679, + "loss": 3.3489737510681152, + "step": 1718, + "token_acc": 0.26232613725938925 + }, + { + "epoch": 1.0076223981237173, + "grad_norm": 1.7595445241121848, + "learning_rate": 0.00013999994443655286, + "loss": 3.3461060523986816, + "step": 1719, + "token_acc": 0.2642616950939924 + }, + { + "epoch": 1.0082087364409265, + "grad_norm": 1.2906697865151768, + "learning_rate": 0.0001399999355595538, + "loss": 3.37056303024292, + "step": 1720, + "token_acc": 0.26186812043751817 + }, + { + "epoch": 1.0087950747581353, + "grad_norm": 1.3021843035735605, + "learning_rate": 0.00013999992602499965, + "loss": 3.361006736755371, + "step": 1721, + "token_acc": 0.2641722385664616 + }, + { + "epoch": 1.0093814130753445, + "grad_norm": 1.1559058217501055, + "learning_rate": 0.00013999991583289053, + "loss": 3.35701060295105, + "step": 1722, + "token_acc": 0.26399307633906244 + }, + { + "epoch": 1.0099677513925536, + "grad_norm": 1.1128761077085572, + "learning_rate": 0.00013999990498322654, + "loss": 3.292968988418579, + "step": 1723, + "token_acc": 0.2698882235208923 + }, + { + "epoch": 1.0105540897097625, + "grad_norm": 1.1775002709096303, + "learning_rate": 0.00013999989347600778, + "loss": 3.372377872467041, + "step": 1724, + "token_acc": 0.2621188690193319 + }, + { + "epoch": 1.0111404280269716, + "grad_norm": 1.6740134002541207, + "learning_rate": 0.0001399998813112343, + "loss": 3.340231418609619, + "step": 1725, + "token_acc": 0.26563895022159967 + }, + { + "epoch": 1.0117267663441807, + "grad_norm": 1.4532001287426262, + "learning_rate": 0.0001399998684889063, + "loss": 3.361891269683838, + "step": 1726, + "token_acc": 0.2601422979695018 + }, + { + "epoch": 1.0123131046613896, + "grad_norm": 1.373754460250308, + "learning_rate": 0.00013999985500902384, + "loss": 3.3556835651397705, + "step": 1727, + "token_acc": 0.2603735736987443 + }, + { + "epoch": 1.0128994429785987, + "grad_norm": 1.476105696621109, + "learning_rate": 0.0001399998408715871, + "loss": 3.384720802307129, + "step": 1728, + "token_acc": 0.25817561935291927 + }, + { + "epoch": 1.0134857812958076, + "grad_norm": 1.1608929621074666, + "learning_rate": 0.00013999982607659615, + "loss": 3.330758571624756, + "step": 1729, + "token_acc": 0.2660935727022208 + }, + { + "epoch": 1.0140721196130167, + "grad_norm": 1.14026194396528, + "learning_rate": 0.00013999981062405117, + "loss": 3.3556740283966064, + "step": 1730, + "token_acc": 0.26242149071033655 + }, + { + "epoch": 1.0146584579302258, + "grad_norm": 1.2240118783392269, + "learning_rate": 0.0001399997945139523, + "loss": 3.2899863719940186, + "step": 1731, + "token_acc": 0.26919836179426426 + }, + { + "epoch": 1.0152447962474347, + "grad_norm": 1.4454148337700305, + "learning_rate": 0.00013999977774629968, + "loss": 3.2921361923217773, + "step": 1732, + "token_acc": 0.2704427324419113 + }, + { + "epoch": 1.0158311345646438, + "grad_norm": 0.7945831802358312, + "learning_rate": 0.0001399997603210935, + "loss": 3.2521042823791504, + "step": 1733, + "token_acc": 0.27634130406376584 + }, + { + "epoch": 1.016417472881853, + "grad_norm": 1.4596530159767214, + "learning_rate": 0.00013999974223833384, + "loss": 3.3401317596435547, + "step": 1734, + "token_acc": 0.2631723149013533 + }, + { + "epoch": 1.0170038111990618, + "grad_norm": 0.9223644796862505, + "learning_rate": 0.00013999972349802096, + "loss": 3.345731496810913, + "step": 1735, + "token_acc": 0.26548395201794533 + }, + { + "epoch": 1.017590149516271, + "grad_norm": 1.1637294576974744, + "learning_rate": 0.000139999704100155, + "loss": 3.3097307682037354, + "step": 1736, + "token_acc": 0.26968218452898274 + }, + { + "epoch": 1.01817648783348, + "grad_norm": 1.1466044474144608, + "learning_rate": 0.00013999968404473616, + "loss": 3.362715721130371, + "step": 1737, + "token_acc": 0.26228215823647727 + }, + { + "epoch": 1.018762826150689, + "grad_norm": 1.331175187208717, + "learning_rate": 0.0001399996633317646, + "loss": 3.3232498168945312, + "step": 1738, + "token_acc": 0.26689275856053685 + }, + { + "epoch": 1.019349164467898, + "grad_norm": 0.9820061971371875, + "learning_rate": 0.0001399996419612405, + "loss": 3.2821009159088135, + "step": 1739, + "token_acc": 0.27197503538999385 + }, + { + "epoch": 1.019935502785107, + "grad_norm": 1.2783143507305557, + "learning_rate": 0.00013999961993316416, + "loss": 3.337312698364258, + "step": 1740, + "token_acc": 0.2649543415317064 + }, + { + "epoch": 1.020521841102316, + "grad_norm": 1.418788593329582, + "learning_rate": 0.0001399995972475357, + "loss": 3.315840721130371, + "step": 1741, + "token_acc": 0.26926669290305655 + }, + { + "epoch": 1.0211081794195251, + "grad_norm": 1.1592718796004187, + "learning_rate": 0.0001399995739043553, + "loss": 3.33168625831604, + "step": 1742, + "token_acc": 0.2683552446427854 + }, + { + "epoch": 1.021694517736734, + "grad_norm": 1.4019478133008025, + "learning_rate": 0.00013999954990362326, + "loss": 3.3924148082733154, + "step": 1743, + "token_acc": 0.2575531872216948 + }, + { + "epoch": 1.0222808560539431, + "grad_norm": 1.4229998389239662, + "learning_rate": 0.00013999952524533976, + "loss": 3.324014902114868, + "step": 1744, + "token_acc": 0.26748387262917994 + }, + { + "epoch": 1.0228671943711523, + "grad_norm": 1.641438329660123, + "learning_rate": 0.00013999949992950507, + "loss": 3.3799757957458496, + "step": 1745, + "token_acc": 0.26034774660614685 + }, + { + "epoch": 1.0234535326883611, + "grad_norm": 0.9667915164729465, + "learning_rate": 0.00013999947395611939, + "loss": 3.32035493850708, + "step": 1746, + "token_acc": 0.2693528338987886 + }, + { + "epoch": 1.0240398710055703, + "grad_norm": 1.1901312751578843, + "learning_rate": 0.00013999944732518297, + "loss": 3.3056960105895996, + "step": 1747, + "token_acc": 0.26852144029571723 + }, + { + "epoch": 1.0246262093227791, + "grad_norm": 1.0098560142913988, + "learning_rate": 0.00013999942003669607, + "loss": 3.3067548274993896, + "step": 1748, + "token_acc": 0.2688751539900233 + }, + { + "epoch": 1.0252125476399883, + "grad_norm": 1.6576624524111359, + "learning_rate": 0.00013999939209065896, + "loss": 3.3458027839660645, + "step": 1749, + "token_acc": 0.26475199816436573 + }, + { + "epoch": 1.0257988859571974, + "grad_norm": 0.8018676153235134, + "learning_rate": 0.00013999936348707188, + "loss": 3.3323605060577393, + "step": 1750, + "token_acc": 0.26473924806900667 + }, + { + "epoch": 1.0263852242744063, + "grad_norm": 1.2913185013192388, + "learning_rate": 0.0001399993342259351, + "loss": 3.3125858306884766, + "step": 1751, + "token_acc": 0.2691325709626896 + }, + { + "epoch": 1.0269715625916154, + "grad_norm": 1.2427545375944296, + "learning_rate": 0.00013999930430724891, + "loss": 3.314629554748535, + "step": 1752, + "token_acc": 0.26635188253779285 + }, + { + "epoch": 1.0275579009088245, + "grad_norm": 1.4234333177981138, + "learning_rate": 0.00013999927373101358, + "loss": 3.326265811920166, + "step": 1753, + "token_acc": 0.26543636737947307 + }, + { + "epoch": 1.0281442392260334, + "grad_norm": 1.2328991244751653, + "learning_rate": 0.00013999924249722938, + "loss": 3.318502426147461, + "step": 1754, + "token_acc": 0.2697015066294393 + }, + { + "epoch": 1.0287305775432425, + "grad_norm": 1.1151485292462047, + "learning_rate": 0.00013999921060589663, + "loss": 3.350287914276123, + "step": 1755, + "token_acc": 0.26171984487353933 + }, + { + "epoch": 1.0293169158604514, + "grad_norm": 1.1830828332974732, + "learning_rate": 0.00013999917805701564, + "loss": 3.314169406890869, + "step": 1756, + "token_acc": 0.2693471911996869 + }, + { + "epoch": 1.0299032541776605, + "grad_norm": 1.448941355344543, + "learning_rate": 0.00013999914485058666, + "loss": 3.3251256942749023, + "step": 1757, + "token_acc": 0.2660629722101878 + }, + { + "epoch": 1.0304895924948696, + "grad_norm": 0.9251562530372269, + "learning_rate": 0.0001399991109866101, + "loss": 3.293213367462158, + "step": 1758, + "token_acc": 0.270832581471724 + }, + { + "epoch": 1.0310759308120785, + "grad_norm": 1.3304710436206886, + "learning_rate": 0.00013999907646508616, + "loss": 3.3478100299835205, + "step": 1759, + "token_acc": 0.2649337326823979 + }, + { + "epoch": 1.0316622691292876, + "grad_norm": 1.3638555434275688, + "learning_rate": 0.00013999904128601524, + "loss": 3.334480047225952, + "step": 1760, + "token_acc": 0.26499975478998405 + }, + { + "epoch": 1.0322486074464967, + "grad_norm": 1.275334425678453, + "learning_rate": 0.00013999900544939763, + "loss": 3.333841323852539, + "step": 1761, + "token_acc": 0.26601950870635405 + }, + { + "epoch": 1.0328349457637056, + "grad_norm": 1.0297007561032134, + "learning_rate": 0.00013999896895523374, + "loss": 3.3275198936462402, + "step": 1762, + "token_acc": 0.26690641358471295 + }, + { + "epoch": 1.0334212840809147, + "grad_norm": 1.1464697616979012, + "learning_rate": 0.00013999893180352384, + "loss": 3.327052593231201, + "step": 1763, + "token_acc": 0.2662565361330844 + }, + { + "epoch": 1.0340076223981236, + "grad_norm": 1.3890806466930228, + "learning_rate": 0.00013999889399426827, + "loss": 3.32881498336792, + "step": 1764, + "token_acc": 0.2653418132220133 + }, + { + "epoch": 1.0345939607153327, + "grad_norm": 1.1148908338911234, + "learning_rate": 0.00013999885552746746, + "loss": 3.288656711578369, + "step": 1765, + "token_acc": 0.2705376467160576 + }, + { + "epoch": 1.0351802990325418, + "grad_norm": 0.9804558206970546, + "learning_rate": 0.00013999881640312168, + "loss": 3.2847766876220703, + "step": 1766, + "token_acc": 0.27178461885608285 + }, + { + "epoch": 1.0357666373497507, + "grad_norm": 1.0520160748981298, + "learning_rate": 0.0001399987766212314, + "loss": 3.3211231231689453, + "step": 1767, + "token_acc": 0.26816973261552973 + }, + { + "epoch": 1.0363529756669598, + "grad_norm": 1.3013189369712828, + "learning_rate": 0.00013999873618179688, + "loss": 3.3616676330566406, + "step": 1768, + "token_acc": 0.26238565470230196 + }, + { + "epoch": 1.036939313984169, + "grad_norm": 1.1876083376368192, + "learning_rate": 0.00013999869508481857, + "loss": 3.290506362915039, + "step": 1769, + "token_acc": 0.26963750873239134 + }, + { + "epoch": 1.0375256523013778, + "grad_norm": 0.9999175436543033, + "learning_rate": 0.00013999865333029688, + "loss": 3.3091888427734375, + "step": 1770, + "token_acc": 0.2684587526399189 + }, + { + "epoch": 1.038111990618587, + "grad_norm": 1.2294061108614425, + "learning_rate": 0.00013999861091823214, + "loss": 3.336608409881592, + "step": 1771, + "token_acc": 0.26666232599325457 + }, + { + "epoch": 1.038698328935796, + "grad_norm": 1.0774548200044143, + "learning_rate": 0.00013999856784862477, + "loss": 3.360105037689209, + "step": 1772, + "token_acc": 0.2623185804194118 + }, + { + "epoch": 1.039284667253005, + "grad_norm": 1.0587649497427865, + "learning_rate": 0.0001399985241214752, + "loss": 3.3470427989959717, + "step": 1773, + "token_acc": 0.2633832636845903 + }, + { + "epoch": 1.039871005570214, + "grad_norm": 0.9852565223605219, + "learning_rate": 0.00013999847973678384, + "loss": 3.2943501472473145, + "step": 1774, + "token_acc": 0.2704324064335058 + }, + { + "epoch": 1.040457343887423, + "grad_norm": 1.1513684988755293, + "learning_rate": 0.00013999843469455104, + "loss": 3.3257832527160645, + "step": 1775, + "token_acc": 0.266993504044675 + }, + { + "epoch": 1.041043682204632, + "grad_norm": 0.8900799659357775, + "learning_rate": 0.0001399983889947773, + "loss": 3.354994773864746, + "step": 1776, + "token_acc": 0.2626962462807357 + }, + { + "epoch": 1.0416300205218412, + "grad_norm": 0.7730218706339016, + "learning_rate": 0.00013999834263746298, + "loss": 3.321240186691284, + "step": 1777, + "token_acc": 0.2660936202822658 + }, + { + "epoch": 1.04221635883905, + "grad_norm": 0.833763646708899, + "learning_rate": 0.0001399982956226086, + "loss": 3.3348751068115234, + "step": 1778, + "token_acc": 0.2657338734967251 + }, + { + "epoch": 1.0428026971562592, + "grad_norm": 1.1107716362616493, + "learning_rate": 0.00013999824795021454, + "loss": 3.301572799682617, + "step": 1779, + "token_acc": 0.2695578032959372 + }, + { + "epoch": 1.0433890354734683, + "grad_norm": 1.4856542278496963, + "learning_rate": 0.00013999819962028125, + "loss": 3.3440115451812744, + "step": 1780, + "token_acc": 0.2624542820679986 + }, + { + "epoch": 1.0439753737906772, + "grad_norm": 1.040143884670671, + "learning_rate": 0.00013999815063280921, + "loss": 3.319406509399414, + "step": 1781, + "token_acc": 0.26633087930714633 + }, + { + "epoch": 1.0445617121078863, + "grad_norm": 1.2654119906126347, + "learning_rate": 0.0001399981009877989, + "loss": 3.2569477558135986, + "step": 1782, + "token_acc": 0.27400824709301574 + }, + { + "epoch": 1.0451480504250952, + "grad_norm": 0.9213956527880066, + "learning_rate": 0.00013999805068525068, + "loss": 3.323514461517334, + "step": 1783, + "token_acc": 0.2672733729671408 + }, + { + "epoch": 1.0457343887423043, + "grad_norm": 1.0725791999185954, + "learning_rate": 0.00013999799972516517, + "loss": 3.2772116661071777, + "step": 1784, + "token_acc": 0.2731096022657571 + }, + { + "epoch": 1.0463207270595134, + "grad_norm": 1.205998028856346, + "learning_rate": 0.00013999794810754275, + "loss": 3.3316054344177246, + "step": 1785, + "token_acc": 0.2652874581482302 + }, + { + "epoch": 1.0469070653767223, + "grad_norm": 1.1401862560703175, + "learning_rate": 0.00013999789583238394, + "loss": 3.3418891429901123, + "step": 1786, + "token_acc": 0.26512752976590426 + }, + { + "epoch": 1.0474934036939314, + "grad_norm": 0.9815506604671348, + "learning_rate": 0.0001399978428996892, + "loss": 3.3635711669921875, + "step": 1787, + "token_acc": 0.26306641578174056 + }, + { + "epoch": 1.0480797420111405, + "grad_norm": 1.1673272600976379, + "learning_rate": 0.00013999778930945907, + "loss": 3.317405939102173, + "step": 1788, + "token_acc": 0.26699733818087384 + }, + { + "epoch": 1.0486660803283494, + "grad_norm": 1.3283198698217937, + "learning_rate": 0.000139997735061694, + "loss": 3.3545851707458496, + "step": 1789, + "token_acc": 0.2629396898175641 + }, + { + "epoch": 1.0492524186455585, + "grad_norm": 0.8064309407733516, + "learning_rate": 0.00013999768015639458, + "loss": 3.2974672317504883, + "step": 1790, + "token_acc": 0.27078108907608955 + }, + { + "epoch": 1.0498387569627674, + "grad_norm": 1.1086236393177713, + "learning_rate": 0.00013999762459356125, + "loss": 3.3527579307556152, + "step": 1791, + "token_acc": 0.26184795633071495 + }, + { + "epoch": 1.0504250952799765, + "grad_norm": 1.1115559787350677, + "learning_rate": 0.00013999756837319456, + "loss": 3.3284459114074707, + "step": 1792, + "token_acc": 0.26580342656636935 + }, + { + "epoch": 1.0510114335971856, + "grad_norm": 0.9671510985756274, + "learning_rate": 0.00013999751149529503, + "loss": 3.2788405418395996, + "step": 1793, + "token_acc": 0.2719465648854962 + }, + { + "epoch": 1.0515977719143945, + "grad_norm": 1.5588599662360043, + "learning_rate": 0.0001399974539598632, + "loss": 3.346158981323242, + "step": 1794, + "token_acc": 0.26401828262632787 + }, + { + "epoch": 1.0521841102316036, + "grad_norm": 0.8747284511001189, + "learning_rate": 0.00013999739576689963, + "loss": 3.3254544734954834, + "step": 1795, + "token_acc": 0.26666192155808277 + }, + { + "epoch": 1.0527704485488127, + "grad_norm": 1.1488101779700044, + "learning_rate": 0.00013999733691640487, + "loss": 3.31069278717041, + "step": 1796, + "token_acc": 0.2670141648965976 + }, + { + "epoch": 1.0533567868660216, + "grad_norm": 0.7678092789412517, + "learning_rate": 0.0001399972774083794, + "loss": 3.296076774597168, + "step": 1797, + "token_acc": 0.2692060439129685 + }, + { + "epoch": 1.0539431251832307, + "grad_norm": 1.0599135325538214, + "learning_rate": 0.00013999721724282388, + "loss": 3.3422317504882812, + "step": 1798, + "token_acc": 0.26471112275645226 + }, + { + "epoch": 1.0545294635004399, + "grad_norm": 1.2805634509628596, + "learning_rate": 0.0001399971564197388, + "loss": 3.3032217025756836, + "step": 1799, + "token_acc": 0.26969189086366185 + }, + { + "epoch": 1.0551158018176487, + "grad_norm": 0.9879500681445028, + "learning_rate": 0.00013999709493912475, + "loss": 3.295577049255371, + "step": 1800, + "token_acc": 0.26896933982904986 + }, + { + "epoch": 1.0557021401348579, + "grad_norm": 1.01711912746615, + "learning_rate": 0.00013999703280098236, + "loss": 3.357044219970703, + "step": 1801, + "token_acc": 0.2624168178676903 + }, + { + "epoch": 1.0562884784520667, + "grad_norm": 0.9297610177216736, + "learning_rate": 0.00013999697000531214, + "loss": 3.3503241539001465, + "step": 1802, + "token_acc": 0.2630404081818596 + }, + { + "epoch": 1.0568748167692759, + "grad_norm": 1.1275169653231805, + "learning_rate": 0.0001399969065521147, + "loss": 3.3427200317382812, + "step": 1803, + "token_acc": 0.2631673201880645 + }, + { + "epoch": 1.057461155086485, + "grad_norm": 1.5168705046673456, + "learning_rate": 0.00013999684244139066, + "loss": 3.3364458084106445, + "step": 1804, + "token_acc": 0.2648679766453736 + }, + { + "epoch": 1.0580474934036939, + "grad_norm": 0.7758619041488104, + "learning_rate": 0.0001399967776731406, + "loss": 3.321136474609375, + "step": 1805, + "token_acc": 0.2664464859205005 + }, + { + "epoch": 1.058633831720903, + "grad_norm": 1.3018297683569777, + "learning_rate": 0.00013999671224736512, + "loss": 3.308809280395508, + "step": 1806, + "token_acc": 0.2677616782301705 + }, + { + "epoch": 1.059220170038112, + "grad_norm": 1.3814469554642985, + "learning_rate": 0.00013999664616406486, + "loss": 3.313599109649658, + "step": 1807, + "token_acc": 0.26609996474556297 + }, + { + "epoch": 1.059806508355321, + "grad_norm": 0.7960372984128132, + "learning_rate": 0.00013999657942324043, + "loss": 3.304856300354004, + "step": 1808, + "token_acc": 0.2698606652569042 + }, + { + "epoch": 1.06039284667253, + "grad_norm": 1.4294149614854543, + "learning_rate": 0.00013999651202489246, + "loss": 3.336911916732788, + "step": 1809, + "token_acc": 0.26349588953227615 + }, + { + "epoch": 1.060979184989739, + "grad_norm": 0.751744999311584, + "learning_rate": 0.0001399964439690216, + "loss": 3.327693462371826, + "step": 1810, + "token_acc": 0.2646924043333196 + }, + { + "epoch": 1.061565523306948, + "grad_norm": 0.9909147085150155, + "learning_rate": 0.00013999637525562847, + "loss": 3.3280014991760254, + "step": 1811, + "token_acc": 0.2655039607297313 + }, + { + "epoch": 1.0621518616241572, + "grad_norm": 1.3325706766957, + "learning_rate": 0.00013999630588471367, + "loss": 3.3167190551757812, + "step": 1812, + "token_acc": 0.2676403597341912 + }, + { + "epoch": 1.062738199941366, + "grad_norm": 1.0086630479858323, + "learning_rate": 0.00013999623585627795, + "loss": 3.305112838745117, + "step": 1813, + "token_acc": 0.26868874043009705 + }, + { + "epoch": 1.0633245382585752, + "grad_norm": 1.089611211061808, + "learning_rate": 0.0001399961651703219, + "loss": 3.343794345855713, + "step": 1814, + "token_acc": 0.2644560030085224 + }, + { + "epoch": 1.0639108765757843, + "grad_norm": 1.071721484523487, + "learning_rate": 0.00013999609382684617, + "loss": 3.3233747482299805, + "step": 1815, + "token_acc": 0.26702088422538073 + }, + { + "epoch": 1.0644972148929932, + "grad_norm": 1.0832754803123985, + "learning_rate": 0.0001399960218258515, + "loss": 3.331878185272217, + "step": 1816, + "token_acc": 0.26516927479754004 + }, + { + "epoch": 1.0650835532102023, + "grad_norm": 1.1567334820171873, + "learning_rate": 0.0001399959491673385, + "loss": 3.2944395542144775, + "step": 1817, + "token_acc": 0.26997135208754053 + }, + { + "epoch": 1.0656698915274112, + "grad_norm": 1.0439031025417007, + "learning_rate": 0.0001399958758513079, + "loss": 3.322361946105957, + "step": 1818, + "token_acc": 0.2668553888481103 + }, + { + "epoch": 1.0662562298446203, + "grad_norm": 1.613010241333247, + "learning_rate": 0.00013999580187776034, + "loss": 3.3395676612854004, + "step": 1819, + "token_acc": 0.2616043627724483 + }, + { + "epoch": 1.0668425681618294, + "grad_norm": 0.7473231670673615, + "learning_rate": 0.00013999572724669656, + "loss": 3.3388588428497314, + "step": 1820, + "token_acc": 0.26432208865170653 + }, + { + "epoch": 1.0674289064790383, + "grad_norm": 1.1500927404570311, + "learning_rate": 0.00013999565195811723, + "loss": 3.3333349227905273, + "step": 1821, + "token_acc": 0.26493488741331905 + }, + { + "epoch": 1.0680152447962474, + "grad_norm": 1.33547826635988, + "learning_rate": 0.00013999557601202306, + "loss": 3.330199956893921, + "step": 1822, + "token_acc": 0.26604604589340675 + }, + { + "epoch": 1.0686015831134565, + "grad_norm": 1.0368455000288705, + "learning_rate": 0.00013999549940841479, + "loss": 3.294459819793701, + "step": 1823, + "token_acc": 0.26879343102816744 + }, + { + "epoch": 1.0691879214306654, + "grad_norm": 1.2093250485907292, + "learning_rate": 0.00013999542214729313, + "loss": 3.3290820121765137, + "step": 1824, + "token_acc": 0.2657802680473478 + }, + { + "epoch": 1.0697742597478745, + "grad_norm": 0.8713838553403832, + "learning_rate": 0.0001399953442286588, + "loss": 3.2968857288360596, + "step": 1825, + "token_acc": 0.26941374382721633 + }, + { + "epoch": 1.0703605980650837, + "grad_norm": 1.4551536889062209, + "learning_rate": 0.00013999526565251254, + "loss": 3.2816882133483887, + "step": 1826, + "token_acc": 0.2716676066929874 + }, + { + "epoch": 1.0709469363822925, + "grad_norm": 0.8507611938412795, + "learning_rate": 0.00013999518641885506, + "loss": 3.2701101303100586, + "step": 1827, + "token_acc": 0.27357226394435313 + }, + { + "epoch": 1.0715332746995017, + "grad_norm": 1.5293370848418248, + "learning_rate": 0.00013999510652768713, + "loss": 3.2855193614959717, + "step": 1828, + "token_acc": 0.26786718169123913 + }, + { + "epoch": 1.0721196130167105, + "grad_norm": 0.8636019805976295, + "learning_rate": 0.0001399950259790095, + "loss": 3.3260278701782227, + "step": 1829, + "token_acc": 0.26390238891127915 + }, + { + "epoch": 1.0727059513339197, + "grad_norm": 1.0369510616555084, + "learning_rate": 0.00013999494477282288, + "loss": 3.3669614791870117, + "step": 1830, + "token_acc": 0.25850625699096197 + }, + { + "epoch": 1.0732922896511288, + "grad_norm": 1.2398367043044685, + "learning_rate": 0.0001399948629091281, + "loss": 3.2878429889678955, + "step": 1831, + "token_acc": 0.2720432207594776 + }, + { + "epoch": 1.0738786279683377, + "grad_norm": 1.273925486807277, + "learning_rate": 0.00013999478038792594, + "loss": 3.2479612827301025, + "step": 1832, + "token_acc": 0.27589032261474944 + }, + { + "epoch": 1.0744649662855468, + "grad_norm": 0.9146816916096273, + "learning_rate": 0.0001399946972092171, + "loss": 3.301370859146118, + "step": 1833, + "token_acc": 0.26903573100327666 + }, + { + "epoch": 1.0750513046027559, + "grad_norm": 1.6017705278626457, + "learning_rate": 0.0001399946133730024, + "loss": 3.281545639038086, + "step": 1834, + "token_acc": 0.2708565469747056 + }, + { + "epoch": 1.0756376429199648, + "grad_norm": 0.9833885208929952, + "learning_rate": 0.00013999452887928263, + "loss": 3.338879346847534, + "step": 1835, + "token_acc": 0.2650875780920624 + }, + { + "epoch": 1.0762239812371739, + "grad_norm": 1.3698570040137303, + "learning_rate": 0.00013999444372805858, + "loss": 3.2998180389404297, + "step": 1836, + "token_acc": 0.26981970184550835 + }, + { + "epoch": 1.0768103195543828, + "grad_norm": 1.2877304166096115, + "learning_rate": 0.00013999435791933103, + "loss": 3.348214864730835, + "step": 1837, + "token_acc": 0.26316915060853335 + }, + { + "epoch": 1.077396657871592, + "grad_norm": 1.1564317398648107, + "learning_rate": 0.00013999427145310083, + "loss": 3.303995132446289, + "step": 1838, + "token_acc": 0.2669063967215784 + }, + { + "epoch": 1.077982996188801, + "grad_norm": 1.222992538862914, + "learning_rate": 0.00013999418432936877, + "loss": 3.3253049850463867, + "step": 1839, + "token_acc": 0.2655641017688594 + }, + { + "epoch": 1.07856933450601, + "grad_norm": 0.8980403231458256, + "learning_rate": 0.00013999409654813564, + "loss": 3.3390817642211914, + "step": 1840, + "token_acc": 0.2630144347902473 + }, + { + "epoch": 1.079155672823219, + "grad_norm": 1.1559855607120255, + "learning_rate": 0.0001399940081094023, + "loss": 3.277944803237915, + "step": 1841, + "token_acc": 0.2717599401398457 + }, + { + "epoch": 1.0797420111404281, + "grad_norm": 0.8730834210500212, + "learning_rate": 0.00013999391901316961, + "loss": 3.2907848358154297, + "step": 1842, + "token_acc": 0.27076429388477885 + }, + { + "epoch": 1.080328349457637, + "grad_norm": 0.8635917251124942, + "learning_rate": 0.00013999382925943834, + "loss": 3.3359527587890625, + "step": 1843, + "token_acc": 0.2648700550163284 + }, + { + "epoch": 1.0809146877748461, + "grad_norm": 0.6098258875311179, + "learning_rate": 0.00013999373884820935, + "loss": 3.3045454025268555, + "step": 1844, + "token_acc": 0.26686216213283565 + }, + { + "epoch": 1.081501026092055, + "grad_norm": 1.1251604261023664, + "learning_rate": 0.00013999364777948352, + "loss": 3.290310859680176, + "step": 1845, + "token_acc": 0.26986378890570506 + }, + { + "epoch": 1.0820873644092641, + "grad_norm": 1.2461749520837204, + "learning_rate": 0.0001399935560532617, + "loss": 3.252135992050171, + "step": 1846, + "token_acc": 0.27515763637857377 + }, + { + "epoch": 1.0826737027264732, + "grad_norm": 0.8100914081760897, + "learning_rate": 0.00013999346366954472, + "loss": 3.2760884761810303, + "step": 1847, + "token_acc": 0.2718901671404027 + }, + { + "epoch": 1.0832600410436821, + "grad_norm": 1.2656913646658259, + "learning_rate": 0.00013999337062833346, + "loss": 3.2524685859680176, + "step": 1848, + "token_acc": 0.2741837960437387 + }, + { + "epoch": 1.0838463793608912, + "grad_norm": 0.7247251981099396, + "learning_rate": 0.0001399932769296288, + "loss": 3.2182953357696533, + "step": 1849, + "token_acc": 0.28022362706776244 + }, + { + "epoch": 1.0844327176781003, + "grad_norm": 0.908585700878057, + "learning_rate": 0.00013999318257343162, + "loss": 3.3057150840759277, + "step": 1850, + "token_acc": 0.26736236697724414 + }, + { + "epoch": 1.0850190559953092, + "grad_norm": 1.1017466213489573, + "learning_rate": 0.0001399930875597428, + "loss": 3.2950050830841064, + "step": 1851, + "token_acc": 0.2686912610464301 + }, + { + "epoch": 1.0856053943125183, + "grad_norm": 1.1491698301440563, + "learning_rate": 0.00013999299188856328, + "loss": 3.301602363586426, + "step": 1852, + "token_acc": 0.27003996254691753 + }, + { + "epoch": 1.0861917326297275, + "grad_norm": 0.9181808351619032, + "learning_rate": 0.00013999289555989387, + "loss": 3.316830635070801, + "step": 1853, + "token_acc": 0.2664258454991918 + }, + { + "epoch": 1.0867780709469363, + "grad_norm": 1.0216041658426336, + "learning_rate": 0.00013999279857373556, + "loss": 3.343573808670044, + "step": 1854, + "token_acc": 0.2636155454515283 + }, + { + "epoch": 1.0873644092641455, + "grad_norm": 1.2934262879052087, + "learning_rate": 0.00013999270093008922, + "loss": 3.25911283493042, + "step": 1855, + "token_acc": 0.2745379775058217 + }, + { + "epoch": 1.0879507475813543, + "grad_norm": 0.6503468020493017, + "learning_rate": 0.00013999260262895574, + "loss": 3.306527614593506, + "step": 1856, + "token_acc": 0.26668879982925847 + }, + { + "epoch": 1.0885370858985635, + "grad_norm": 0.9405648187380918, + "learning_rate": 0.0001399925036703361, + "loss": 3.364485740661621, + "step": 1857, + "token_acc": 0.2604449170868718 + }, + { + "epoch": 1.0891234242157726, + "grad_norm": 1.497570455005679, + "learning_rate": 0.0001399924040542312, + "loss": 3.309842824935913, + "step": 1858, + "token_acc": 0.2660288891941558 + }, + { + "epoch": 1.0897097625329815, + "grad_norm": 0.7199694119735058, + "learning_rate": 0.00013999230378064197, + "loss": 3.308772087097168, + "step": 1859, + "token_acc": 0.26789840644914703 + }, + { + "epoch": 1.0902961008501906, + "grad_norm": 1.5180060422852206, + "learning_rate": 0.00013999220284956936, + "loss": 3.3184075355529785, + "step": 1860, + "token_acc": 0.26377301378245727 + }, + { + "epoch": 1.0908824391673997, + "grad_norm": 0.7701802885184044, + "learning_rate": 0.00013999210126101433, + "loss": 3.3614258766174316, + "step": 1861, + "token_acc": 0.26163085084334 + }, + { + "epoch": 1.0914687774846086, + "grad_norm": 1.2566696807955915, + "learning_rate": 0.00013999199901497782, + "loss": 3.2930243015289307, + "step": 1862, + "token_acc": 0.27156288394509687 + }, + { + "epoch": 1.0920551158018177, + "grad_norm": 0.9182559800217669, + "learning_rate": 0.00013999189611146081, + "loss": 3.31693172454834, + "step": 1863, + "token_acc": 0.2675856386356599 + }, + { + "epoch": 1.0926414541190266, + "grad_norm": 1.1426842117053462, + "learning_rate": 0.00013999179255046423, + "loss": 3.252509593963623, + "step": 1864, + "token_acc": 0.274149438645499 + }, + { + "epoch": 1.0932277924362357, + "grad_norm": 1.1106417396325976, + "learning_rate": 0.0001399916883319891, + "loss": 3.261803150177002, + "step": 1865, + "token_acc": 0.2728094106750961 + }, + { + "epoch": 1.0938141307534448, + "grad_norm": 0.9954299856150943, + "learning_rate": 0.00013999158345603637, + "loss": 3.2489876747131348, + "step": 1866, + "token_acc": 0.2752487289506927 + }, + { + "epoch": 1.0944004690706537, + "grad_norm": 0.9836350810312638, + "learning_rate": 0.000139991477922607, + "loss": 3.2866289615631104, + "step": 1867, + "token_acc": 0.2723171984892768 + }, + { + "epoch": 1.0949868073878628, + "grad_norm": 0.9507489955610542, + "learning_rate": 0.00013999137173170202, + "loss": 3.323457956314087, + "step": 1868, + "token_acc": 0.2655584383140287 + }, + { + "epoch": 1.095573145705072, + "grad_norm": 0.8450988642634238, + "learning_rate": 0.0001399912648833224, + "loss": 3.3118367195129395, + "step": 1869, + "token_acc": 0.26796890995653716 + }, + { + "epoch": 1.0961594840222808, + "grad_norm": 0.9351306648347011, + "learning_rate": 0.0001399911573774692, + "loss": 3.322701930999756, + "step": 1870, + "token_acc": 0.26708102472918555 + }, + { + "epoch": 1.09674582233949, + "grad_norm": 0.9842668059515658, + "learning_rate": 0.00013999104921414335, + "loss": 3.2851157188415527, + "step": 1871, + "token_acc": 0.27061894851619056 + }, + { + "epoch": 1.0973321606566988, + "grad_norm": 1.166577425011579, + "learning_rate": 0.00013999094039334595, + "loss": 3.2874763011932373, + "step": 1872, + "token_acc": 0.2716030379007073 + }, + { + "epoch": 1.097918498973908, + "grad_norm": 1.0197718254792618, + "learning_rate": 0.00013999083091507797, + "loss": 3.2692391872406006, + "step": 1873, + "token_acc": 0.2719374756427013 + }, + { + "epoch": 1.098504837291117, + "grad_norm": 0.8781846958233227, + "learning_rate": 0.00013999072077934042, + "loss": 3.2783310413360596, + "step": 1874, + "token_acc": 0.2736820895788859 + }, + { + "epoch": 1.099091175608326, + "grad_norm": 1.1599809443604552, + "learning_rate": 0.00013999060998613438, + "loss": 3.3003063201904297, + "step": 1875, + "token_acc": 0.26700517664958484 + }, + { + "epoch": 1.099677513925535, + "grad_norm": 0.8997034460378547, + "learning_rate": 0.00013999049853546087, + "loss": 3.331218957901001, + "step": 1876, + "token_acc": 0.2643905678492462 + }, + { + "epoch": 1.1002638522427441, + "grad_norm": 0.9296153918164964, + "learning_rate": 0.00013999038642732093, + "loss": 3.241473436355591, + "step": 1877, + "token_acc": 0.2776678517654293 + }, + { + "epoch": 1.100850190559953, + "grad_norm": 1.3537777951583687, + "learning_rate": 0.00013999027366171565, + "loss": 3.2870283126831055, + "step": 1878, + "token_acc": 0.27148064527024396 + }, + { + "epoch": 1.1014365288771621, + "grad_norm": 0.6647397589705146, + "learning_rate": 0.00013999016023864602, + "loss": 3.379164218902588, + "step": 1879, + "token_acc": 0.2570523384304858 + }, + { + "epoch": 1.1020228671943713, + "grad_norm": 0.9822821036040421, + "learning_rate": 0.00013999004615811319, + "loss": 3.259047508239746, + "step": 1880, + "token_acc": 0.27398084581172627 + }, + { + "epoch": 1.1026092055115801, + "grad_norm": 1.0733352997023877, + "learning_rate": 0.00013998993142011818, + "loss": 3.3657634258270264, + "step": 1881, + "token_acc": 0.26218121938861677 + }, + { + "epoch": 1.1031955438287893, + "grad_norm": 1.0465187261581401, + "learning_rate": 0.00013998981602466204, + "loss": 3.271768093109131, + "step": 1882, + "token_acc": 0.2705745599468615 + }, + { + "epoch": 1.1037818821459981, + "grad_norm": 1.4709603791923103, + "learning_rate": 0.00013998969997174593, + "loss": 3.3049778938293457, + "step": 1883, + "token_acc": 0.26826877922290054 + }, + { + "epoch": 1.1043682204632073, + "grad_norm": 0.7146593027189652, + "learning_rate": 0.0001399895832613709, + "loss": 3.348053455352783, + "step": 1884, + "token_acc": 0.2631329418936027 + }, + { + "epoch": 1.1049545587804164, + "grad_norm": 1.1079262161660524, + "learning_rate": 0.00013998946589353803, + "loss": 3.3031368255615234, + "step": 1885, + "token_acc": 0.26803077990179847 + }, + { + "epoch": 1.1055408970976253, + "grad_norm": 0.9520136137114626, + "learning_rate": 0.00013998934786824845, + "loss": 3.2916035652160645, + "step": 1886, + "token_acc": 0.2701066001969407 + }, + { + "epoch": 1.1061272354148344, + "grad_norm": 1.340398865809543, + "learning_rate": 0.00013998922918550326, + "loss": 3.2986459732055664, + "step": 1887, + "token_acc": 0.2704262373105026 + }, + { + "epoch": 1.1067135737320435, + "grad_norm": 0.8223712073681388, + "learning_rate": 0.00013998910984530357, + "loss": 3.3266139030456543, + "step": 1888, + "token_acc": 0.26573040198102355 + }, + { + "epoch": 1.1072999120492524, + "grad_norm": 0.8724142106525006, + "learning_rate": 0.0001399889898476505, + "loss": 3.324303150177002, + "step": 1889, + "token_acc": 0.26524718436327904 + }, + { + "epoch": 1.1078862503664615, + "grad_norm": 0.8455126963491224, + "learning_rate": 0.00013998886919254518, + "loss": 3.272357940673828, + "step": 1890, + "token_acc": 0.2717668141953352 + }, + { + "epoch": 1.1084725886836704, + "grad_norm": 1.2108692741823819, + "learning_rate": 0.00013998874787998875, + "loss": 3.323394298553467, + "step": 1891, + "token_acc": 0.26520179699009366 + }, + { + "epoch": 1.1090589270008795, + "grad_norm": 0.7913665158157078, + "learning_rate": 0.00013998862590998236, + "loss": 3.2900800704956055, + "step": 1892, + "token_acc": 0.26916836175261033 + }, + { + "epoch": 1.1096452653180886, + "grad_norm": 0.8356301268618571, + "learning_rate": 0.0001399885032825271, + "loss": 3.289419412612915, + "step": 1893, + "token_acc": 0.26937441154932523 + }, + { + "epoch": 1.1102316036352975, + "grad_norm": 0.684766429557567, + "learning_rate": 0.0001399883799976242, + "loss": 3.287341833114624, + "step": 1894, + "token_acc": 0.2711999360954857 + }, + { + "epoch": 1.1108179419525066, + "grad_norm": 0.9783662583349133, + "learning_rate": 0.00013998825605527476, + "loss": 3.303997039794922, + "step": 1895, + "token_acc": 0.2671409038462516 + }, + { + "epoch": 1.1114042802697157, + "grad_norm": 1.0050496371266775, + "learning_rate": 0.00013998813145547998, + "loss": 3.3054754734039307, + "step": 1896, + "token_acc": 0.26853206534991464 + }, + { + "epoch": 1.1119906185869246, + "grad_norm": 1.3996467425054169, + "learning_rate": 0.00013998800619824102, + "loss": 3.289036273956299, + "step": 1897, + "token_acc": 0.2697812952526949 + }, + { + "epoch": 1.1125769569041337, + "grad_norm": 0.9218744468740792, + "learning_rate": 0.00013998788028355905, + "loss": 3.2724738121032715, + "step": 1898, + "token_acc": 0.27360321231602824 + }, + { + "epoch": 1.1131632952213426, + "grad_norm": 1.1667947411399546, + "learning_rate": 0.00013998775371143522, + "loss": 3.3260960578918457, + "step": 1899, + "token_acc": 0.26720866923828845 + }, + { + "epoch": 1.1137496335385517, + "grad_norm": 1.0971500792293096, + "learning_rate": 0.0001399876264818708, + "loss": 3.244976043701172, + "step": 1900, + "token_acc": 0.27503414508439633 + }, + { + "epoch": 1.1143359718557608, + "grad_norm": 0.9848058593145909, + "learning_rate": 0.0001399874985948669, + "loss": 3.330976963043213, + "step": 1901, + "token_acc": 0.26608724283276586 + }, + { + "epoch": 1.1149223101729697, + "grad_norm": 0.7727261311315157, + "learning_rate": 0.0001399873700504248, + "loss": 3.3092007637023926, + "step": 1902, + "token_acc": 0.26821043457057847 + }, + { + "epoch": 1.1155086484901788, + "grad_norm": 0.8764433813856298, + "learning_rate": 0.00013998724084854564, + "loss": 3.258882999420166, + "step": 1903, + "token_acc": 0.27473534278397616 + }, + { + "epoch": 1.116094986807388, + "grad_norm": 1.0004384674682756, + "learning_rate": 0.00013998711098923066, + "loss": 3.3339600563049316, + "step": 1904, + "token_acc": 0.2648401704349408 + }, + { + "epoch": 1.1166813251245968, + "grad_norm": 0.9693524331929907, + "learning_rate": 0.00013998698047248108, + "loss": 3.2253172397613525, + "step": 1905, + "token_acc": 0.27831982631635677 + }, + { + "epoch": 1.117267663441806, + "grad_norm": 0.88397228043664, + "learning_rate": 0.00013998684929829815, + "loss": 3.320296287536621, + "step": 1906, + "token_acc": 0.2665719137228772 + }, + { + "epoch": 1.117854001759015, + "grad_norm": 1.3086384267931546, + "learning_rate": 0.00013998671746668305, + "loss": 3.3414483070373535, + "step": 1907, + "token_acc": 0.26346636029839854 + }, + { + "epoch": 1.118440340076224, + "grad_norm": 1.2290662260882046, + "learning_rate": 0.00013998658497763706, + "loss": 3.290722608566284, + "step": 1908, + "token_acc": 0.26859515871191675 + }, + { + "epoch": 1.119026678393433, + "grad_norm": 0.9739474848104016, + "learning_rate": 0.00013998645183116142, + "loss": 3.252053737640381, + "step": 1909, + "token_acc": 0.27211002740295326 + }, + { + "epoch": 1.119613016710642, + "grad_norm": 0.9609520537502815, + "learning_rate": 0.00013998631802725737, + "loss": 3.2660207748413086, + "step": 1910, + "token_acc": 0.2732543827728664 + }, + { + "epoch": 1.120199355027851, + "grad_norm": 0.8863912761997428, + "learning_rate": 0.00013998618356592614, + "loss": 3.3199892044067383, + "step": 1911, + "token_acc": 0.2651269738760885 + }, + { + "epoch": 1.1207856933450602, + "grad_norm": 0.962278852525161, + "learning_rate": 0.00013998604844716906, + "loss": 3.3091344833374023, + "step": 1912, + "token_acc": 0.2663750680714458 + }, + { + "epoch": 1.121372031662269, + "grad_norm": 1.0154201273351295, + "learning_rate": 0.00013998591267098736, + "loss": 3.284541130065918, + "step": 1913, + "token_acc": 0.2716598255515649 + }, + { + "epoch": 1.1219583699794782, + "grad_norm": 0.820760376225301, + "learning_rate": 0.0001399857762373823, + "loss": 3.3344931602478027, + "step": 1914, + "token_acc": 0.2649089155763059 + }, + { + "epoch": 1.1225447082966873, + "grad_norm": 0.8550779949304272, + "learning_rate": 0.0001399856391463552, + "loss": 3.297438144683838, + "step": 1915, + "token_acc": 0.26884452383821134 + }, + { + "epoch": 1.1231310466138962, + "grad_norm": 0.793883068887793, + "learning_rate": 0.00013998550139790732, + "loss": 3.295935869216919, + "step": 1916, + "token_acc": 0.26726366107897076 + }, + { + "epoch": 1.1237173849311053, + "grad_norm": 0.6925294249109684, + "learning_rate": 0.00013998536299203996, + "loss": 3.2794947624206543, + "step": 1917, + "token_acc": 0.2713245132340919 + }, + { + "epoch": 1.1243037232483142, + "grad_norm": 0.8790466568877678, + "learning_rate": 0.00013998522392875441, + "loss": 3.3019251823425293, + "step": 1918, + "token_acc": 0.26880011596972975 + }, + { + "epoch": 1.1248900615655233, + "grad_norm": 1.142344284370931, + "learning_rate": 0.000139985084208052, + "loss": 3.3230037689208984, + "step": 1919, + "token_acc": 0.2657927790204032 + }, + { + "epoch": 1.1254763998827324, + "grad_norm": 1.0506056379450228, + "learning_rate": 0.000139984943829934, + "loss": 3.3077712059020996, + "step": 1920, + "token_acc": 0.26848174746787623 + }, + { + "epoch": 1.1260627381999413, + "grad_norm": 1.0067958912755068, + "learning_rate": 0.00013998480279440182, + "loss": 3.271796226501465, + "step": 1921, + "token_acc": 0.2695801832670575 + }, + { + "epoch": 1.1266490765171504, + "grad_norm": 0.909357420572101, + "learning_rate": 0.00013998466110145665, + "loss": 3.307565212249756, + "step": 1922, + "token_acc": 0.2700992003785921 + }, + { + "epoch": 1.1272354148343595, + "grad_norm": 0.8916113891904401, + "learning_rate": 0.00013998451875109994, + "loss": 3.2779183387756348, + "step": 1923, + "token_acc": 0.27172233892995934 + }, + { + "epoch": 1.1278217531515684, + "grad_norm": 1.0548486588114818, + "learning_rate": 0.00013998437574333297, + "loss": 3.287111759185791, + "step": 1924, + "token_acc": 0.27082784265118404 + }, + { + "epoch": 1.1284080914687775, + "grad_norm": 0.767115472119786, + "learning_rate": 0.00013998423207815713, + "loss": 3.2825584411621094, + "step": 1925, + "token_acc": 0.27224032385466035 + }, + { + "epoch": 1.1289944297859864, + "grad_norm": 0.9290370087135374, + "learning_rate": 0.00013998408775557368, + "loss": 3.297276496887207, + "step": 1926, + "token_acc": 0.2677517185604529 + }, + { + "epoch": 1.1295807681031955, + "grad_norm": 1.397897744791598, + "learning_rate": 0.00013998394277558405, + "loss": 3.290966510772705, + "step": 1927, + "token_acc": 0.2690842094188711 + }, + { + "epoch": 1.1301671064204046, + "grad_norm": 0.8497468891904235, + "learning_rate": 0.0001399837971381896, + "loss": 3.2938361167907715, + "step": 1928, + "token_acc": 0.2695718717470395 + }, + { + "epoch": 1.1307534447376135, + "grad_norm": 0.816515040224627, + "learning_rate": 0.00013998365084339168, + "loss": 3.245223045349121, + "step": 1929, + "token_acc": 0.27621646654962284 + }, + { + "epoch": 1.1313397830548226, + "grad_norm": 0.6734272779440157, + "learning_rate": 0.00013998350389119162, + "loss": 3.2750918865203857, + "step": 1930, + "token_acc": 0.27203950961473444 + }, + { + "epoch": 1.1319261213720317, + "grad_norm": 0.618889470596775, + "learning_rate": 0.00013998335628159086, + "loss": 3.2781543731689453, + "step": 1931, + "token_acc": 0.27140484462381553 + }, + { + "epoch": 1.1325124596892406, + "grad_norm": 0.7517126522012283, + "learning_rate": 0.00013998320801459081, + "loss": 3.278299331665039, + "step": 1932, + "token_acc": 0.2724875716136167 + }, + { + "epoch": 1.1330987980064497, + "grad_norm": 0.8107779686763992, + "learning_rate": 0.0001399830590901928, + "loss": 3.301764488220215, + "step": 1933, + "token_acc": 0.2683763002444233 + }, + { + "epoch": 1.1336851363236589, + "grad_norm": 0.7971392793476056, + "learning_rate": 0.00013998290950839826, + "loss": 3.3297338485717773, + "step": 1934, + "token_acc": 0.26359942396974473 + }, + { + "epoch": 1.1342714746408677, + "grad_norm": 0.7216806327999759, + "learning_rate": 0.00013998275926920857, + "loss": 3.2673377990722656, + "step": 1935, + "token_acc": 0.2733427894474017 + }, + { + "epoch": 1.1348578129580769, + "grad_norm": 0.8779734375967914, + "learning_rate": 0.0001399826083726252, + "loss": 3.258728504180908, + "step": 1936, + "token_acc": 0.2753815304630046 + }, + { + "epoch": 1.1354441512752858, + "grad_norm": 1.1227995532520973, + "learning_rate": 0.00013998245681864948, + "loss": 3.294938087463379, + "step": 1937, + "token_acc": 0.2678384981900476 + }, + { + "epoch": 1.1360304895924949, + "grad_norm": 1.035902863609576, + "learning_rate": 0.0001399823046072829, + "loss": 3.2935845851898193, + "step": 1938, + "token_acc": 0.2689313949169629 + }, + { + "epoch": 1.136616827909704, + "grad_norm": 0.9096071089849996, + "learning_rate": 0.00013998215173852688, + "loss": 3.327840566635132, + "step": 1939, + "token_acc": 0.26409631209996953 + }, + { + "epoch": 1.1372031662269129, + "grad_norm": 0.9578814886908256, + "learning_rate": 0.00013998199821238283, + "loss": 3.3148975372314453, + "step": 1940, + "token_acc": 0.2677343362102661 + }, + { + "epoch": 1.137789504544122, + "grad_norm": 1.2517582423569324, + "learning_rate": 0.0001399818440288522, + "loss": 3.2710928916931152, + "step": 1941, + "token_acc": 0.2720199915819345 + }, + { + "epoch": 1.1383758428613309, + "grad_norm": 0.7478999600166207, + "learning_rate": 0.00013998168918793647, + "loss": 3.2860074043273926, + "step": 1942, + "token_acc": 0.2711199778039081 + }, + { + "epoch": 1.13896218117854, + "grad_norm": 0.8609328848135778, + "learning_rate": 0.00013998153368963707, + "loss": 3.3212461471557617, + "step": 1943, + "token_acc": 0.26525140466718616 + }, + { + "epoch": 1.139548519495749, + "grad_norm": 1.0721584093297092, + "learning_rate": 0.00013998137753395545, + "loss": 3.335183620452881, + "step": 1944, + "token_acc": 0.26417570451562866 + }, + { + "epoch": 1.140134857812958, + "grad_norm": 1.0505126174816861, + "learning_rate": 0.0001399812207208931, + "loss": 3.250131607055664, + "step": 1945, + "token_acc": 0.2735841006585939 + }, + { + "epoch": 1.140721196130167, + "grad_norm": 0.9274660129297787, + "learning_rate": 0.00013998106325045147, + "loss": 3.2657017707824707, + "step": 1946, + "token_acc": 0.2739049672527762 + }, + { + "epoch": 1.1413075344473762, + "grad_norm": 1.2913452709511533, + "learning_rate": 0.00013998090512263206, + "loss": 3.3363468647003174, + "step": 1947, + "token_acc": 0.2634593356242841 + }, + { + "epoch": 1.141893872764585, + "grad_norm": 0.6525713895374821, + "learning_rate": 0.00013998074633743635, + "loss": 3.2542076110839844, + "step": 1948, + "token_acc": 0.27485320232478566 + }, + { + "epoch": 1.1424802110817942, + "grad_norm": 0.7356740458820169, + "learning_rate": 0.00013998058689486582, + "loss": 3.282883405685425, + "step": 1949, + "token_acc": 0.27123746362151224 + }, + { + "epoch": 1.1430665493990033, + "grad_norm": 0.85794360317439, + "learning_rate": 0.000139980426794922, + "loss": 3.340507984161377, + "step": 1950, + "token_acc": 0.2633244878079031 + }, + { + "epoch": 1.1436528877162122, + "grad_norm": 0.7414672908723083, + "learning_rate": 0.00013998026603760633, + "loss": 3.3087711334228516, + "step": 1951, + "token_acc": 0.2665588164943893 + }, + { + "epoch": 1.1442392260334213, + "grad_norm": 0.6330179418724468, + "learning_rate": 0.0001399801046229204, + "loss": 3.303285598754883, + "step": 1952, + "token_acc": 0.2676859759673309 + }, + { + "epoch": 1.1448255643506302, + "grad_norm": 0.8925808006937246, + "learning_rate": 0.00013997994255086565, + "loss": 3.2777936458587646, + "step": 1953, + "token_acc": 0.2713731467368713 + }, + { + "epoch": 1.1454119026678393, + "grad_norm": 1.1276444745587175, + "learning_rate": 0.00013997977982144365, + "loss": 3.272172212600708, + "step": 1954, + "token_acc": 0.2713280761712257 + }, + { + "epoch": 1.1459982409850484, + "grad_norm": 0.7039309572929954, + "learning_rate": 0.00013997961643465593, + "loss": 3.313575029373169, + "step": 1955, + "token_acc": 0.2674113668475534 + }, + { + "epoch": 1.1465845793022573, + "grad_norm": 1.0493796151181258, + "learning_rate": 0.000139979452390504, + "loss": 3.3273534774780273, + "step": 1956, + "token_acc": 0.26380629933144745 + }, + { + "epoch": 1.1471709176194664, + "grad_norm": 1.1357127094696116, + "learning_rate": 0.00013997928768898945, + "loss": 3.296158790588379, + "step": 1957, + "token_acc": 0.2697438254945157 + }, + { + "epoch": 1.1477572559366755, + "grad_norm": 0.8153337896490619, + "learning_rate": 0.00013997912233011374, + "loss": 3.260286808013916, + "step": 1958, + "token_acc": 0.27425285998808224 + }, + { + "epoch": 1.1483435942538844, + "grad_norm": 0.7717407694352988, + "learning_rate": 0.0001399789563138785, + "loss": 3.2907774448394775, + "step": 1959, + "token_acc": 0.2704232570057537 + }, + { + "epoch": 1.1489299325710935, + "grad_norm": 0.8626275426143295, + "learning_rate": 0.00013997878964028525, + "loss": 3.289522171020508, + "step": 1960, + "token_acc": 0.27131638632877536 + }, + { + "epoch": 1.1495162708883027, + "grad_norm": 0.8380905507944597, + "learning_rate": 0.0001399786223093356, + "loss": 3.233691692352295, + "step": 1961, + "token_acc": 0.27606119951687375 + }, + { + "epoch": 1.1501026092055116, + "grad_norm": 0.9383138095442958, + "learning_rate": 0.00013997845432103104, + "loss": 3.3120272159576416, + "step": 1962, + "token_acc": 0.26775831396330296 + }, + { + "epoch": 1.1506889475227207, + "grad_norm": 1.4280232083851079, + "learning_rate": 0.00013997828567537322, + "loss": 3.29605770111084, + "step": 1963, + "token_acc": 0.2682191520676067 + }, + { + "epoch": 1.1512752858399296, + "grad_norm": 0.5793568213274755, + "learning_rate": 0.00013997811637236372, + "loss": 3.2406818866729736, + "step": 1964, + "token_acc": 0.2752665836814385 + }, + { + "epoch": 1.1518616241571387, + "grad_norm": 1.0410063136477115, + "learning_rate": 0.00013997794641200408, + "loss": 3.319178342819214, + "step": 1965, + "token_acc": 0.26747206677261165 + }, + { + "epoch": 1.1524479624743478, + "grad_norm": 1.4843280748335523, + "learning_rate": 0.00013997777579429597, + "loss": 3.3066418170928955, + "step": 1966, + "token_acc": 0.26656780878885533 + }, + { + "epoch": 1.1530343007915567, + "grad_norm": 0.7725640772898864, + "learning_rate": 0.00013997760451924093, + "loss": 3.298063278198242, + "step": 1967, + "token_acc": 0.269319377835042 + }, + { + "epoch": 1.1536206391087658, + "grad_norm": 1.460568808752417, + "learning_rate": 0.0001399774325868406, + "loss": 3.2945804595947266, + "step": 1968, + "token_acc": 0.2691454697662438 + }, + { + "epoch": 1.1542069774259747, + "grad_norm": 0.9105052646406522, + "learning_rate": 0.00013997725999709658, + "loss": 3.244741916656494, + "step": 1969, + "token_acc": 0.27579952267303104 + }, + { + "epoch": 1.1547933157431838, + "grad_norm": 1.011474365360013, + "learning_rate": 0.0001399770867500105, + "loss": 3.247795581817627, + "step": 1970, + "token_acc": 0.27575713626475046 + }, + { + "epoch": 1.155379654060393, + "grad_norm": 1.243656398316088, + "learning_rate": 0.000139976912845584, + "loss": 3.3330931663513184, + "step": 1971, + "token_acc": 0.2658736779784171 + }, + { + "epoch": 1.1559659923776018, + "grad_norm": 0.7296939637018416, + "learning_rate": 0.00013997673828381867, + "loss": 3.2535276412963867, + "step": 1972, + "token_acc": 0.27408158390885207 + }, + { + "epoch": 1.156552330694811, + "grad_norm": 0.8402281762265186, + "learning_rate": 0.00013997656306471618, + "loss": 3.2534587383270264, + "step": 1973, + "token_acc": 0.2740848286365936 + }, + { + "epoch": 1.15713866901202, + "grad_norm": 0.9647313407763141, + "learning_rate": 0.0001399763871882782, + "loss": 3.320188522338867, + "step": 1974, + "token_acc": 0.264661799055051 + }, + { + "epoch": 1.157725007329229, + "grad_norm": 1.0192220820580344, + "learning_rate": 0.00013997621065450633, + "loss": 3.2865147590637207, + "step": 1975, + "token_acc": 0.27192734268899954 + }, + { + "epoch": 1.158311345646438, + "grad_norm": 0.9801568969967671, + "learning_rate": 0.00013997603346340229, + "loss": 3.2362523078918457, + "step": 1976, + "token_acc": 0.2774556139423525 + }, + { + "epoch": 1.1588976839636471, + "grad_norm": 0.8784503993417647, + "learning_rate": 0.00013997585561496768, + "loss": 3.318542718887329, + "step": 1977, + "token_acc": 0.26686570939936705 + }, + { + "epoch": 1.159484022280856, + "grad_norm": 0.7517384391080224, + "learning_rate": 0.0001399756771092042, + "loss": 3.3092856407165527, + "step": 1978, + "token_acc": 0.26675811283976464 + }, + { + "epoch": 1.1600703605980651, + "grad_norm": 0.768156455276333, + "learning_rate": 0.00013997549794611354, + "loss": 3.283203125, + "step": 1979, + "token_acc": 0.2714389177724322 + }, + { + "epoch": 1.160656698915274, + "grad_norm": 0.612608204651872, + "learning_rate": 0.00013997531812569736, + "loss": 3.2756786346435547, + "step": 1980, + "token_acc": 0.270528508268185 + }, + { + "epoch": 1.1612430372324831, + "grad_norm": 0.7088899619518892, + "learning_rate": 0.00013997513764795738, + "loss": 3.2976560592651367, + "step": 1981, + "token_acc": 0.2698743569140221 + }, + { + "epoch": 1.1618293755496922, + "grad_norm": 0.7059857392786851, + "learning_rate": 0.00013997495651289527, + "loss": 3.2709131240844727, + "step": 1982, + "token_acc": 0.27256450255437087 + }, + { + "epoch": 1.1624157138669011, + "grad_norm": 0.6484008510232048, + "learning_rate": 0.00013997477472051272, + "loss": 3.266232490539551, + "step": 1983, + "token_acc": 0.2733303259687669 + }, + { + "epoch": 1.1630020521841102, + "grad_norm": 0.5875863516838354, + "learning_rate": 0.00013997459227081145, + "loss": 3.2704479694366455, + "step": 1984, + "token_acc": 0.2729809076997743 + }, + { + "epoch": 1.1635883905013193, + "grad_norm": 0.6857931319123926, + "learning_rate": 0.0001399744091637932, + "loss": 3.2780685424804688, + "step": 1985, + "token_acc": 0.2701982450438739 + }, + { + "epoch": 1.1641747288185282, + "grad_norm": 0.6688390468442005, + "learning_rate": 0.00013997422539945966, + "loss": 3.2904343605041504, + "step": 1986, + "token_acc": 0.2698845113178909 + }, + { + "epoch": 1.1647610671357373, + "grad_norm": 0.9386681815814419, + "learning_rate": 0.00013997404097781255, + "loss": 3.2261009216308594, + "step": 1987, + "token_acc": 0.27882530764298147 + }, + { + "epoch": 1.1653474054529465, + "grad_norm": 1.0715909370535577, + "learning_rate": 0.0001399738558988536, + "loss": 3.286068916320801, + "step": 1988, + "token_acc": 0.27039242544648456 + }, + { + "epoch": 1.1659337437701554, + "grad_norm": 1.1455103355124707, + "learning_rate": 0.0001399736701625846, + "loss": 3.263002872467041, + "step": 1989, + "token_acc": 0.2723936933650387 + }, + { + "epoch": 1.1665200820873645, + "grad_norm": 0.9575400958748154, + "learning_rate": 0.00013997348376900724, + "loss": 3.3112387657165527, + "step": 1990, + "token_acc": 0.2672752131253016 + }, + { + "epoch": 1.1671064204045734, + "grad_norm": 0.855923533225229, + "learning_rate": 0.00013997329671812332, + "loss": 3.2915282249450684, + "step": 1991, + "token_acc": 0.2683620745753888 + }, + { + "epoch": 1.1676927587217825, + "grad_norm": 0.9132767649565504, + "learning_rate": 0.00013997310900993454, + "loss": 3.261610746383667, + "step": 1992, + "token_acc": 0.27440615590498496 + }, + { + "epoch": 1.1682790970389916, + "grad_norm": 0.9402467601171423, + "learning_rate": 0.0001399729206444427, + "loss": 3.319091558456421, + "step": 1993, + "token_acc": 0.2648558004468371 + }, + { + "epoch": 1.1688654353562005, + "grad_norm": 1.096000376730819, + "learning_rate": 0.00013997273162164956, + "loss": 3.32125186920166, + "step": 1994, + "token_acc": 0.26607096382406525 + }, + { + "epoch": 1.1694517736734096, + "grad_norm": 0.9743500299861092, + "learning_rate": 0.0001399725419415569, + "loss": 3.270928382873535, + "step": 1995, + "token_acc": 0.26980207772422216 + }, + { + "epoch": 1.1700381119906185, + "grad_norm": 1.0489637739727886, + "learning_rate": 0.00013997235160416647, + "loss": 3.3269383907318115, + "step": 1996, + "token_acc": 0.2659689991281331 + }, + { + "epoch": 1.1706244503078276, + "grad_norm": 0.6031897461874, + "learning_rate": 0.00013997216060948012, + "loss": 3.3214361667633057, + "step": 1997, + "token_acc": 0.2638300845036688 + }, + { + "epoch": 1.1712107886250367, + "grad_norm": 0.7184589396407897, + "learning_rate": 0.0001399719689574996, + "loss": 3.2780709266662598, + "step": 1998, + "token_acc": 0.27127680024605605 + }, + { + "epoch": 1.1717971269422456, + "grad_norm": 0.8084578419598454, + "learning_rate": 0.0001399717766482267, + "loss": 3.283168077468872, + "step": 1999, + "token_acc": 0.27009540489969974 + }, + { + "epoch": 1.1723834652594547, + "grad_norm": 0.9251953725991897, + "learning_rate": 0.00013997158368166327, + "loss": 3.2896370887756348, + "step": 2000, + "token_acc": 0.2705444324027856 + }, + { + "epoch": 1.1729698035766638, + "grad_norm": 0.9205641509985651, + "learning_rate": 0.0001399713900578111, + "loss": 3.273399829864502, + "step": 2001, + "token_acc": 0.27039277229787817 + }, + { + "epoch": 1.1735561418938727, + "grad_norm": 0.8926323668634545, + "learning_rate": 0.000139971195776672, + "loss": 3.2432026863098145, + "step": 2002, + "token_acc": 0.27554514144892994 + }, + { + "epoch": 1.1741424802110818, + "grad_norm": 0.9918193854914225, + "learning_rate": 0.00013997100083824778, + "loss": 3.311223030090332, + "step": 2003, + "token_acc": 0.26720193475331383 + }, + { + "epoch": 1.174728818528291, + "grad_norm": 0.9761518260911817, + "learning_rate": 0.00013997080524254032, + "loss": 3.2806928157806396, + "step": 2004, + "token_acc": 0.2702721222903077 + }, + { + "epoch": 1.1753151568454998, + "grad_norm": 0.8462092574010112, + "learning_rate": 0.00013997060898955144, + "loss": 3.290086507797241, + "step": 2005, + "token_acc": 0.26968710194354273 + }, + { + "epoch": 1.175901495162709, + "grad_norm": 0.7199809154954767, + "learning_rate": 0.00013997041207928297, + "loss": 3.2581470012664795, + "step": 2006, + "token_acc": 0.27425520644422235 + }, + { + "epoch": 1.1764878334799178, + "grad_norm": 0.7111798552281327, + "learning_rate": 0.00013997021451173677, + "loss": 3.3144493103027344, + "step": 2007, + "token_acc": 0.2649414934101375 + }, + { + "epoch": 1.177074171797127, + "grad_norm": 0.7269533251462796, + "learning_rate": 0.00013997001628691466, + "loss": 3.3021883964538574, + "step": 2008, + "token_acc": 0.2656302313811495 + }, + { + "epoch": 1.177660510114336, + "grad_norm": 0.6922427232460047, + "learning_rate": 0.00013996981740481857, + "loss": 3.256155014038086, + "step": 2009, + "token_acc": 0.27238125885266784 + }, + { + "epoch": 1.178246848431545, + "grad_norm": 0.9073039935713157, + "learning_rate": 0.0001399696178654503, + "loss": 3.269740104675293, + "step": 2010, + "token_acc": 0.2719516539638705 + }, + { + "epoch": 1.178833186748754, + "grad_norm": 0.9175877466329571, + "learning_rate": 0.00013996941766881177, + "loss": 3.21970534324646, + "step": 2011, + "token_acc": 0.27754819508249423 + }, + { + "epoch": 1.1794195250659631, + "grad_norm": 0.8993502067710845, + "learning_rate": 0.00013996921681490486, + "loss": 3.309999942779541, + "step": 2012, + "token_acc": 0.26560222109263926 + }, + { + "epoch": 1.180005863383172, + "grad_norm": 0.8548468793758872, + "learning_rate": 0.00013996901530373141, + "loss": 3.269012212753296, + "step": 2013, + "token_acc": 0.2731310040599203 + }, + { + "epoch": 1.1805922017003811, + "grad_norm": 0.8315697921166076, + "learning_rate": 0.00013996881313529336, + "loss": 3.260809898376465, + "step": 2014, + "token_acc": 0.2727009657298745 + }, + { + "epoch": 1.1811785400175903, + "grad_norm": 0.7797126073819475, + "learning_rate": 0.0001399686103095926, + "loss": 3.2540645599365234, + "step": 2015, + "token_acc": 0.27387683384238326 + }, + { + "epoch": 1.1817648783347992, + "grad_norm": 0.8879051190914242, + "learning_rate": 0.00013996840682663103, + "loss": 3.283329486846924, + "step": 2016, + "token_acc": 0.2705465334504941 + }, + { + "epoch": 1.1823512166520083, + "grad_norm": 0.8763884759057723, + "learning_rate": 0.00013996820268641057, + "loss": 3.2482025623321533, + "step": 2017, + "token_acc": 0.2745193728462942 + }, + { + "epoch": 1.1829375549692172, + "grad_norm": 0.6522520794303478, + "learning_rate": 0.00013996799788893312, + "loss": 3.273881435394287, + "step": 2018, + "token_acc": 0.271554576382023 + }, + { + "epoch": 1.1835238932864263, + "grad_norm": 0.7529803975708894, + "learning_rate": 0.0001399677924342006, + "loss": 3.247715711593628, + "step": 2019, + "token_acc": 0.27410420136477787 + }, + { + "epoch": 1.1841102316036354, + "grad_norm": 0.8418183421081263, + "learning_rate": 0.00013996758632221496, + "loss": 3.3056061267852783, + "step": 2020, + "token_acc": 0.26771928726422967 + }, + { + "epoch": 1.1846965699208443, + "grad_norm": 0.7325558729225324, + "learning_rate": 0.00013996737955297814, + "loss": 3.279817581176758, + "step": 2021, + "token_acc": 0.27071971596083205 + }, + { + "epoch": 1.1852829082380534, + "grad_norm": 0.8030224441444531, + "learning_rate": 0.00013996717212649208, + "loss": 3.2769505977630615, + "step": 2022, + "token_acc": 0.2719305892287995 + }, + { + "epoch": 1.1858692465552623, + "grad_norm": 0.8242029847771069, + "learning_rate": 0.0001399669640427587, + "loss": 3.2430787086486816, + "step": 2023, + "token_acc": 0.2758381087706697 + }, + { + "epoch": 1.1864555848724714, + "grad_norm": 0.8264266931101405, + "learning_rate": 0.00013996675530177996, + "loss": 3.300528049468994, + "step": 2024, + "token_acc": 0.27027717127924006 + }, + { + "epoch": 1.1870419231896805, + "grad_norm": 0.973956471615252, + "learning_rate": 0.00013996654590355787, + "loss": 3.2806429862976074, + "step": 2025, + "token_acc": 0.2704477102905156 + }, + { + "epoch": 1.1876282615068894, + "grad_norm": 0.7534006546208338, + "learning_rate": 0.00013996633584809434, + "loss": 3.28279447555542, + "step": 2026, + "token_acc": 0.26853968526421734 + }, + { + "epoch": 1.1882145998240985, + "grad_norm": 0.7560127380298783, + "learning_rate": 0.00013996612513539138, + "loss": 3.293433666229248, + "step": 2027, + "token_acc": 0.2693936920143699 + }, + { + "epoch": 1.1888009381413076, + "grad_norm": 1.055578746112734, + "learning_rate": 0.00013996591376545092, + "loss": 3.261115074157715, + "step": 2028, + "token_acc": 0.27160234182556187 + }, + { + "epoch": 1.1893872764585165, + "grad_norm": 1.0443696121586854, + "learning_rate": 0.00013996570173827502, + "loss": 3.2751612663269043, + "step": 2029, + "token_acc": 0.270351408859372 + }, + { + "epoch": 1.1899736147757256, + "grad_norm": 0.7233969196383906, + "learning_rate": 0.00013996548905386563, + "loss": 3.2878074645996094, + "step": 2030, + "token_acc": 0.26900639987348196 + }, + { + "epoch": 1.1905599530929347, + "grad_norm": 0.7190116803157567, + "learning_rate": 0.00013996527571222473, + "loss": 3.248185157775879, + "step": 2031, + "token_acc": 0.2727448598203162 + }, + { + "epoch": 1.1911462914101436, + "grad_norm": 0.7551477262953156, + "learning_rate": 0.00013996506171335438, + "loss": 3.256265163421631, + "step": 2032, + "token_acc": 0.2735621197171518 + }, + { + "epoch": 1.1917326297273527, + "grad_norm": 0.7080222965655164, + "learning_rate": 0.00013996484705725652, + "loss": 3.3316240310668945, + "step": 2033, + "token_acc": 0.2641333378764806 + }, + { + "epoch": 1.1923189680445616, + "grad_norm": 0.669109683810363, + "learning_rate": 0.0001399646317439332, + "loss": 3.2331409454345703, + "step": 2034, + "token_acc": 0.2776335364913024 + }, + { + "epoch": 1.1929053063617707, + "grad_norm": 0.8943186847361569, + "learning_rate": 0.00013996441577338647, + "loss": 3.31321382522583, + "step": 2035, + "token_acc": 0.2660338284608996 + }, + { + "epoch": 1.1934916446789798, + "grad_norm": 0.8394702091312193, + "learning_rate": 0.0001399641991456183, + "loss": 3.254009485244751, + "step": 2036, + "token_acc": 0.2747039750086346 + }, + { + "epoch": 1.1940779829961887, + "grad_norm": 0.6482322937941302, + "learning_rate": 0.00013996398186063075, + "loss": 3.2829437255859375, + "step": 2037, + "token_acc": 0.2698733208064981 + }, + { + "epoch": 1.1946643213133978, + "grad_norm": 0.6666697229093557, + "learning_rate": 0.00013996376391842591, + "loss": 3.2279770374298096, + "step": 2038, + "token_acc": 0.2764330159642248 + }, + { + "epoch": 1.195250659630607, + "grad_norm": 0.7812163405303421, + "learning_rate": 0.00013996354531900577, + "loss": 3.2676010131835938, + "step": 2039, + "token_acc": 0.27284011458599616 + }, + { + "epoch": 1.1958369979478158, + "grad_norm": 0.8326625457349554, + "learning_rate": 0.0001399633260623724, + "loss": 3.2590248584747314, + "step": 2040, + "token_acc": 0.2719198051456974 + }, + { + "epoch": 1.196423336265025, + "grad_norm": 1.0455332220398084, + "learning_rate": 0.00013996310614852782, + "loss": 3.2858152389526367, + "step": 2041, + "token_acc": 0.269239996594805 + }, + { + "epoch": 1.197009674582234, + "grad_norm": 1.0502970910975296, + "learning_rate": 0.00013996288557747419, + "loss": 3.2631282806396484, + "step": 2042, + "token_acc": 0.2730490531476443 + }, + { + "epoch": 1.197596012899443, + "grad_norm": 0.8054852590068742, + "learning_rate": 0.00013996266434921349, + "loss": 3.3076512813568115, + "step": 2043, + "token_acc": 0.2664837165747581 + }, + { + "epoch": 1.198182351216652, + "grad_norm": 1.0093675786483056, + "learning_rate": 0.00013996244246374786, + "loss": 3.2395107746124268, + "step": 2044, + "token_acc": 0.27610904883322773 + }, + { + "epoch": 1.198768689533861, + "grad_norm": 0.6916907559025842, + "learning_rate": 0.00013996221992107935, + "loss": 3.2929985523223877, + "step": 2045, + "token_acc": 0.26778898201617773 + }, + { + "epoch": 1.19935502785107, + "grad_norm": 0.817989571617489, + "learning_rate": 0.00013996199672121004, + "loss": 3.26029109954834, + "step": 2046, + "token_acc": 0.2742026233897139 + }, + { + "epoch": 1.1999413661682792, + "grad_norm": 0.9180749824034695, + "learning_rate": 0.00013996177286414207, + "loss": 3.211268186569214, + "step": 2047, + "token_acc": 0.2794246955529329 + }, + { + "epoch": 1.200527704485488, + "grad_norm": 0.8684883920231655, + "learning_rate": 0.00013996154834987752, + "loss": 3.3434700965881348, + "step": 2048, + "token_acc": 0.2641811308699819 + }, + { + "epoch": 1.2011140428026972, + "grad_norm": 0.8644291288375126, + "learning_rate": 0.00013996132317841846, + "loss": 3.2719063758850098, + "step": 2049, + "token_acc": 0.27049551914639675 + }, + { + "epoch": 1.201700381119906, + "grad_norm": 1.0050278038899023, + "learning_rate": 0.00013996109734976708, + "loss": 3.2906382083892822, + "step": 2050, + "token_acc": 0.2696166518835104 + }, + { + "epoch": 1.2022867194371152, + "grad_norm": 1.1136608484222708, + "learning_rate": 0.00013996087086392544, + "loss": 3.251469373703003, + "step": 2051, + "token_acc": 0.27550584688517676 + }, + { + "epoch": 1.2028730577543243, + "grad_norm": 0.7153490602730995, + "learning_rate": 0.00013996064372089572, + "loss": 3.2633090019226074, + "step": 2052, + "token_acc": 0.272893039225887 + }, + { + "epoch": 1.2034593960715332, + "grad_norm": 0.6357191713719329, + "learning_rate": 0.00013996041592068, + "loss": 3.2354817390441895, + "step": 2053, + "token_acc": 0.27708910409574355 + }, + { + "epoch": 1.2040457343887423, + "grad_norm": 0.8922975198207472, + "learning_rate": 0.00013996018746328048, + "loss": 3.2724735736846924, + "step": 2054, + "token_acc": 0.2710692319823372 + }, + { + "epoch": 1.2046320727059514, + "grad_norm": 1.057652080223769, + "learning_rate": 0.00013995995834869922, + "loss": 3.21077823638916, + "step": 2055, + "token_acc": 0.2786402022190652 + }, + { + "epoch": 1.2052184110231603, + "grad_norm": 0.8423578122477403, + "learning_rate": 0.00013995972857693846, + "loss": 3.290818214416504, + "step": 2056, + "token_acc": 0.26963003082001885 + }, + { + "epoch": 1.2058047493403694, + "grad_norm": 1.1099485819974582, + "learning_rate": 0.0001399594981480003, + "loss": 3.2558577060699463, + "step": 2057, + "token_acc": 0.2736841277826689 + }, + { + "epoch": 1.2063910876575785, + "grad_norm": 0.9684048199069077, + "learning_rate": 0.00013995926706188695, + "loss": 3.318784236907959, + "step": 2058, + "token_acc": 0.26525050751968443 + }, + { + "epoch": 1.2069774259747874, + "grad_norm": 0.8075818261541164, + "learning_rate": 0.00013995903531860055, + "loss": 3.2668185234069824, + "step": 2059, + "token_acc": 0.2729023675435253 + }, + { + "epoch": 1.2075637642919965, + "grad_norm": 0.7951807721010673, + "learning_rate": 0.00013995880291814327, + "loss": 3.2587151527404785, + "step": 2060, + "token_acc": 0.27283289043679865 + }, + { + "epoch": 1.2081501026092054, + "grad_norm": 0.7733483946839925, + "learning_rate": 0.0001399585698605173, + "loss": 3.295067548751831, + "step": 2061, + "token_acc": 0.26807482214965944 + }, + { + "epoch": 1.2087364409264145, + "grad_norm": 0.8600870866196413, + "learning_rate": 0.00013995833614572487, + "loss": 3.272195339202881, + "step": 2062, + "token_acc": 0.2732310022026432 + }, + { + "epoch": 1.2093227792436236, + "grad_norm": 1.017558241951975, + "learning_rate": 0.00013995810177376813, + "loss": 3.253392219543457, + "step": 2063, + "token_acc": 0.27330428071855556 + }, + { + "epoch": 1.2099091175608325, + "grad_norm": 1.0994980717808918, + "learning_rate": 0.0001399578667446493, + "loss": 3.197162628173828, + "step": 2064, + "token_acc": 0.28170951240044034 + }, + { + "epoch": 1.2104954558780416, + "grad_norm": 0.7554169682643791, + "learning_rate": 0.00013995763105837056, + "loss": 3.2833306789398193, + "step": 2065, + "token_acc": 0.27028180589061795 + }, + { + "epoch": 1.2110817941952507, + "grad_norm": 0.8643574044858382, + "learning_rate": 0.00013995739471493415, + "loss": 3.2892372608184814, + "step": 2066, + "token_acc": 0.2668725422336485 + }, + { + "epoch": 1.2116681325124596, + "grad_norm": 1.4286750909845534, + "learning_rate": 0.0001399571577143423, + "loss": 3.2418484687805176, + "step": 2067, + "token_acc": 0.2748956746192907 + }, + { + "epoch": 1.2122544708296688, + "grad_norm": 0.6373148224048629, + "learning_rate": 0.0001399569200565972, + "loss": 3.2708046436309814, + "step": 2068, + "token_acc": 0.27162786624285856 + }, + { + "epoch": 1.2128408091468779, + "grad_norm": 0.9826710697592247, + "learning_rate": 0.00013995668174170112, + "loss": 3.2232444286346436, + "step": 2069, + "token_acc": 0.27724404820731013 + }, + { + "epoch": 1.2134271474640868, + "grad_norm": 1.0880332011677372, + "learning_rate": 0.0001399564427696563, + "loss": 3.244549512863159, + "step": 2070, + "token_acc": 0.27509658285312905 + }, + { + "epoch": 1.2140134857812959, + "grad_norm": 0.7967802119367023, + "learning_rate": 0.00013995620314046493, + "loss": 3.3176679611206055, + "step": 2071, + "token_acc": 0.2660486143134741 + }, + { + "epoch": 1.2145998240985048, + "grad_norm": 1.0233166266062386, + "learning_rate": 0.0001399559628541293, + "loss": 3.299207925796509, + "step": 2072, + "token_acc": 0.26728944423047596 + }, + { + "epoch": 1.2151861624157139, + "grad_norm": 0.8045512762453749, + "learning_rate": 0.0001399557219106517, + "loss": 3.2874369621276855, + "step": 2073, + "token_acc": 0.26988809898236354 + }, + { + "epoch": 1.215772500732923, + "grad_norm": 0.8925296046058288, + "learning_rate": 0.00013995548031003435, + "loss": 3.2711923122406006, + "step": 2074, + "token_acc": 0.2720515005595655 + }, + { + "epoch": 1.2163588390501319, + "grad_norm": 0.8024475997723093, + "learning_rate": 0.0001399552380522795, + "loss": 3.2613604068756104, + "step": 2075, + "token_acc": 0.2742474561524505 + }, + { + "epoch": 1.216945177367341, + "grad_norm": 0.678643848820099, + "learning_rate": 0.0001399549951373895, + "loss": 3.2243504524230957, + "step": 2076, + "token_acc": 0.27744292710735663 + }, + { + "epoch": 1.2175315156845499, + "grad_norm": 0.7192385242812829, + "learning_rate": 0.00013995475156536655, + "loss": 3.2358908653259277, + "step": 2077, + "token_acc": 0.27488016212213723 + }, + { + "epoch": 1.218117854001759, + "grad_norm": 0.632856485412475, + "learning_rate": 0.000139954507336213, + "loss": 3.251486301422119, + "step": 2078, + "token_acc": 0.2743380697259104 + }, + { + "epoch": 1.218704192318968, + "grad_norm": 0.8076257263986828, + "learning_rate": 0.0001399542624499311, + "loss": 3.2412564754486084, + "step": 2079, + "token_acc": 0.27506333138467537 + }, + { + "epoch": 1.219290530636177, + "grad_norm": 0.7162834114217783, + "learning_rate": 0.00013995401690652316, + "loss": 3.2543599605560303, + "step": 2080, + "token_acc": 0.2735167377839316 + }, + { + "epoch": 1.219876868953386, + "grad_norm": 0.5783302552105949, + "learning_rate": 0.00013995377070599152, + "loss": 3.243490695953369, + "step": 2081, + "token_acc": 0.2763864161949698 + }, + { + "epoch": 1.2204632072705952, + "grad_norm": 0.8053531322373088, + "learning_rate": 0.00013995352384833844, + "loss": 3.2940032482147217, + "step": 2082, + "token_acc": 0.2686937299778419 + }, + { + "epoch": 1.221049545587804, + "grad_norm": 0.7627267985125418, + "learning_rate": 0.0001399532763335663, + "loss": 3.2762742042541504, + "step": 2083, + "token_acc": 0.27110944305004964 + }, + { + "epoch": 1.2216358839050132, + "grad_norm": 0.8501251019049632, + "learning_rate": 0.00013995302816167737, + "loss": 3.251089572906494, + "step": 2084, + "token_acc": 0.2741546773824148 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.7930636738934369, + "learning_rate": 0.00013995277933267401, + "loss": 3.293471097946167, + "step": 2085, + "token_acc": 0.26694135499605665 + }, + { + "epoch": 1.2228085605394312, + "grad_norm": 1.0517571730044282, + "learning_rate": 0.00013995252984655855, + "loss": 3.258188247680664, + "step": 2086, + "token_acc": 0.27355519954542945 + }, + { + "epoch": 1.2233948988566403, + "grad_norm": 0.9492592108768643, + "learning_rate": 0.00013995227970333332, + "loss": 3.2818026542663574, + "step": 2087, + "token_acc": 0.2710666816264986 + }, + { + "epoch": 1.2239812371738492, + "grad_norm": 0.9075878388127805, + "learning_rate": 0.0001399520289030007, + "loss": 3.2445836067199707, + "step": 2088, + "token_acc": 0.27482413668291095 + }, + { + "epoch": 1.2245675754910583, + "grad_norm": 0.7624384267735543, + "learning_rate": 0.00013995177744556303, + "loss": 3.2573299407958984, + "step": 2089, + "token_acc": 0.27402552245466766 + }, + { + "epoch": 1.2251539138082674, + "grad_norm": 0.7254844398683863, + "learning_rate": 0.00013995152533102266, + "loss": 3.257793426513672, + "step": 2090, + "token_acc": 0.2725927120619329 + }, + { + "epoch": 1.2257402521254763, + "grad_norm": 0.762945481114782, + "learning_rate": 0.000139951272559382, + "loss": 3.2640485763549805, + "step": 2091, + "token_acc": 0.27323780221652066 + }, + { + "epoch": 1.2263265904426854, + "grad_norm": 0.6901424543644455, + "learning_rate": 0.00013995101913064336, + "loss": 3.243479013442993, + "step": 2092, + "token_acc": 0.27411229244992535 + }, + { + "epoch": 1.2269129287598945, + "grad_norm": 0.8103476249573017, + "learning_rate": 0.00013995076504480917, + "loss": 3.2786288261413574, + "step": 2093, + "token_acc": 0.26982325615061875 + }, + { + "epoch": 1.2274992670771034, + "grad_norm": 0.8273412066621197, + "learning_rate": 0.00013995051030188182, + "loss": 3.2869534492492676, + "step": 2094, + "token_acc": 0.2680933832379424 + }, + { + "epoch": 1.2280856053943126, + "grad_norm": 0.6300906084649253, + "learning_rate": 0.00013995025490186365, + "loss": 3.275829792022705, + "step": 2095, + "token_acc": 0.27177046180771935 + }, + { + "epoch": 1.2286719437115217, + "grad_norm": 0.6638352130970439, + "learning_rate": 0.00013994999884475712, + "loss": 3.230128288269043, + "step": 2096, + "token_acc": 0.2759711085169794 + }, + { + "epoch": 1.2292582820287306, + "grad_norm": 0.7999811478776575, + "learning_rate": 0.0001399497421305646, + "loss": 3.28938627243042, + "step": 2097, + "token_acc": 0.2693783845831814 + }, + { + "epoch": 1.2298446203459397, + "grad_norm": 0.7432158789397402, + "learning_rate": 0.0001399494847592885, + "loss": 3.264371395111084, + "step": 2098, + "token_acc": 0.271819842396462 + }, + { + "epoch": 1.2304309586631486, + "grad_norm": 0.6103956760103257, + "learning_rate": 0.00013994922673093128, + "loss": 3.2540974617004395, + "step": 2099, + "token_acc": 0.2737752381924161 + }, + { + "epoch": 1.2310172969803577, + "grad_norm": 0.7966789308916669, + "learning_rate": 0.0001399489680454953, + "loss": 3.298779010772705, + "step": 2100, + "token_acc": 0.26808113401441835 + }, + { + "epoch": 1.2316036352975668, + "grad_norm": 1.050472356644393, + "learning_rate": 0.00013994870870298303, + "loss": 3.289454221725464, + "step": 2101, + "token_acc": 0.26979128431657956 + }, + { + "epoch": 1.2321899736147757, + "grad_norm": 0.8180846020768008, + "learning_rate": 0.00013994844870339693, + "loss": 3.3563408851623535, + "step": 2102, + "token_acc": 0.2609291936801672 + }, + { + "epoch": 1.2327763119319848, + "grad_norm": 0.7348362046380063, + "learning_rate": 0.00013994818804673938, + "loss": 3.2577309608459473, + "step": 2103, + "token_acc": 0.2746498922952132 + }, + { + "epoch": 1.2333626502491937, + "grad_norm": 1.0033770055055589, + "learning_rate": 0.00013994792673301286, + "loss": 3.2838189601898193, + "step": 2104, + "token_acc": 0.26996027255565486 + }, + { + "epoch": 1.2339489885664028, + "grad_norm": 1.0723149175367184, + "learning_rate": 0.00013994766476221985, + "loss": 3.295071601867676, + "step": 2105, + "token_acc": 0.2668412629128474 + }, + { + "epoch": 1.234535326883612, + "grad_norm": 0.7730063255573597, + "learning_rate": 0.00013994740213436275, + "loss": 3.280414581298828, + "step": 2106, + "token_acc": 0.2689909606839065 + }, + { + "epoch": 1.2351216652008208, + "grad_norm": 0.778235477399716, + "learning_rate": 0.0001399471388494441, + "loss": 3.298107147216797, + "step": 2107, + "token_acc": 0.2681080941365005 + }, + { + "epoch": 1.23570800351803, + "grad_norm": 0.7719802633313659, + "learning_rate": 0.00013994687490746628, + "loss": 3.315682888031006, + "step": 2108, + "token_acc": 0.26536364257807366 + }, + { + "epoch": 1.236294341835239, + "grad_norm": 0.5813641941027856, + "learning_rate": 0.0001399466103084319, + "loss": 3.2468855381011963, + "step": 2109, + "token_acc": 0.2745504990256989 + }, + { + "epoch": 1.236880680152448, + "grad_norm": 0.5966761416628407, + "learning_rate": 0.0001399463450523433, + "loss": 3.254228115081787, + "step": 2110, + "token_acc": 0.2745239639601787 + }, + { + "epoch": 1.237467018469657, + "grad_norm": 0.7528144098108003, + "learning_rate": 0.00013994607913920306, + "loss": 3.285797595977783, + "step": 2111, + "token_acc": 0.26844335820701487 + }, + { + "epoch": 1.2380533567868661, + "grad_norm": 0.7999341965629413, + "learning_rate": 0.0001399458125690137, + "loss": 3.222423553466797, + "step": 2112, + "token_acc": 0.27795262676985794 + }, + { + "epoch": 1.238639695104075, + "grad_norm": 0.8524942198868808, + "learning_rate": 0.00013994554534177764, + "loss": 3.279275417327881, + "step": 2113, + "token_acc": 0.2701440134889255 + }, + { + "epoch": 1.2392260334212841, + "grad_norm": 0.6335293228359163, + "learning_rate": 0.00013994527745749745, + "loss": 3.257563591003418, + "step": 2114, + "token_acc": 0.27245539663020735 + }, + { + "epoch": 1.239812371738493, + "grad_norm": 0.7609621354257848, + "learning_rate": 0.00013994500891617562, + "loss": 3.276428699493408, + "step": 2115, + "token_acc": 0.27117695265712666 + }, + { + "epoch": 1.2403987100557021, + "grad_norm": 0.9137486055077021, + "learning_rate": 0.0001399447397178147, + "loss": 3.2814674377441406, + "step": 2116, + "token_acc": 0.2706345970919962 + }, + { + "epoch": 1.2409850483729112, + "grad_norm": 0.8954839424820901, + "learning_rate": 0.0001399444698624172, + "loss": 3.2411656379699707, + "step": 2117, + "token_acc": 0.2748881460529699 + }, + { + "epoch": 1.2415713866901201, + "grad_norm": 0.8510782177440706, + "learning_rate": 0.00013994419934998563, + "loss": 3.3044254779815674, + "step": 2118, + "token_acc": 0.2667808811042733 + }, + { + "epoch": 1.2421577250073292, + "grad_norm": 0.711325446251975, + "learning_rate": 0.0001399439281805226, + "loss": 3.2630367279052734, + "step": 2119, + "token_acc": 0.2726042079451494 + }, + { + "epoch": 1.2427440633245384, + "grad_norm": 0.732955914795538, + "learning_rate": 0.0001399436563540306, + "loss": 3.2871408462524414, + "step": 2120, + "token_acc": 0.26821046698785517 + }, + { + "epoch": 1.2433304016417472, + "grad_norm": 0.9785958116921543, + "learning_rate": 0.00013994338387051218, + "loss": 3.275796890258789, + "step": 2121, + "token_acc": 0.2699258349743635 + }, + { + "epoch": 1.2439167399589564, + "grad_norm": 0.9806795031341744, + "learning_rate": 0.00013994311072996994, + "loss": 3.2688193321228027, + "step": 2122, + "token_acc": 0.2715173345339937 + }, + { + "epoch": 1.2445030782761655, + "grad_norm": 0.6757998200561142, + "learning_rate": 0.00013994283693240643, + "loss": 3.2210726737976074, + "step": 2123, + "token_acc": 0.27666671782829133 + }, + { + "epoch": 1.2450894165933744, + "grad_norm": 0.6555366961423104, + "learning_rate": 0.0001399425624778242, + "loss": 3.2891485691070557, + "step": 2124, + "token_acc": 0.26967460128479875 + }, + { + "epoch": 1.2456757549105835, + "grad_norm": 0.7217183048077976, + "learning_rate": 0.00013994228736622584, + "loss": 3.289639472961426, + "step": 2125, + "token_acc": 0.27032646953891704 + }, + { + "epoch": 1.2462620932277924, + "grad_norm": 0.6204837826883519, + "learning_rate": 0.00013994201159761395, + "loss": 3.3212976455688477, + "step": 2126, + "token_acc": 0.2638176126449759 + }, + { + "epoch": 1.2468484315450015, + "grad_norm": 0.512733277185017, + "learning_rate": 0.0001399417351719911, + "loss": 3.3063395023345947, + "step": 2127, + "token_acc": 0.26451418882833927 + }, + { + "epoch": 1.2474347698622106, + "grad_norm": 0.5962678951896425, + "learning_rate": 0.0001399414580893599, + "loss": 3.2598273754119873, + "step": 2128, + "token_acc": 0.2727725732509468 + }, + { + "epoch": 1.2480211081794195, + "grad_norm": 0.6114274931745662, + "learning_rate": 0.00013994118034972296, + "loss": 3.234663486480713, + "step": 2129, + "token_acc": 0.2764413729026331 + }, + { + "epoch": 1.2486074464966286, + "grad_norm": 0.631441212734014, + "learning_rate": 0.00013994090195308285, + "loss": 3.2678322792053223, + "step": 2130, + "token_acc": 0.27025203306673834 + }, + { + "epoch": 1.2491937848138375, + "grad_norm": 0.5954373081725135, + "learning_rate": 0.00013994062289944225, + "loss": 3.2652812004089355, + "step": 2131, + "token_acc": 0.2728173502227436 + }, + { + "epoch": 1.2497801231310466, + "grad_norm": 0.8286947267672502, + "learning_rate": 0.00013994034318880373, + "loss": 3.2767622470855713, + "step": 2132, + "token_acc": 0.27050031731230006 + }, + { + "epoch": 1.2503664614482557, + "grad_norm": 0.9711431422832624, + "learning_rate": 0.00013994006282116992, + "loss": 3.2957820892333984, + "step": 2133, + "token_acc": 0.26634402272252533 + }, + { + "epoch": 1.2509527997654648, + "grad_norm": 1.3140557543022733, + "learning_rate": 0.00013993978179654347, + "loss": 3.303408145904541, + "step": 2134, + "token_acc": 0.26676655919301107 + }, + { + "epoch": 1.2515391380826737, + "grad_norm": 0.8103839155007129, + "learning_rate": 0.000139939500114927, + "loss": 3.262760639190674, + "step": 2135, + "token_acc": 0.2722601781526456 + }, + { + "epoch": 1.2521254763998828, + "grad_norm": 0.5720632435812079, + "learning_rate": 0.00013993921777632318, + "loss": 3.2550487518310547, + "step": 2136, + "token_acc": 0.2746788766540017 + }, + { + "epoch": 1.2527118147170917, + "grad_norm": 0.5618387843506389, + "learning_rate": 0.00013993893478073468, + "loss": 3.243192672729492, + "step": 2137, + "token_acc": 0.27482293341651115 + }, + { + "epoch": 1.2532981530343008, + "grad_norm": 0.8570650550360386, + "learning_rate": 0.00013993865112816412, + "loss": 3.254253387451172, + "step": 2138, + "token_acc": 0.2717191035988224 + }, + { + "epoch": 1.25388449135151, + "grad_norm": 0.7503022875993651, + "learning_rate": 0.00013993836681861415, + "loss": 3.2357943058013916, + "step": 2139, + "token_acc": 0.27489379768102284 + }, + { + "epoch": 1.2544708296687188, + "grad_norm": 0.8244196220344667, + "learning_rate": 0.0001399380818520875, + "loss": 3.3101367950439453, + "step": 2140, + "token_acc": 0.26650898181874705 + }, + { + "epoch": 1.255057167985928, + "grad_norm": 0.8434348438490735, + "learning_rate": 0.00013993779622858678, + "loss": 3.251595973968506, + "step": 2141, + "token_acc": 0.27360839760293876 + }, + { + "epoch": 1.2556435063031368, + "grad_norm": 0.7006240332628441, + "learning_rate": 0.00013993750994811474, + "loss": 3.2532222270965576, + "step": 2142, + "token_acc": 0.2744377035022503 + }, + { + "epoch": 1.256229844620346, + "grad_norm": 0.7828525019675897, + "learning_rate": 0.00013993722301067403, + "loss": 3.2525672912597656, + "step": 2143, + "token_acc": 0.2729189545354711 + }, + { + "epoch": 1.256816182937555, + "grad_norm": 0.7609387188798483, + "learning_rate": 0.00013993693541626732, + "loss": 3.326597213745117, + "step": 2144, + "token_acc": 0.26526368113780807 + }, + { + "epoch": 1.257402521254764, + "grad_norm": 0.6039315136945768, + "learning_rate": 0.00013993664716489737, + "loss": 3.2765822410583496, + "step": 2145, + "token_acc": 0.26981815674167436 + }, + { + "epoch": 1.257988859571973, + "grad_norm": 0.9156171550963622, + "learning_rate": 0.00013993635825656687, + "loss": 3.279661178588867, + "step": 2146, + "token_acc": 0.2692610735297929 + }, + { + "epoch": 1.258575197889182, + "grad_norm": 0.8377394965404859, + "learning_rate": 0.0001399360686912785, + "loss": 3.210858106613159, + "step": 2147, + "token_acc": 0.27934817466146666 + }, + { + "epoch": 1.259161536206391, + "grad_norm": 0.8144310150812833, + "learning_rate": 0.00013993577846903502, + "loss": 3.296915054321289, + "step": 2148, + "token_acc": 0.2678595490335161 + }, + { + "epoch": 1.2597478745236002, + "grad_norm": 0.6878299396746819, + "learning_rate": 0.00013993548758983913, + "loss": 3.2164597511291504, + "step": 2149, + "token_acc": 0.2772925017172544 + }, + { + "epoch": 1.2603342128408093, + "grad_norm": 0.695183962294549, + "learning_rate": 0.0001399351960536936, + "loss": 3.2438132762908936, + "step": 2150, + "token_acc": 0.27531160705165647 + }, + { + "epoch": 1.2609205511580182, + "grad_norm": 0.7604190801224, + "learning_rate": 0.0001399349038606011, + "loss": 3.244269847869873, + "step": 2151, + "token_acc": 0.2743884765157339 + }, + { + "epoch": 1.2615068894752273, + "grad_norm": 0.6687786812743288, + "learning_rate": 0.00013993461101056444, + "loss": 3.2383170127868652, + "step": 2152, + "token_acc": 0.27533110940575195 + }, + { + "epoch": 1.2620932277924362, + "grad_norm": 0.7039604666216203, + "learning_rate": 0.00013993431750358633, + "loss": 3.2438838481903076, + "step": 2153, + "token_acc": 0.2748859424962873 + }, + { + "epoch": 1.2626795661096453, + "grad_norm": 0.5303302765718964, + "learning_rate": 0.00013993402333966959, + "loss": 3.2546653747558594, + "step": 2154, + "token_acc": 0.2726827423116382 + }, + { + "epoch": 1.2632659044268544, + "grad_norm": 0.7134035170074915, + "learning_rate": 0.0001399337285188169, + "loss": 3.2302613258361816, + "step": 2155, + "token_acc": 0.27678849452434356 + }, + { + "epoch": 1.2638522427440633, + "grad_norm": 0.7681233256057425, + "learning_rate": 0.00013993343304103108, + "loss": 3.2539103031158447, + "step": 2156, + "token_acc": 0.27249049587635293 + }, + { + "epoch": 1.2644385810612724, + "grad_norm": 0.6052256364281839, + "learning_rate": 0.0001399331369063149, + "loss": 3.318514823913574, + "step": 2157, + "token_acc": 0.2646158045155009 + }, + { + "epoch": 1.2650249193784813, + "grad_norm": 0.7278428143331697, + "learning_rate": 0.0001399328401146711, + "loss": 3.2906060218811035, + "step": 2158, + "token_acc": 0.2665939524554187 + }, + { + "epoch": 1.2656112576956904, + "grad_norm": 0.6841650320078934, + "learning_rate": 0.00013993254266610254, + "loss": 3.234330654144287, + "step": 2159, + "token_acc": 0.2754574752541239 + }, + { + "epoch": 1.2661975960128995, + "grad_norm": 0.6959107507921208, + "learning_rate": 0.00013993224456061197, + "loss": 3.236572265625, + "step": 2160, + "token_acc": 0.2746586085993726 + }, + { + "epoch": 1.2667839343301086, + "grad_norm": 0.639403218909042, + "learning_rate": 0.0001399319457982022, + "loss": 3.2537500858306885, + "step": 2161, + "token_acc": 0.27302771278772575 + }, + { + "epoch": 1.2673702726473175, + "grad_norm": 0.5497072981960509, + "learning_rate": 0.000139931646378876, + "loss": 3.2242488861083984, + "step": 2162, + "token_acc": 0.2782415443690495 + }, + { + "epoch": 1.2679566109645266, + "grad_norm": 0.7506794427588356, + "learning_rate": 0.00013993134630263624, + "loss": 3.287839412689209, + "step": 2163, + "token_acc": 0.267107909070328 + }, + { + "epoch": 1.2685429492817355, + "grad_norm": 0.7600482615707181, + "learning_rate": 0.00013993104556948572, + "loss": 3.2164316177368164, + "step": 2164, + "token_acc": 0.27654171423246277 + }, + { + "epoch": 1.2691292875989446, + "grad_norm": 0.6928488895360626, + "learning_rate": 0.00013993074417942725, + "loss": 3.219045639038086, + "step": 2165, + "token_acc": 0.279921189231621 + }, + { + "epoch": 1.2697156259161537, + "grad_norm": 0.5433994634145555, + "learning_rate": 0.0001399304421324637, + "loss": 3.2466506958007812, + "step": 2166, + "token_acc": 0.2734385088072433 + }, + { + "epoch": 1.2703019642333626, + "grad_norm": 0.7280912637355488, + "learning_rate": 0.00013993013942859784, + "loss": 3.266098976135254, + "step": 2167, + "token_acc": 0.27145218770599866 + }, + { + "epoch": 1.2708883025505717, + "grad_norm": 0.8838009378643746, + "learning_rate": 0.0001399298360678326, + "loss": 3.2824175357818604, + "step": 2168, + "token_acc": 0.26815708601226174 + }, + { + "epoch": 1.2714746408677806, + "grad_norm": 0.8937601834621671, + "learning_rate": 0.00013992953205017075, + "loss": 3.280222177505493, + "step": 2169, + "token_acc": 0.26840512223515717 + }, + { + "epoch": 1.2720609791849897, + "grad_norm": 0.951924473822936, + "learning_rate": 0.00013992922737561517, + "loss": 3.260284423828125, + "step": 2170, + "token_acc": 0.27293685486911423 + }, + { + "epoch": 1.2726473175021988, + "grad_norm": 1.1055187112944074, + "learning_rate": 0.0001399289220441687, + "loss": 3.2998034954071045, + "step": 2171, + "token_acc": 0.26654556432572746 + }, + { + "epoch": 1.2732336558194077, + "grad_norm": 0.8585618721513086, + "learning_rate": 0.0001399286160558343, + "loss": 3.2888336181640625, + "step": 2172, + "token_acc": 0.2689896313690352 + }, + { + "epoch": 1.2738199941366168, + "grad_norm": 0.8033663032148733, + "learning_rate": 0.00013992830941061477, + "loss": 3.2678914070129395, + "step": 2173, + "token_acc": 0.27175066748369586 + }, + { + "epoch": 1.2744063324538257, + "grad_norm": 0.8966734838661492, + "learning_rate": 0.00013992800210851298, + "loss": 3.25895619392395, + "step": 2174, + "token_acc": 0.27261749807894364 + }, + { + "epoch": 1.2749926707710348, + "grad_norm": 0.7665915285163452, + "learning_rate": 0.00013992769414953183, + "loss": 3.279198408126831, + "step": 2175, + "token_acc": 0.26911084106833516 + }, + { + "epoch": 1.275579009088244, + "grad_norm": 0.6305990459156104, + "learning_rate": 0.00013992738553367427, + "loss": 3.263517379760742, + "step": 2176, + "token_acc": 0.2719003525849745 + }, + { + "epoch": 1.276165347405453, + "grad_norm": 0.6599784062293278, + "learning_rate": 0.00013992707626094312, + "loss": 3.2567808628082275, + "step": 2177, + "token_acc": 0.2723966975750199 + }, + { + "epoch": 1.276751685722662, + "grad_norm": 0.5030147367007866, + "learning_rate": 0.00013992676633134134, + "loss": 3.2581636905670166, + "step": 2178, + "token_acc": 0.27204981519166077 + }, + { + "epoch": 1.277338024039871, + "grad_norm": 0.5296856079452157, + "learning_rate": 0.00013992645574487176, + "loss": 3.2410213947296143, + "step": 2179, + "token_acc": 0.2739711283617431 + }, + { + "epoch": 1.27792436235708, + "grad_norm": 0.5666534333246164, + "learning_rate": 0.0001399261445015374, + "loss": 3.2737512588500977, + "step": 2180, + "token_acc": 0.2682639806658488 + }, + { + "epoch": 1.278510700674289, + "grad_norm": 0.48618000394802513, + "learning_rate": 0.00013992583260134112, + "loss": 3.261039972305298, + "step": 2181, + "token_acc": 0.2719647774525198 + }, + { + "epoch": 1.2790970389914982, + "grad_norm": 0.527101327377743, + "learning_rate": 0.0001399255200442859, + "loss": 3.288301944732666, + "step": 2182, + "token_acc": 0.268849431525081 + }, + { + "epoch": 1.279683377308707, + "grad_norm": 0.8100077547456774, + "learning_rate": 0.00013992520683037463, + "loss": 3.251373052597046, + "step": 2183, + "token_acc": 0.2743277104698923 + }, + { + "epoch": 1.2802697156259162, + "grad_norm": 0.9152355257963801, + "learning_rate": 0.00013992489295961026, + "loss": 3.2229623794555664, + "step": 2184, + "token_acc": 0.27634457434965853 + }, + { + "epoch": 1.280856053943125, + "grad_norm": 0.8150004593288109, + "learning_rate": 0.00013992457843199575, + "loss": 3.2767679691314697, + "step": 2185, + "token_acc": 0.26813752104550215 + }, + { + "epoch": 1.2814423922603342, + "grad_norm": 0.9523348961348439, + "learning_rate": 0.00013992426324753403, + "loss": 3.2204365730285645, + "step": 2186, + "token_acc": 0.2771839018671254 + }, + { + "epoch": 1.2820287305775433, + "grad_norm": 1.0640896897353698, + "learning_rate": 0.00013992394740622812, + "loss": 3.294424533843994, + "step": 2187, + "token_acc": 0.2661403860349608 + }, + { + "epoch": 1.2826150688947524, + "grad_norm": 0.749922571141057, + "learning_rate": 0.00013992363090808093, + "loss": 3.2759032249450684, + "step": 2188, + "token_acc": 0.26952181085321375 + }, + { + "epoch": 1.2832014072119613, + "grad_norm": 0.7921172047424397, + "learning_rate": 0.00013992331375309544, + "loss": 3.232530117034912, + "step": 2189, + "token_acc": 0.27504480618926236 + }, + { + "epoch": 1.2837877455291704, + "grad_norm": 0.9163259784355665, + "learning_rate": 0.00013992299594127463, + "loss": 3.257448673248291, + "step": 2190, + "token_acc": 0.273471002620797 + }, + { + "epoch": 1.2843740838463793, + "grad_norm": 1.0882052565088902, + "learning_rate": 0.00013992267747262152, + "loss": 3.254312515258789, + "step": 2191, + "token_acc": 0.2736796592599754 + }, + { + "epoch": 1.2849604221635884, + "grad_norm": 0.9771471248614845, + "learning_rate": 0.0001399223583471391, + "loss": 3.2629873752593994, + "step": 2192, + "token_acc": 0.2735318729755063 + }, + { + "epoch": 1.2855467604807975, + "grad_norm": 0.8248492949865283, + "learning_rate": 0.0001399220385648303, + "loss": 3.293623208999634, + "step": 2193, + "token_acc": 0.26619862365048297 + }, + { + "epoch": 1.2861330987980064, + "grad_norm": 0.8763376985350948, + "learning_rate": 0.0001399217181256982, + "loss": 3.2297704219818115, + "step": 2194, + "token_acc": 0.27665187517151746 + }, + { + "epoch": 1.2867194371152155, + "grad_norm": 0.8743851267136841, + "learning_rate": 0.00013992139702974574, + "loss": 3.2616500854492188, + "step": 2195, + "token_acc": 0.272131557971755 + }, + { + "epoch": 1.2873057754324244, + "grad_norm": 0.6816364200656553, + "learning_rate": 0.000139921075276976, + "loss": 3.3285412788391113, + "step": 2196, + "token_acc": 0.26408010330017684 + }, + { + "epoch": 1.2878921137496335, + "grad_norm": 0.7374953897483747, + "learning_rate": 0.00013992075286739197, + "loss": 3.2469429969787598, + "step": 2197, + "token_acc": 0.2743279679712395 + }, + { + "epoch": 1.2884784520668426, + "grad_norm": 0.6696799842656691, + "learning_rate": 0.00013992042980099672, + "loss": 3.282231330871582, + "step": 2198, + "token_acc": 0.27053639200472296 + }, + { + "epoch": 1.2890647903840515, + "grad_norm": 0.7366887469516081, + "learning_rate": 0.00013992010607779323, + "loss": 3.2968177795410156, + "step": 2199, + "token_acc": 0.2692325320839275 + }, + { + "epoch": 1.2896511287012606, + "grad_norm": 0.6935747589713157, + "learning_rate": 0.00013991978169778453, + "loss": 3.1987712383270264, + "step": 2200, + "token_acc": 0.280801938683617 + }, + { + "epoch": 1.2902374670184695, + "grad_norm": 0.6438118179328719, + "learning_rate": 0.00013991945666097375, + "loss": 3.2876834869384766, + "step": 2201, + "token_acc": 0.2694637174139596 + }, + { + "epoch": 1.2908238053356786, + "grad_norm": 0.6707368068251847, + "learning_rate": 0.00013991913096736384, + "loss": 3.2258429527282715, + "step": 2202, + "token_acc": 0.2751306023899791 + }, + { + "epoch": 1.2914101436528878, + "grad_norm": 0.9587532855426293, + "learning_rate": 0.00013991880461695797, + "loss": 3.187530994415283, + "step": 2203, + "token_acc": 0.28280909928009373 + }, + { + "epoch": 1.2919964819700969, + "grad_norm": 0.8824033870920983, + "learning_rate": 0.00013991847760975909, + "loss": 3.28365421295166, + "step": 2204, + "token_acc": 0.271241541038526 + }, + { + "epoch": 1.2925828202873058, + "grad_norm": 0.6811511793129583, + "learning_rate": 0.00013991814994577037, + "loss": 3.2248082160949707, + "step": 2205, + "token_acc": 0.27781760514424747 + }, + { + "epoch": 1.2931691586045149, + "grad_norm": 0.6436003691980726, + "learning_rate": 0.00013991782162499483, + "loss": 3.1893463134765625, + "step": 2206, + "token_acc": 0.28056817454731586 + }, + { + "epoch": 1.2937554969217238, + "grad_norm": 0.7087569458317957, + "learning_rate": 0.00013991749264743558, + "loss": 3.2576708793640137, + "step": 2207, + "token_acc": 0.27270859350817084 + }, + { + "epoch": 1.2943418352389329, + "grad_norm": 0.8650500063591986, + "learning_rate": 0.00013991716301309568, + "loss": 3.255648612976074, + "step": 2208, + "token_acc": 0.273727862788994 + }, + { + "epoch": 1.294928173556142, + "grad_norm": 0.8037243580309142, + "learning_rate": 0.00013991683272197828, + "loss": 3.2718734741210938, + "step": 2209, + "token_acc": 0.27259017118312173 + }, + { + "epoch": 1.2955145118733509, + "grad_norm": 0.7066036507332567, + "learning_rate": 0.00013991650177408644, + "loss": 3.2621817588806152, + "step": 2210, + "token_acc": 0.27049663673034874 + }, + { + "epoch": 1.29610085019056, + "grad_norm": 0.6380005805725367, + "learning_rate": 0.00013991617016942325, + "loss": 3.2806944847106934, + "step": 2211, + "token_acc": 0.27115543491089344 + }, + { + "epoch": 1.2966871885077689, + "grad_norm": 0.6849777060859824, + "learning_rate": 0.00013991583790799188, + "loss": 3.2411704063415527, + "step": 2212, + "token_acc": 0.2750044802980471 + }, + { + "epoch": 1.297273526824978, + "grad_norm": 0.6873223156311801, + "learning_rate": 0.00013991550498979543, + "loss": 3.2397704124450684, + "step": 2213, + "token_acc": 0.2751558923967907 + }, + { + "epoch": 1.297859865142187, + "grad_norm": 0.5073235932678809, + "learning_rate": 0.00013991517141483703, + "loss": 3.2567384243011475, + "step": 2214, + "token_acc": 0.2716094135558726 + }, + { + "epoch": 1.2984462034593962, + "grad_norm": 0.5924135063748301, + "learning_rate": 0.00013991483718311977, + "loss": 3.2667055130004883, + "step": 2215, + "token_acc": 0.27239217887547945 + }, + { + "epoch": 1.299032541776605, + "grad_norm": 0.7007936114896273, + "learning_rate": 0.00013991450229464686, + "loss": 3.245711326599121, + "step": 2216, + "token_acc": 0.2756184664684661 + }, + { + "epoch": 1.2996188800938142, + "grad_norm": 0.7035594150635341, + "learning_rate": 0.0001399141667494214, + "loss": 3.2703652381896973, + "step": 2217, + "token_acc": 0.2696799164236555 + }, + { + "epoch": 1.300205218411023, + "grad_norm": 0.7891022923693524, + "learning_rate": 0.00013991383054744655, + "loss": 3.2577526569366455, + "step": 2218, + "token_acc": 0.2723859837041582 + }, + { + "epoch": 1.3007915567282322, + "grad_norm": 0.6071154305791292, + "learning_rate": 0.00013991349368872548, + "loss": 3.2174909114837646, + "step": 2219, + "token_acc": 0.276816976127321 + }, + { + "epoch": 1.3013778950454413, + "grad_norm": 0.6881930191170118, + "learning_rate": 0.00013991315617326134, + "loss": 3.222494602203369, + "step": 2220, + "token_acc": 0.2765450554791065 + }, + { + "epoch": 1.3019642333626502, + "grad_norm": 0.7950987665635977, + "learning_rate": 0.0001399128180010573, + "loss": 3.281454086303711, + "step": 2221, + "token_acc": 0.27102085591111796 + }, + { + "epoch": 1.3025505716798593, + "grad_norm": 0.7698706310306992, + "learning_rate": 0.00013991247917211657, + "loss": 3.229562759399414, + "step": 2222, + "token_acc": 0.27657702420139535 + }, + { + "epoch": 1.3031369099970682, + "grad_norm": 0.5784162569507464, + "learning_rate": 0.0001399121396864423, + "loss": 3.2224950790405273, + "step": 2223, + "token_acc": 0.27815520952285727 + }, + { + "epoch": 1.3037232483142773, + "grad_norm": 0.6093473439474515, + "learning_rate": 0.00013991179954403766, + "loss": 3.2346553802490234, + "step": 2224, + "token_acc": 0.2759812307024187 + }, + { + "epoch": 1.3043095866314864, + "grad_norm": 0.8181487706003039, + "learning_rate": 0.00013991145874490588, + "loss": 3.2413926124572754, + "step": 2225, + "token_acc": 0.2733392987554866 + }, + { + "epoch": 1.3048959249486953, + "grad_norm": 0.8629654250072551, + "learning_rate": 0.00013991111728905016, + "loss": 3.305009126663208, + "step": 2226, + "token_acc": 0.266866320101724 + }, + { + "epoch": 1.3054822632659044, + "grad_norm": 0.8908262723733779, + "learning_rate": 0.00013991077517647372, + "loss": 3.266554594039917, + "step": 2227, + "token_acc": 0.2719248604677983 + }, + { + "epoch": 1.3060686015831133, + "grad_norm": 0.8457512147509902, + "learning_rate": 0.00013991043240717973, + "loss": 3.2323203086853027, + "step": 2228, + "token_acc": 0.27640771526268265 + }, + { + "epoch": 1.3066549399003224, + "grad_norm": 0.9163951833327513, + "learning_rate": 0.00013991008898117142, + "loss": 3.2162413597106934, + "step": 2229, + "token_acc": 0.2781150258110202 + }, + { + "epoch": 1.3072412782175316, + "grad_norm": 0.7933615059819582, + "learning_rate": 0.00013990974489845205, + "loss": 3.232268810272217, + "step": 2230, + "token_acc": 0.27553774326792196 + }, + { + "epoch": 1.3078276165347407, + "grad_norm": 0.6735725682753338, + "learning_rate": 0.0001399094001590248, + "loss": 3.23925518989563, + "step": 2231, + "token_acc": 0.2760740204520568 + }, + { + "epoch": 1.3084139548519496, + "grad_norm": 0.7372256267676556, + "learning_rate": 0.000139909054762893, + "loss": 3.284658908843994, + "step": 2232, + "token_acc": 0.26910521000173426 + }, + { + "epoch": 1.3090002931691587, + "grad_norm": 0.8957993682206438, + "learning_rate": 0.0001399087087100598, + "loss": 3.254683017730713, + "step": 2233, + "token_acc": 0.27298310690384364 + }, + { + "epoch": 1.3095866314863676, + "grad_norm": 0.9659274396400697, + "learning_rate": 0.00013990836200052846, + "loss": 3.2381184101104736, + "step": 2234, + "token_acc": 0.27497735647090304 + }, + { + "epoch": 1.3101729698035767, + "grad_norm": 0.9664935716020944, + "learning_rate": 0.0001399080146343023, + "loss": 3.264007091522217, + "step": 2235, + "token_acc": 0.2719191768286989 + }, + { + "epoch": 1.3107593081207858, + "grad_norm": 0.7427389549311973, + "learning_rate": 0.00013990766661138451, + "loss": 3.287780284881592, + "step": 2236, + "token_acc": 0.2692399822057968 + }, + { + "epoch": 1.3113456464379947, + "grad_norm": 0.9116604275183391, + "learning_rate": 0.0001399073179317784, + "loss": 3.2880234718322754, + "step": 2237, + "token_acc": 0.26625443731216686 + }, + { + "epoch": 1.3119319847552038, + "grad_norm": 1.0219219832465658, + "learning_rate": 0.0001399069685954873, + "loss": 3.2666263580322266, + "step": 2238, + "token_acc": 0.2697657705616183 + }, + { + "epoch": 1.3125183230724127, + "grad_norm": 0.9070759917080985, + "learning_rate": 0.00013990661860251437, + "loss": 3.283400774002075, + "step": 2239, + "token_acc": 0.26856703586332403 + }, + { + "epoch": 1.3131046613896218, + "grad_norm": 0.690824365002751, + "learning_rate": 0.00013990626795286297, + "loss": 3.2271370887756348, + "step": 2240, + "token_acc": 0.27551172114190103 + }, + { + "epoch": 1.313690999706831, + "grad_norm": 0.6005903088295038, + "learning_rate": 0.00013990591664653638, + "loss": 3.198882579803467, + "step": 2241, + "token_acc": 0.2797137922020676 + }, + { + "epoch": 1.31427733802404, + "grad_norm": 0.7027542094852006, + "learning_rate": 0.0001399055646835379, + "loss": 3.2818074226379395, + "step": 2242, + "token_acc": 0.26854223137672467 + }, + { + "epoch": 1.314863676341249, + "grad_norm": 0.7891555955411103, + "learning_rate": 0.00013990521206387087, + "loss": 3.2377758026123047, + "step": 2243, + "token_acc": 0.2756753256214938 + }, + { + "epoch": 1.315450014658458, + "grad_norm": 0.6325728029170314, + "learning_rate": 0.00013990485878753853, + "loss": 3.2493605613708496, + "step": 2244, + "token_acc": 0.27247135545913986 + }, + { + "epoch": 1.316036352975667, + "grad_norm": 0.5574190799075625, + "learning_rate": 0.00013990450485454426, + "loss": 3.21128511428833, + "step": 2245, + "token_acc": 0.279038450211768 + }, + { + "epoch": 1.316622691292876, + "grad_norm": 0.6902634736874556, + "learning_rate": 0.00013990415026489137, + "loss": 3.250037670135498, + "step": 2246, + "token_acc": 0.27363236299309557 + }, + { + "epoch": 1.3172090296100851, + "grad_norm": 0.567815681570269, + "learning_rate": 0.00013990379501858317, + "loss": 3.2443559169769287, + "step": 2247, + "token_acc": 0.2739386962152828 + }, + { + "epoch": 1.317795367927294, + "grad_norm": 0.621490499052342, + "learning_rate": 0.00013990343911562302, + "loss": 3.2534451484680176, + "step": 2248, + "token_acc": 0.27288684605779845 + }, + { + "epoch": 1.3183817062445031, + "grad_norm": 0.624033612586763, + "learning_rate": 0.00013990308255601425, + "loss": 3.217923164367676, + "step": 2249, + "token_acc": 0.2768168523259721 + }, + { + "epoch": 1.318968044561712, + "grad_norm": 0.516788534824758, + "learning_rate": 0.00013990272533976022, + "loss": 3.248495578765869, + "step": 2250, + "token_acc": 0.27262019460509423 + }, + { + "epoch": 1.3195543828789211, + "grad_norm": 0.6323250216859816, + "learning_rate": 0.00013990236746686427, + "loss": 3.180589437484741, + "step": 2251, + "token_acc": 0.2825820524033971 + }, + { + "epoch": 1.3201407211961302, + "grad_norm": 0.6770601872728982, + "learning_rate": 0.0001399020089373298, + "loss": 3.3008813858032227, + "step": 2252, + "token_acc": 0.26704514000376445 + }, + { + "epoch": 1.3207270595133391, + "grad_norm": 0.5900596975308456, + "learning_rate": 0.00013990164975116013, + "loss": 3.2625155448913574, + "step": 2253, + "token_acc": 0.2724385521574669 + }, + { + "epoch": 1.3213133978305482, + "grad_norm": 0.5370764327915902, + "learning_rate": 0.00013990128990835866, + "loss": 3.2561521530151367, + "step": 2254, + "token_acc": 0.2731086822233148 + }, + { + "epoch": 1.3218997361477571, + "grad_norm": 0.5898362850696133, + "learning_rate": 0.00013990092940892874, + "loss": 3.2581562995910645, + "step": 2255, + "token_acc": 0.2722880857460297 + }, + { + "epoch": 1.3224860744649662, + "grad_norm": 0.7490815052290202, + "learning_rate": 0.0001399005682528738, + "loss": 3.2722625732421875, + "step": 2256, + "token_acc": 0.2713863299755255 + }, + { + "epoch": 1.3230724127821754, + "grad_norm": 0.6981011104548723, + "learning_rate": 0.00013990020644019722, + "loss": 3.1947450637817383, + "step": 2257, + "token_acc": 0.2783922531007495 + }, + { + "epoch": 1.3236587510993845, + "grad_norm": 0.5768251372899952, + "learning_rate": 0.00013989984397090238, + "loss": 3.215456485748291, + "step": 2258, + "token_acc": 0.27762382317365286 + }, + { + "epoch": 1.3242450894165934, + "grad_norm": 0.5819599066154665, + "learning_rate": 0.00013989948084499273, + "loss": 3.2729148864746094, + "step": 2259, + "token_acc": 0.2707146362251091 + }, + { + "epoch": 1.3248314277338025, + "grad_norm": 0.6581855038886422, + "learning_rate": 0.0001398991170624716, + "loss": 3.233405113220215, + "step": 2260, + "token_acc": 0.2749104741372609 + }, + { + "epoch": 1.3254177660510114, + "grad_norm": 0.7116785490287024, + "learning_rate": 0.0001398987526233425, + "loss": 3.279329776763916, + "step": 2261, + "token_acc": 0.2687734610477265 + }, + { + "epoch": 1.3260041043682205, + "grad_norm": 0.6626271327531708, + "learning_rate": 0.00013989838752760878, + "loss": 3.231142520904541, + "step": 2262, + "token_acc": 0.2738908683335133 + }, + { + "epoch": 1.3265904426854296, + "grad_norm": 0.5163449107672403, + "learning_rate": 0.0001398980217752739, + "loss": 3.2220101356506348, + "step": 2263, + "token_acc": 0.27688355266607245 + }, + { + "epoch": 1.3271767810026385, + "grad_norm": 0.5105379548865576, + "learning_rate": 0.0001398976553663413, + "loss": 3.212674140930176, + "step": 2264, + "token_acc": 0.2791656201668231 + }, + { + "epoch": 1.3277631193198476, + "grad_norm": 0.597500451973825, + "learning_rate": 0.00013989728830081442, + "loss": 3.259384870529175, + "step": 2265, + "token_acc": 0.2724031226533482 + }, + { + "epoch": 1.3283494576370565, + "grad_norm": 0.6248314555747955, + "learning_rate": 0.0001398969205786967, + "loss": 3.2520155906677246, + "step": 2266, + "token_acc": 0.2724840644010078 + }, + { + "epoch": 1.3289357959542656, + "grad_norm": 0.5753221944334733, + "learning_rate": 0.0001398965521999916, + "loss": 3.2315878868103027, + "step": 2267, + "token_acc": 0.2770375448435116 + }, + { + "epoch": 1.3295221342714747, + "grad_norm": 0.5972267405912591, + "learning_rate": 0.00013989618316470257, + "loss": 3.223360061645508, + "step": 2268, + "token_acc": 0.2743675983556913 + }, + { + "epoch": 1.3301084725886836, + "grad_norm": 0.7280199036509593, + "learning_rate": 0.0001398958134728331, + "loss": 3.2836251258850098, + "step": 2269, + "token_acc": 0.268682654627349 + }, + { + "epoch": 1.3306948109058927, + "grad_norm": 0.6239475833475651, + "learning_rate": 0.00013989544312438665, + "loss": 3.2127394676208496, + "step": 2270, + "token_acc": 0.2777651083238312 + }, + { + "epoch": 1.3312811492231018, + "grad_norm": 0.5717660346157789, + "learning_rate": 0.00013989507211936667, + "loss": 3.2350687980651855, + "step": 2271, + "token_acc": 0.2741548650819016 + }, + { + "epoch": 1.3318674875403107, + "grad_norm": 0.689603580240118, + "learning_rate": 0.0001398947004577767, + "loss": 3.2244174480438232, + "step": 2272, + "token_acc": 0.27596050115890325 + }, + { + "epoch": 1.3324538258575198, + "grad_norm": 0.7110980711380444, + "learning_rate": 0.00013989432813962018, + "loss": 3.256523609161377, + "step": 2273, + "token_acc": 0.2724349864536098 + }, + { + "epoch": 1.333040164174729, + "grad_norm": 0.5711239219723008, + "learning_rate": 0.00013989395516490066, + "loss": 3.267484188079834, + "step": 2274, + "token_acc": 0.27222280970760854 + }, + { + "epoch": 1.3336265024919378, + "grad_norm": 0.5839717509114866, + "learning_rate": 0.0001398935815336216, + "loss": 3.2843594551086426, + "step": 2275, + "token_acc": 0.26760402288542473 + }, + { + "epoch": 1.334212840809147, + "grad_norm": 0.647329681353092, + "learning_rate": 0.00013989320724578651, + "loss": 3.282942533493042, + "step": 2276, + "token_acc": 0.2695252760558561 + }, + { + "epoch": 1.3347991791263558, + "grad_norm": 0.605308842065589, + "learning_rate": 0.00013989283230139894, + "loss": 3.2511353492736816, + "step": 2277, + "token_acc": 0.2724982940633404 + }, + { + "epoch": 1.335385517443565, + "grad_norm": 0.6458725221551095, + "learning_rate": 0.00013989245670046238, + "loss": 3.1752777099609375, + "step": 2278, + "token_acc": 0.28377980894783866 + }, + { + "epoch": 1.335971855760774, + "grad_norm": 0.6230641430460644, + "learning_rate": 0.0001398920804429804, + "loss": 3.2585110664367676, + "step": 2279, + "token_acc": 0.2709860261083655 + }, + { + "epoch": 1.336558194077983, + "grad_norm": 0.7087307441032221, + "learning_rate": 0.00013989170352895648, + "loss": 3.2426064014434814, + "step": 2280, + "token_acc": 0.27449025706462504 + }, + { + "epoch": 1.337144532395192, + "grad_norm": 0.8242285848711308, + "learning_rate": 0.00013989132595839418, + "loss": 3.2599806785583496, + "step": 2281, + "token_acc": 0.27169661015607893 + }, + { + "epoch": 1.337730870712401, + "grad_norm": 1.1412306922766946, + "learning_rate": 0.00013989094773129705, + "loss": 3.186037540435791, + "step": 2282, + "token_acc": 0.2826199491797549 + }, + { + "epoch": 1.33831720902961, + "grad_norm": 0.9834134779009901, + "learning_rate": 0.00013989056884766867, + "loss": 3.2204337120056152, + "step": 2283, + "token_acc": 0.27580652916005577 + }, + { + "epoch": 1.3389035473468192, + "grad_norm": 0.7613876138471651, + "learning_rate": 0.00013989018930751253, + "loss": 3.2575087547302246, + "step": 2284, + "token_acc": 0.2708401408469113 + }, + { + "epoch": 1.3394898856640283, + "grad_norm": 0.6574144687255081, + "learning_rate": 0.0001398898091108323, + "loss": 3.2546191215515137, + "step": 2285, + "token_acc": 0.2717124869048709 + }, + { + "epoch": 1.3400762239812372, + "grad_norm": 0.6381101187137609, + "learning_rate": 0.00013988942825763145, + "loss": 3.349344253540039, + "step": 2286, + "token_acc": 0.2600196950962172 + }, + { + "epoch": 1.3406625622984463, + "grad_norm": 0.8080468526471691, + "learning_rate": 0.00013988904674791362, + "loss": 3.203268527984619, + "step": 2287, + "token_acc": 0.2799437167390296 + }, + { + "epoch": 1.3412489006156552, + "grad_norm": 0.8668917357778302, + "learning_rate": 0.00013988866458168234, + "loss": 3.2792110443115234, + "step": 2288, + "token_acc": 0.2674151991490368 + }, + { + "epoch": 1.3418352389328643, + "grad_norm": 0.6711100042152606, + "learning_rate": 0.00013988828175894128, + "loss": 3.2638742923736572, + "step": 2289, + "token_acc": 0.27239694471706005 + }, + { + "epoch": 1.3424215772500734, + "grad_norm": 0.580276376009243, + "learning_rate": 0.00013988789827969395, + "loss": 3.232861280441284, + "step": 2290, + "token_acc": 0.2746714289847373 + }, + { + "epoch": 1.3430079155672823, + "grad_norm": 0.5652726388790901, + "learning_rate": 0.000139887514143944, + "loss": 3.240216016769409, + "step": 2291, + "token_acc": 0.27436996642973094 + }, + { + "epoch": 1.3435942538844914, + "grad_norm": 0.7195392428590845, + "learning_rate": 0.00013988712935169504, + "loss": 3.2431468963623047, + "step": 2292, + "token_acc": 0.2722858337241787 + }, + { + "epoch": 1.3441805922017003, + "grad_norm": 0.8354717541266719, + "learning_rate": 0.00013988674390295064, + "loss": 3.2693238258361816, + "step": 2293, + "token_acc": 0.2717522861753631 + }, + { + "epoch": 1.3447669305189094, + "grad_norm": 0.6683114300036848, + "learning_rate": 0.0001398863577977145, + "loss": 3.2912442684173584, + "step": 2294, + "token_acc": 0.2675400232413343 + }, + { + "epoch": 1.3453532688361185, + "grad_norm": 0.548183158486377, + "learning_rate": 0.00013988597103599016, + "loss": 3.1789822578430176, + "step": 2295, + "token_acc": 0.2833757828023111 + }, + { + "epoch": 1.3459396071533274, + "grad_norm": 0.6354089163628652, + "learning_rate": 0.00013988558361778135, + "loss": 3.2483458518981934, + "step": 2296, + "token_acc": 0.2726482912797258 + }, + { + "epoch": 1.3465259454705365, + "grad_norm": 0.49226274844270673, + "learning_rate": 0.00013988519554309159, + "loss": 3.254448652267456, + "step": 2297, + "token_acc": 0.27245641278137583 + }, + { + "epoch": 1.3471122837877456, + "grad_norm": 0.5895147834176369, + "learning_rate": 0.00013988480681192465, + "loss": 3.2330968379974365, + "step": 2298, + "token_acc": 0.27545101993126064 + }, + { + "epoch": 1.3476986221049545, + "grad_norm": 0.49747834714724776, + "learning_rate": 0.00013988441742428408, + "loss": 3.29148530960083, + "step": 2299, + "token_acc": 0.26753565688902187 + }, + { + "epoch": 1.3482849604221636, + "grad_norm": 0.5601327549414155, + "learning_rate": 0.00013988402738017357, + "loss": 3.2296857833862305, + "step": 2300, + "token_acc": 0.27627369714925065 + }, + { + "epoch": 1.3488712987393727, + "grad_norm": 0.596455531898281, + "learning_rate": 0.00013988363667959684, + "loss": 3.2047741413116455, + "step": 2301, + "token_acc": 0.2790164573705095 + }, + { + "epoch": 1.3494576370565816, + "grad_norm": 0.5732680965642541, + "learning_rate": 0.00013988324532255748, + "loss": 3.205781936645508, + "step": 2302, + "token_acc": 0.2773472345372769 + }, + { + "epoch": 1.3500439753737907, + "grad_norm": 0.5868920884054873, + "learning_rate": 0.0001398828533090592, + "loss": 3.232273578643799, + "step": 2303, + "token_acc": 0.27477729808316115 + }, + { + "epoch": 1.3506303136909996, + "grad_norm": 0.8164791487132735, + "learning_rate": 0.00013988246063910566, + "loss": 3.216388702392578, + "step": 2304, + "token_acc": 0.2766196199030835 + }, + { + "epoch": 1.3512166520082087, + "grad_norm": 1.0348367988049, + "learning_rate": 0.0001398820673127006, + "loss": 3.2795491218566895, + "step": 2305, + "token_acc": 0.2686818581098538 + }, + { + "epoch": 1.3518029903254178, + "grad_norm": 0.8079211873004316, + "learning_rate": 0.0001398816733298477, + "loss": 3.2118287086486816, + "step": 2306, + "token_acc": 0.27925875729774813 + }, + { + "epoch": 1.3523893286426267, + "grad_norm": 0.7937759856415924, + "learning_rate": 0.00013988127869055063, + "loss": 3.2431528568267822, + "step": 2307, + "token_acc": 0.2752873104946824 + }, + { + "epoch": 1.3529756669598358, + "grad_norm": 0.8032954254740285, + "learning_rate": 0.0001398808833948131, + "loss": 3.32612681388855, + "step": 2308, + "token_acc": 0.26353204990577067 + }, + { + "epoch": 1.3535620052770447, + "grad_norm": 0.7511147661094716, + "learning_rate": 0.00013988048744263885, + "loss": 3.2986321449279785, + "step": 2309, + "token_acc": 0.2661949105907369 + }, + { + "epoch": 1.3541483435942538, + "grad_norm": 0.5526329316514107, + "learning_rate": 0.0001398800908340316, + "loss": 3.2262182235717773, + "step": 2310, + "token_acc": 0.27626612121975225 + }, + { + "epoch": 1.354734681911463, + "grad_norm": 0.609129984802882, + "learning_rate": 0.00013987969356899502, + "loss": 3.233227252960205, + "step": 2311, + "token_acc": 0.27597269106174366 + }, + { + "epoch": 1.355321020228672, + "grad_norm": 0.5599129746364312, + "learning_rate": 0.0001398792956475329, + "loss": 3.2175755500793457, + "step": 2312, + "token_acc": 0.27763282641527753 + }, + { + "epoch": 1.355907358545881, + "grad_norm": 0.567176584633, + "learning_rate": 0.00013987889706964897, + "loss": 3.221271276473999, + "step": 2313, + "token_acc": 0.2751948760171512 + }, + { + "epoch": 1.35649369686309, + "grad_norm": 0.6933566821265675, + "learning_rate": 0.00013987849783534697, + "loss": 3.273721218109131, + "step": 2314, + "token_acc": 0.2692191581240894 + }, + { + "epoch": 1.357080035180299, + "grad_norm": 0.5323468311091328, + "learning_rate": 0.00013987809794463064, + "loss": 3.213636636734009, + "step": 2315, + "token_acc": 0.2771746696263823 + }, + { + "epoch": 1.357666373497508, + "grad_norm": 0.5046379123632694, + "learning_rate": 0.00013987769739750374, + "loss": 3.231790542602539, + "step": 2316, + "token_acc": 0.27432450348686127 + }, + { + "epoch": 1.3582527118147172, + "grad_norm": 0.6431027410514616, + "learning_rate": 0.00013987729619397004, + "loss": 3.2239303588867188, + "step": 2317, + "token_acc": 0.2743623516245087 + }, + { + "epoch": 1.358839050131926, + "grad_norm": 0.5619226316411767, + "learning_rate": 0.00013987689433403328, + "loss": 3.2327990531921387, + "step": 2318, + "token_acc": 0.2729363313894066 + }, + { + "epoch": 1.3594253884491352, + "grad_norm": 0.5748305354267821, + "learning_rate": 0.00013987649181769729, + "loss": 3.2189276218414307, + "step": 2319, + "token_acc": 0.2779763506155758 + }, + { + "epoch": 1.360011726766344, + "grad_norm": 0.69330126014728, + "learning_rate": 0.00013987608864496578, + "loss": 3.1874217987060547, + "step": 2320, + "token_acc": 0.28190465745984744 + }, + { + "epoch": 1.3605980650835532, + "grad_norm": 0.6851989188654681, + "learning_rate": 0.0001398756848158426, + "loss": 3.2739462852478027, + "step": 2321, + "token_acc": 0.2702360996310058 + }, + { + "epoch": 1.3611844034007623, + "grad_norm": 0.5630397555433547, + "learning_rate": 0.00013987528033033154, + "loss": 3.2431530952453613, + "step": 2322, + "token_acc": 0.2737073383548755 + }, + { + "epoch": 1.3617707417179712, + "grad_norm": 0.4886874592968571, + "learning_rate": 0.00013987487518843635, + "loss": 3.252169609069824, + "step": 2323, + "token_acc": 0.2731774366023095 + }, + { + "epoch": 1.3623570800351803, + "grad_norm": 0.6279518013932495, + "learning_rate": 0.00013987446939016086, + "loss": 3.2444815635681152, + "step": 2324, + "token_acc": 0.2743564696253663 + }, + { + "epoch": 1.3629434183523892, + "grad_norm": 0.5291448242790368, + "learning_rate": 0.0001398740629355089, + "loss": 3.2627408504486084, + "step": 2325, + "token_acc": 0.2722443280244412 + }, + { + "epoch": 1.3635297566695983, + "grad_norm": 0.5696048888343826, + "learning_rate": 0.00013987365582448429, + "loss": 3.269171714782715, + "step": 2326, + "token_acc": 0.2705178228953112 + }, + { + "epoch": 1.3641160949868074, + "grad_norm": 0.6442621176747142, + "learning_rate": 0.0001398732480570908, + "loss": 3.2238848209381104, + "step": 2327, + "token_acc": 0.2756909300661825 + }, + { + "epoch": 1.3647024333040165, + "grad_norm": 0.7648645101616475, + "learning_rate": 0.00013987283963333235, + "loss": 3.2791032791137695, + "step": 2328, + "token_acc": 0.2665929699681556 + }, + { + "epoch": 1.3652887716212254, + "grad_norm": 0.5748802047665874, + "learning_rate": 0.0001398724305532127, + "loss": 3.210606336593628, + "step": 2329, + "token_acc": 0.2779026005408796 + }, + { + "epoch": 1.3658751099384345, + "grad_norm": 0.5823731082802434, + "learning_rate": 0.0001398720208167357, + "loss": 3.258233070373535, + "step": 2330, + "token_acc": 0.2721549460359101 + }, + { + "epoch": 1.3664614482556434, + "grad_norm": 0.8683458738525525, + "learning_rate": 0.00013987161042390526, + "loss": 3.2206506729125977, + "step": 2331, + "token_acc": 0.27505782575173476 + }, + { + "epoch": 1.3670477865728525, + "grad_norm": 0.8662192119290943, + "learning_rate": 0.00013987119937472516, + "loss": 3.2621169090270996, + "step": 2332, + "token_acc": 0.2720903830371868 + }, + { + "epoch": 1.3676341248900616, + "grad_norm": 0.754262797356459, + "learning_rate": 0.00013987078766919932, + "loss": 3.264309883117676, + "step": 2333, + "token_acc": 0.27255841560853805 + }, + { + "epoch": 1.3682204632072705, + "grad_norm": 0.5130684829104256, + "learning_rate": 0.00013987037530733157, + "loss": 3.231562614440918, + "step": 2334, + "token_acc": 0.27413996061476176 + }, + { + "epoch": 1.3688068015244796, + "grad_norm": 0.7270047738552735, + "learning_rate": 0.00013986996228912578, + "loss": 3.208850860595703, + "step": 2335, + "token_acc": 0.2774473342494421 + }, + { + "epoch": 1.3693931398416885, + "grad_norm": 0.7999520139967754, + "learning_rate": 0.00013986954861458587, + "loss": 3.2066574096679688, + "step": 2336, + "token_acc": 0.27890765906050446 + }, + { + "epoch": 1.3699794781588976, + "grad_norm": 0.6849005162227756, + "learning_rate": 0.0001398691342837157, + "loss": 3.2388978004455566, + "step": 2337, + "token_acc": 0.27523591025519156 + }, + { + "epoch": 1.3705658164761068, + "grad_norm": 0.7317238359187801, + "learning_rate": 0.00013986871929651913, + "loss": 3.241654872894287, + "step": 2338, + "token_acc": 0.27437202061713833 + }, + { + "epoch": 1.3711521547933159, + "grad_norm": 0.7116512988719098, + "learning_rate": 0.00013986830365300012, + "loss": 3.2158799171447754, + "step": 2339, + "token_acc": 0.2773791952769043 + }, + { + "epoch": 1.3717384931105248, + "grad_norm": 0.686242397338959, + "learning_rate": 0.00013986788735316255, + "loss": 3.2340025901794434, + "step": 2340, + "token_acc": 0.2767342364751287 + }, + { + "epoch": 1.3723248314277339, + "grad_norm": 0.5730839072840935, + "learning_rate": 0.00013986747039701033, + "loss": 3.2664332389831543, + "step": 2341, + "token_acc": 0.2733342807856667 + }, + { + "epoch": 1.3729111697449428, + "grad_norm": 0.5320028174080119, + "learning_rate": 0.00013986705278454736, + "loss": 3.2581794261932373, + "step": 2342, + "token_acc": 0.2737702095691234 + }, + { + "epoch": 1.3734975080621519, + "grad_norm": 0.52884286054953, + "learning_rate": 0.00013986663451577756, + "loss": 3.224905490875244, + "step": 2343, + "token_acc": 0.2765184988604932 + }, + { + "epoch": 1.374083846379361, + "grad_norm": 0.6029620501484472, + "learning_rate": 0.0001398662155907049, + "loss": 3.223026752471924, + "step": 2344, + "token_acc": 0.27664117615380474 + }, + { + "epoch": 1.3746701846965699, + "grad_norm": 0.4965551290674581, + "learning_rate": 0.0001398657960093333, + "loss": 3.2916502952575684, + "step": 2345, + "token_acc": 0.26645371879049906 + }, + { + "epoch": 1.375256523013779, + "grad_norm": 0.48638189298899687, + "learning_rate": 0.00013986537577166666, + "loss": 3.1991796493530273, + "step": 2346, + "token_acc": 0.2798389377791772 + }, + { + "epoch": 1.3758428613309879, + "grad_norm": 0.41998949183780476, + "learning_rate": 0.00013986495487770898, + "loss": 3.2356410026550293, + "step": 2347, + "token_acc": 0.27565120585861036 + }, + { + "epoch": 1.376429199648197, + "grad_norm": 0.5383890677692373, + "learning_rate": 0.00013986453332746418, + "loss": 3.23232102394104, + "step": 2348, + "token_acc": 0.2737869522258125 + }, + { + "epoch": 1.377015537965406, + "grad_norm": 0.6115098432933126, + "learning_rate": 0.00013986411112093625, + "loss": 3.2500758171081543, + "step": 2349, + "token_acc": 0.2716582607213404 + }, + { + "epoch": 1.377601876282615, + "grad_norm": 0.5948083199990917, + "learning_rate": 0.00013986368825812912, + "loss": 3.1810178756713867, + "step": 2350, + "token_acc": 0.28305878917397825 + }, + { + "epoch": 1.378188214599824, + "grad_norm": 0.5273200057222567, + "learning_rate": 0.0001398632647390468, + "loss": 3.279315233230591, + "step": 2351, + "token_acc": 0.2702025776880703 + }, + { + "epoch": 1.378774552917033, + "grad_norm": 0.5346339655896671, + "learning_rate": 0.00013986284056369323, + "loss": 3.2478885650634766, + "step": 2352, + "token_acc": 0.27174627448002453 + }, + { + "epoch": 1.379360891234242, + "grad_norm": 0.5745821107767187, + "learning_rate": 0.00013986241573207242, + "loss": 3.2705845832824707, + "step": 2353, + "token_acc": 0.2701598598429237 + }, + { + "epoch": 1.3799472295514512, + "grad_norm": 0.5074410359556256, + "learning_rate": 0.00013986199024418835, + "loss": 3.2543606758117676, + "step": 2354, + "token_acc": 0.27121840814786874 + }, + { + "epoch": 1.3805335678686603, + "grad_norm": 0.5771362375406369, + "learning_rate": 0.00013986156410004504, + "loss": 3.246710777282715, + "step": 2355, + "token_acc": 0.2740377202679148 + }, + { + "epoch": 1.3811199061858692, + "grad_norm": 0.7245425071904756, + "learning_rate": 0.00013986113729964647, + "loss": 3.2576050758361816, + "step": 2356, + "token_acc": 0.27226459815808995 + }, + { + "epoch": 1.3817062445030783, + "grad_norm": 0.8024161952602262, + "learning_rate": 0.00013986070984299664, + "loss": 3.2356467247009277, + "step": 2357, + "token_acc": 0.2748091603053435 + }, + { + "epoch": 1.3822925828202872, + "grad_norm": 0.9561597859494859, + "learning_rate": 0.00013986028173009962, + "loss": 3.191582441329956, + "step": 2358, + "token_acc": 0.2803484942236536 + }, + { + "epoch": 1.3828789211374963, + "grad_norm": 0.8378281243184932, + "learning_rate": 0.00013985985296095934, + "loss": 3.25502610206604, + "step": 2359, + "token_acc": 0.2713004012642717 + }, + { + "epoch": 1.3834652594547054, + "grad_norm": 0.5274182033306564, + "learning_rate": 0.0001398594235355799, + "loss": 3.230663776397705, + "step": 2360, + "token_acc": 0.27262771680482173 + }, + { + "epoch": 1.3840515977719143, + "grad_norm": 0.6439696068737398, + "learning_rate": 0.0001398589934539653, + "loss": 3.278341293334961, + "step": 2361, + "token_acc": 0.26932425369536866 + }, + { + "epoch": 1.3846379360891234, + "grad_norm": 0.8136282321210115, + "learning_rate": 0.0001398585627161196, + "loss": 3.2586827278137207, + "step": 2362, + "token_acc": 0.2712596000484279 + }, + { + "epoch": 1.3852242744063323, + "grad_norm": 0.5969839917794962, + "learning_rate": 0.00013985813132204685, + "loss": 3.21920108795166, + "step": 2363, + "token_acc": 0.275329395729214 + }, + { + "epoch": 1.3858106127235414, + "grad_norm": 0.5799498682965233, + "learning_rate": 0.00013985769927175108, + "loss": 3.251772403717041, + "step": 2364, + "token_acc": 0.27124433128712977 + }, + { + "epoch": 1.3863969510407506, + "grad_norm": 0.6495105022734131, + "learning_rate": 0.0001398572665652364, + "loss": 3.2348406314849854, + "step": 2365, + "token_acc": 0.2733937873818252 + }, + { + "epoch": 1.3869832893579597, + "grad_norm": 0.45734174272460987, + "learning_rate": 0.0001398568332025068, + "loss": 3.264463424682617, + "step": 2366, + "token_acc": 0.2700660409605294 + }, + { + "epoch": 1.3875696276751686, + "grad_norm": 0.6120249163077733, + "learning_rate": 0.00013985639918356638, + "loss": 3.2258410453796387, + "step": 2367, + "token_acc": 0.27597315223757735 + }, + { + "epoch": 1.3881559659923777, + "grad_norm": 0.7254751182018163, + "learning_rate": 0.0001398559645084192, + "loss": 3.2677292823791504, + "step": 2368, + "token_acc": 0.27128922804816924 + }, + { + "epoch": 1.3887423043095866, + "grad_norm": 0.522960436691094, + "learning_rate": 0.00013985552917706941, + "loss": 3.252495527267456, + "step": 2369, + "token_acc": 0.2712597831635689 + }, + { + "epoch": 1.3893286426267957, + "grad_norm": 0.5916230230610566, + "learning_rate": 0.00013985509318952102, + "loss": 3.244598388671875, + "step": 2370, + "token_acc": 0.27390663261275966 + }, + { + "epoch": 1.3899149809440048, + "grad_norm": 0.6306482480318539, + "learning_rate": 0.0001398546565457782, + "loss": 3.1578688621520996, + "step": 2371, + "token_acc": 0.284473575745248 + }, + { + "epoch": 1.3905013192612137, + "grad_norm": 0.6098504752027983, + "learning_rate": 0.000139854219245845, + "loss": 3.2467424869537354, + "step": 2372, + "token_acc": 0.2719936294227046 + }, + { + "epoch": 1.3910876575784228, + "grad_norm": 0.4964983693689211, + "learning_rate": 0.00013985378128972552, + "loss": 3.194394111633301, + "step": 2373, + "token_acc": 0.2783039528400571 + }, + { + "epoch": 1.3916739958956317, + "grad_norm": 0.5475183553676816, + "learning_rate": 0.0001398533426774239, + "loss": 3.222277879714966, + "step": 2374, + "token_acc": 0.27405161952056706 + }, + { + "epoch": 1.3922603342128408, + "grad_norm": 0.7150686010433159, + "learning_rate": 0.00013985290340894427, + "loss": 3.251976490020752, + "step": 2375, + "token_acc": 0.27342810913894117 + }, + { + "epoch": 1.39284667253005, + "grad_norm": 0.6562257249720154, + "learning_rate": 0.00013985246348429073, + "loss": 3.2200140953063965, + "step": 2376, + "token_acc": 0.2769852347793629 + }, + { + "epoch": 1.3934330108472588, + "grad_norm": 0.5009438376180354, + "learning_rate": 0.00013985202290346741, + "loss": 3.2666430473327637, + "step": 2377, + "token_acc": 0.2698519758012614 + }, + { + "epoch": 1.394019349164468, + "grad_norm": 0.6719709243952812, + "learning_rate": 0.0001398515816664785, + "loss": 3.246985912322998, + "step": 2378, + "token_acc": 0.2732994111673017 + }, + { + "epoch": 1.3946056874816768, + "grad_norm": 0.7914211460824498, + "learning_rate": 0.00013985113977332806, + "loss": 3.221771240234375, + "step": 2379, + "token_acc": 0.2772318972476309 + }, + { + "epoch": 1.395192025798886, + "grad_norm": 0.9393115698433752, + "learning_rate": 0.0001398506972240203, + "loss": 3.2842864990234375, + "step": 2380, + "token_acc": 0.26903255319542757 + }, + { + "epoch": 1.395778364116095, + "grad_norm": 0.9724057797178768, + "learning_rate": 0.00013985025401855937, + "loss": 3.301084280014038, + "step": 2381, + "token_acc": 0.2644298739615482 + }, + { + "epoch": 1.3963647024333041, + "grad_norm": 0.6474235003597507, + "learning_rate": 0.00013984981015694942, + "loss": 3.2261769771575928, + "step": 2382, + "token_acc": 0.2755207770328167 + }, + { + "epoch": 1.396951040750513, + "grad_norm": 0.7111312822339051, + "learning_rate": 0.00013984936563919464, + "loss": 3.252040386199951, + "step": 2383, + "token_acc": 0.27120888652539027 + }, + { + "epoch": 1.3975373790677221, + "grad_norm": 0.6062412462742552, + "learning_rate": 0.0001398489204652992, + "loss": 3.269176483154297, + "step": 2384, + "token_acc": 0.2699137787876312 + }, + { + "epoch": 1.398123717384931, + "grad_norm": 0.6596948096515953, + "learning_rate": 0.00013984847463526727, + "loss": 3.2472195625305176, + "step": 2385, + "token_acc": 0.2737935586061246 + }, + { + "epoch": 1.3987100557021401, + "grad_norm": 0.5952703158323261, + "learning_rate": 0.000139848028149103, + "loss": 3.258357048034668, + "step": 2386, + "token_acc": 0.27197776155156733 + }, + { + "epoch": 1.3992963940193492, + "grad_norm": 0.6740266715112851, + "learning_rate": 0.0001398475810068107, + "loss": 3.270463466644287, + "step": 2387, + "token_acc": 0.26961353915194985 + }, + { + "epoch": 1.3998827323365581, + "grad_norm": 0.6004534191619872, + "learning_rate": 0.00013984713320839445, + "loss": 3.249237537384033, + "step": 2388, + "token_acc": 0.2750294104369209 + }, + { + "epoch": 1.4004690706537672, + "grad_norm": 0.6206680266876937, + "learning_rate": 0.00013984668475385852, + "loss": 3.2286877632141113, + "step": 2389, + "token_acc": 0.27423864938450704 + }, + { + "epoch": 1.4010554089709761, + "grad_norm": 0.5738526176746466, + "learning_rate": 0.0001398462356432071, + "loss": 3.224648952484131, + "step": 2390, + "token_acc": 0.27637327699376435 + }, + { + "epoch": 1.4016417472881852, + "grad_norm": 0.5407964034418798, + "learning_rate": 0.00013984578587644442, + "loss": 3.2323837280273438, + "step": 2391, + "token_acc": 0.2739146905467192 + }, + { + "epoch": 1.4022280856053944, + "grad_norm": 0.5673861611556864, + "learning_rate": 0.00013984533545357468, + "loss": 3.258605480194092, + "step": 2392, + "token_acc": 0.27303377317909666 + }, + { + "epoch": 1.4028144239226035, + "grad_norm": 0.6688576029284901, + "learning_rate": 0.00013984488437460214, + "loss": 3.2233543395996094, + "step": 2393, + "token_acc": 0.27544254390404044 + }, + { + "epoch": 1.4034007622398124, + "grad_norm": 0.6303158626622627, + "learning_rate": 0.00013984443263953105, + "loss": 3.2401535511016846, + "step": 2394, + "token_acc": 0.27658400634068764 + }, + { + "epoch": 1.4039871005570215, + "grad_norm": 0.529217700035354, + "learning_rate": 0.00013984398024836562, + "loss": 3.201978921890259, + "step": 2395, + "token_acc": 0.2781445856654812 + }, + { + "epoch": 1.4045734388742304, + "grad_norm": 0.56607946532355, + "learning_rate": 0.00013984352720111012, + "loss": 3.2326149940490723, + "step": 2396, + "token_acc": 0.27355680344197375 + }, + { + "epoch": 1.4051597771914395, + "grad_norm": 0.512343779762048, + "learning_rate": 0.00013984307349776878, + "loss": 3.2144575119018555, + "step": 2397, + "token_acc": 0.27770261504531385 + }, + { + "epoch": 1.4057461155086486, + "grad_norm": 0.6349051206000139, + "learning_rate": 0.0001398426191383459, + "loss": 3.215420722961426, + "step": 2398, + "token_acc": 0.27814626881180404 + }, + { + "epoch": 1.4063324538258575, + "grad_norm": 0.703018170398127, + "learning_rate": 0.0001398421641228457, + "loss": 3.2789053916931152, + "step": 2399, + "token_acc": 0.2683592564198871 + }, + { + "epoch": 1.4069187921430666, + "grad_norm": 0.7951526408804213, + "learning_rate": 0.0001398417084512725, + "loss": 3.250777244567871, + "step": 2400, + "token_acc": 0.2720056727380486 + }, + { + "epoch": 1.4075051304602755, + "grad_norm": 0.7735204236643429, + "learning_rate": 0.00013984125212363054, + "loss": 3.2899956703186035, + "step": 2401, + "token_acc": 0.2664959626817427 + }, + { + "epoch": 1.4080914687774846, + "grad_norm": 0.6762058240514218, + "learning_rate": 0.00013984079513992416, + "loss": 3.2738871574401855, + "step": 2402, + "token_acc": 0.26916248301042695 + }, + { + "epoch": 1.4086778070946937, + "grad_norm": 0.6554651196168081, + "learning_rate": 0.0001398403375001576, + "loss": 3.228583812713623, + "step": 2403, + "token_acc": 0.27528303819275746 + }, + { + "epoch": 1.4092641454119026, + "grad_norm": 0.7460082060750334, + "learning_rate": 0.00013983987920433518, + "loss": 3.2071847915649414, + "step": 2404, + "token_acc": 0.2807932326334222 + }, + { + "epoch": 1.4098504837291117, + "grad_norm": 0.5417904242913776, + "learning_rate": 0.00013983942025246122, + "loss": 3.222723960876465, + "step": 2405, + "token_acc": 0.2740727833109164 + }, + { + "epoch": 1.4104368220463206, + "grad_norm": 0.6641558497790798, + "learning_rate": 0.00013983896064454003, + "loss": 3.243206739425659, + "step": 2406, + "token_acc": 0.273592803473468 + }, + { + "epoch": 1.4110231603635297, + "grad_norm": 0.5861446436590138, + "learning_rate": 0.00013983850038057588, + "loss": 3.2592337131500244, + "step": 2407, + "token_acc": 0.2714180917646936 + }, + { + "epoch": 1.4116094986807388, + "grad_norm": 0.710919946430669, + "learning_rate": 0.00013983803946057314, + "loss": 3.22426176071167, + "step": 2408, + "token_acc": 0.276631834649132 + }, + { + "epoch": 1.412195836997948, + "grad_norm": 0.8212010231974327, + "learning_rate": 0.00013983757788453612, + "loss": 3.2760419845581055, + "step": 2409, + "token_acc": 0.26962649608980893 + }, + { + "epoch": 1.4127821753151568, + "grad_norm": 0.7552492593883318, + "learning_rate": 0.00013983711565246918, + "loss": 3.2396395206451416, + "step": 2410, + "token_acc": 0.27469004363092897 + }, + { + "epoch": 1.413368513632366, + "grad_norm": 0.641132158730426, + "learning_rate": 0.00013983665276437663, + "loss": 3.1882057189941406, + "step": 2411, + "token_acc": 0.28013891914884786 + }, + { + "epoch": 1.4139548519495748, + "grad_norm": 0.5501504600250186, + "learning_rate": 0.00013983618922026284, + "loss": 3.243659019470215, + "step": 2412, + "token_acc": 0.27248790546435087 + }, + { + "epoch": 1.414541190266784, + "grad_norm": 0.4872316805940302, + "learning_rate": 0.00013983572502013217, + "loss": 3.2369706630706787, + "step": 2413, + "token_acc": 0.2750343893736835 + }, + { + "epoch": 1.415127528583993, + "grad_norm": 0.6545925367804918, + "learning_rate": 0.00013983526016398895, + "loss": 3.202843189239502, + "step": 2414, + "token_acc": 0.27963820229860514 + }, + { + "epoch": 1.415713866901202, + "grad_norm": 0.6920926122356945, + "learning_rate": 0.00013983479465183755, + "loss": 3.2559728622436523, + "step": 2415, + "token_acc": 0.2708508187064386 + }, + { + "epoch": 1.416300205218411, + "grad_norm": 0.6317835047164144, + "learning_rate": 0.0001398343284836824, + "loss": 3.2196531295776367, + "step": 2416, + "token_acc": 0.2762013747396038 + }, + { + "epoch": 1.41688654353562, + "grad_norm": 0.601052014115884, + "learning_rate": 0.00013983386165952781, + "loss": 3.224344253540039, + "step": 2417, + "token_acc": 0.2764773682744379 + }, + { + "epoch": 1.417472881852829, + "grad_norm": 0.5588230081471142, + "learning_rate": 0.0001398333941793782, + "loss": 3.230104923248291, + "step": 2418, + "token_acc": 0.2758006405657887 + }, + { + "epoch": 1.4180592201700382, + "grad_norm": 0.4828314834208616, + "learning_rate": 0.00013983292604323794, + "loss": 3.266181468963623, + "step": 2419, + "token_acc": 0.2693253415399971 + }, + { + "epoch": 1.4186455584872473, + "grad_norm": 0.511727736689772, + "learning_rate": 0.00013983245725111146, + "loss": 3.2578468322753906, + "step": 2420, + "token_acc": 0.2722493178808299 + }, + { + "epoch": 1.4192318968044562, + "grad_norm": 0.5166091308562472, + "learning_rate": 0.00013983198780300311, + "loss": 3.254639148712158, + "step": 2421, + "token_acc": 0.2705831903945112 + }, + { + "epoch": 1.4198182351216653, + "grad_norm": 0.5700928412157392, + "learning_rate": 0.00013983151769891738, + "loss": 3.214261770248413, + "step": 2422, + "token_acc": 0.276788647427155 + }, + { + "epoch": 1.4204045734388742, + "grad_norm": 0.6366111276433738, + "learning_rate": 0.00013983104693885863, + "loss": 3.2175686359405518, + "step": 2423, + "token_acc": 0.2763402208487419 + }, + { + "epoch": 1.4209909117560833, + "grad_norm": 0.5983932926702524, + "learning_rate": 0.0001398305755228313, + "loss": 3.1876025199890137, + "step": 2424, + "token_acc": 0.2799941676941416 + }, + { + "epoch": 1.4215772500732924, + "grad_norm": 0.47244505946272425, + "learning_rate": 0.00013983010345083977, + "loss": 3.1966333389282227, + "step": 2425, + "token_acc": 0.28019370843449803 + }, + { + "epoch": 1.4221635883905013, + "grad_norm": 0.5461328205963042, + "learning_rate": 0.00013982963072288856, + "loss": 3.237764358520508, + "step": 2426, + "token_acc": 0.2748588952968515 + }, + { + "epoch": 1.4227499267077104, + "grad_norm": 0.6377304766389197, + "learning_rate": 0.00013982915733898202, + "loss": 3.253542900085449, + "step": 2427, + "token_acc": 0.27375972870657517 + }, + { + "epoch": 1.4233362650249193, + "grad_norm": 0.5438557474585347, + "learning_rate": 0.00013982868329912468, + "loss": 3.2908785343170166, + "step": 2428, + "token_acc": 0.2678315574928361 + }, + { + "epoch": 1.4239226033421284, + "grad_norm": 0.5964805824243781, + "learning_rate": 0.00013982820860332094, + "loss": 3.2137675285339355, + "step": 2429, + "token_acc": 0.277348267304219 + }, + { + "epoch": 1.4245089416593375, + "grad_norm": 0.6325379556253067, + "learning_rate": 0.0001398277332515753, + "loss": 3.231764793395996, + "step": 2430, + "token_acc": 0.2729342754299427 + }, + { + "epoch": 1.4250952799765464, + "grad_norm": 0.6067508645464627, + "learning_rate": 0.00013982725724389215, + "loss": 3.2056338787078857, + "step": 2431, + "token_acc": 0.27848917339073004 + }, + { + "epoch": 1.4256816182937555, + "grad_norm": 0.5839311101274166, + "learning_rate": 0.00013982678058027605, + "loss": 3.1988816261291504, + "step": 2432, + "token_acc": 0.2788803993790143 + }, + { + "epoch": 1.4262679566109644, + "grad_norm": 0.656760848134075, + "learning_rate": 0.00013982630326073143, + "loss": 3.269249439239502, + "step": 2433, + "token_acc": 0.2686118280776568 + }, + { + "epoch": 1.4268542949281735, + "grad_norm": 0.6449637479988619, + "learning_rate": 0.00013982582528526276, + "loss": 3.2284135818481445, + "step": 2434, + "token_acc": 0.27500480722374265 + }, + { + "epoch": 1.4274406332453826, + "grad_norm": 0.618894237909221, + "learning_rate": 0.00013982534665387458, + "loss": 3.222695827484131, + "step": 2435, + "token_acc": 0.27693057349369815 + }, + { + "epoch": 1.4280269715625917, + "grad_norm": 0.6985254532677547, + "learning_rate": 0.00013982486736657137, + "loss": 3.2316129207611084, + "step": 2436, + "token_acc": 0.27562998987150295 + }, + { + "epoch": 1.4286133098798006, + "grad_norm": 0.6274405318836794, + "learning_rate": 0.0001398243874233576, + "loss": 3.2195935249328613, + "step": 2437, + "token_acc": 0.27584777319894743 + }, + { + "epoch": 1.4291996481970097, + "grad_norm": 0.551814661063161, + "learning_rate": 0.00013982390682423782, + "loss": 3.2372705936431885, + "step": 2438, + "token_acc": 0.27380889832289523 + }, + { + "epoch": 1.4297859865142186, + "grad_norm": 0.4987265102792151, + "learning_rate": 0.0001398234255692165, + "loss": 3.2298762798309326, + "step": 2439, + "token_acc": 0.2737696335078534 + }, + { + "epoch": 1.4303723248314277, + "grad_norm": 0.5893724824781764, + "learning_rate": 0.00013982294365829818, + "loss": 3.2411513328552246, + "step": 2440, + "token_acc": 0.27373193189586453 + }, + { + "epoch": 1.4309586631486368, + "grad_norm": 0.8000977343325651, + "learning_rate": 0.0001398224610914874, + "loss": 3.2341487407684326, + "step": 2441, + "token_acc": 0.2736898429551218 + }, + { + "epoch": 1.4315450014658457, + "grad_norm": 0.7012828912966524, + "learning_rate": 0.0001398219778687887, + "loss": 3.1969716548919678, + "step": 2442, + "token_acc": 0.27902733519927014 + }, + { + "epoch": 1.4321313397830548, + "grad_norm": 0.6541723189277936, + "learning_rate": 0.0001398214939902066, + "loss": 3.222557783126831, + "step": 2443, + "token_acc": 0.27698512154455573 + }, + { + "epoch": 1.4327176781002637, + "grad_norm": 0.5831884616370641, + "learning_rate": 0.00013982100945574566, + "loss": 3.2349190711975098, + "step": 2444, + "token_acc": 0.2730755761380327 + }, + { + "epoch": 1.4333040164174728, + "grad_norm": 0.6127999475862088, + "learning_rate": 0.00013982052426541038, + "loss": 3.2241790294647217, + "step": 2445, + "token_acc": 0.275428150835272 + }, + { + "epoch": 1.433890354734682, + "grad_norm": 0.6460185068876323, + "learning_rate": 0.0001398200384192054, + "loss": 3.231858730316162, + "step": 2446, + "token_acc": 0.2754465230136903 + }, + { + "epoch": 1.434476693051891, + "grad_norm": 0.8507695748449875, + "learning_rate": 0.00013981955191713524, + "loss": 3.2536540031433105, + "step": 2447, + "token_acc": 0.2720825510059989 + }, + { + "epoch": 1.4350630313691, + "grad_norm": 1.050952069242236, + "learning_rate": 0.00013981906475920444, + "loss": 3.217353105545044, + "step": 2448, + "token_acc": 0.2759810951418911 + }, + { + "epoch": 1.435649369686309, + "grad_norm": 0.7885430134156483, + "learning_rate": 0.00013981857694541765, + "loss": 3.2596235275268555, + "step": 2449, + "token_acc": 0.27104769635188297 + }, + { + "epoch": 1.436235708003518, + "grad_norm": 0.587804477966935, + "learning_rate": 0.00013981808847577938, + "loss": 3.1938605308532715, + "step": 2450, + "token_acc": 0.28141183798281244 + }, + { + "epoch": 1.436822046320727, + "grad_norm": 0.7077522822114317, + "learning_rate": 0.00013981759935029425, + "loss": 3.233181953430176, + "step": 2451, + "token_acc": 0.27429254493830546 + }, + { + "epoch": 1.4374083846379362, + "grad_norm": 0.8260550515530766, + "learning_rate": 0.0001398171095689669, + "loss": 3.174018383026123, + "step": 2452, + "token_acc": 0.2815628461133155 + }, + { + "epoch": 1.437994722955145, + "grad_norm": 0.7219596264006004, + "learning_rate": 0.00013981661913180183, + "loss": 3.2035703659057617, + "step": 2453, + "token_acc": 0.2794444414409057 + }, + { + "epoch": 1.4385810612723542, + "grad_norm": 0.9086604556673789, + "learning_rate": 0.00013981612803880373, + "loss": 3.261613130569458, + "step": 2454, + "token_acc": 0.2691828193554097 + }, + { + "epoch": 1.439167399589563, + "grad_norm": 0.6248868631385788, + "learning_rate": 0.00013981563628997717, + "loss": 3.187314033508301, + "step": 2455, + "token_acc": 0.2815027476678622 + }, + { + "epoch": 1.4397537379067722, + "grad_norm": 0.5246225522196596, + "learning_rate": 0.0001398151438853268, + "loss": 3.238859176635742, + "step": 2456, + "token_acc": 0.27350756857768566 + }, + { + "epoch": 1.4403400762239813, + "grad_norm": 0.5735549648006387, + "learning_rate": 0.0001398146508248572, + "loss": 3.2098145484924316, + "step": 2457, + "token_acc": 0.27673749307927753 + }, + { + "epoch": 1.4409264145411902, + "grad_norm": 0.5053864241414865, + "learning_rate": 0.00013981415710857307, + "loss": 3.266812562942505, + "step": 2458, + "token_acc": 0.27105644267410284 + }, + { + "epoch": 1.4415127528583993, + "grad_norm": 0.4913960409943003, + "learning_rate": 0.000139813662736479, + "loss": 3.222018003463745, + "step": 2459, + "token_acc": 0.27398282000821456 + }, + { + "epoch": 1.4420990911756082, + "grad_norm": 0.6327417579947565, + "learning_rate": 0.00013981316770857963, + "loss": 3.226259708404541, + "step": 2460, + "token_acc": 0.27503610391436745 + }, + { + "epoch": 1.4426854294928173, + "grad_norm": 0.6324405253818951, + "learning_rate": 0.00013981267202487963, + "loss": 3.216109037399292, + "step": 2461, + "token_acc": 0.2765765363832238 + }, + { + "epoch": 1.4432717678100264, + "grad_norm": 0.48994333422123504, + "learning_rate": 0.00013981217568538368, + "loss": 3.2009785175323486, + "step": 2462, + "token_acc": 0.27727835286441477 + }, + { + "epoch": 1.4438581061272355, + "grad_norm": 0.5333057242858208, + "learning_rate": 0.00013981167869009636, + "loss": 3.2006654739379883, + "step": 2463, + "token_acc": 0.2797878475852389 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.7237844336245356, + "learning_rate": 0.00013981118103902242, + "loss": 3.227724075317383, + "step": 2464, + "token_acc": 0.2746798822152202 + }, + { + "epoch": 1.4450307827616535, + "grad_norm": 0.7140014959929148, + "learning_rate": 0.00013981068273216651, + "loss": 3.255819320678711, + "step": 2465, + "token_acc": 0.270896989447626 + }, + { + "epoch": 1.4456171210788624, + "grad_norm": 0.5742959378883638, + "learning_rate": 0.0001398101837695333, + "loss": 3.260256052017212, + "step": 2466, + "token_acc": 0.2703417139854478 + }, + { + "epoch": 1.4462034593960715, + "grad_norm": 0.38680497756062954, + "learning_rate": 0.00013980968415112748, + "loss": 3.209787607192993, + "step": 2467, + "token_acc": 0.2775748684355136 + }, + { + "epoch": 1.4467897977132806, + "grad_norm": 0.5614170195556832, + "learning_rate": 0.00013980918387695375, + "loss": 3.22489070892334, + "step": 2468, + "token_acc": 0.27501184768248743 + }, + { + "epoch": 1.4473761360304895, + "grad_norm": 0.6967380425106702, + "learning_rate": 0.0001398086829470168, + "loss": 3.277099609375, + "step": 2469, + "token_acc": 0.2697925388408224 + }, + { + "epoch": 1.4479624743476986, + "grad_norm": 0.7198215198735929, + "learning_rate": 0.00013980818136132136, + "loss": 3.225374698638916, + "step": 2470, + "token_acc": 0.2746195453586335 + }, + { + "epoch": 1.4485488126649075, + "grad_norm": 0.5950042314659139, + "learning_rate": 0.00013980767911987208, + "loss": 3.1966466903686523, + "step": 2471, + "token_acc": 0.28012465202137743 + }, + { + "epoch": 1.4491351509821166, + "grad_norm": 0.4836010137646836, + "learning_rate": 0.00013980717622267378, + "loss": 3.2653489112854004, + "step": 2472, + "token_acc": 0.270239411442565 + }, + { + "epoch": 1.4497214892993258, + "grad_norm": 0.42797447511473263, + "learning_rate": 0.0001398066726697311, + "loss": 3.1798219680786133, + "step": 2473, + "token_acc": 0.2802706571658161 + }, + { + "epoch": 1.4503078276165349, + "grad_norm": 0.6730463660294633, + "learning_rate": 0.0001398061684610488, + "loss": 3.232506275177002, + "step": 2474, + "token_acc": 0.2745652864884141 + }, + { + "epoch": 1.4508941659337438, + "grad_norm": 0.5714172431360138, + "learning_rate": 0.00013980566359663162, + "loss": 3.2706761360168457, + "step": 2475, + "token_acc": 0.2676581684554179 + }, + { + "epoch": 1.4514805042509529, + "grad_norm": 0.41679095839410296, + "learning_rate": 0.00013980515807648426, + "loss": 3.285898447036743, + "step": 2476, + "token_acc": 0.26596671094462826 + }, + { + "epoch": 1.4520668425681618, + "grad_norm": 0.390898732953776, + "learning_rate": 0.00013980465190061153, + "loss": 3.193160057067871, + "step": 2477, + "token_acc": 0.2793697476788869 + }, + { + "epoch": 1.4526531808853709, + "grad_norm": 0.4886332203034683, + "learning_rate": 0.00013980414506901815, + "loss": 3.2722482681274414, + "step": 2478, + "token_acc": 0.2697327615707075 + }, + { + "epoch": 1.45323951920258, + "grad_norm": 0.45924284494060735, + "learning_rate": 0.0001398036375817089, + "loss": 3.1913957595825195, + "step": 2479, + "token_acc": 0.2794312222454347 + }, + { + "epoch": 1.4538258575197889, + "grad_norm": 0.5781556347793593, + "learning_rate": 0.00013980312943868853, + "loss": 3.226372241973877, + "step": 2480, + "token_acc": 0.27423203416583525 + }, + { + "epoch": 1.454412195836998, + "grad_norm": 0.6914780890266232, + "learning_rate": 0.00013980262063996183, + "loss": 3.2139101028442383, + "step": 2481, + "token_acc": 0.2773929937813242 + }, + { + "epoch": 1.4549985341542069, + "grad_norm": 0.474447746757703, + "learning_rate": 0.00013980211118553356, + "loss": 3.1854515075683594, + "step": 2482, + "token_acc": 0.2811818024747146 + }, + { + "epoch": 1.455584872471416, + "grad_norm": 0.5711247404906978, + "learning_rate": 0.0001398016010754085, + "loss": 3.2113542556762695, + "step": 2483, + "token_acc": 0.2777541405097819 + }, + { + "epoch": 1.456171210788625, + "grad_norm": 0.7172604018289751, + "learning_rate": 0.00013980109030959148, + "loss": 3.21480655670166, + "step": 2484, + "token_acc": 0.2777329654001274 + }, + { + "epoch": 1.456757549105834, + "grad_norm": 0.5268505454016253, + "learning_rate": 0.00013980057888808727, + "loss": 3.1692748069763184, + "step": 2485, + "token_acc": 0.2821465620016412 + }, + { + "epoch": 1.457343887423043, + "grad_norm": 0.48488537251429903, + "learning_rate": 0.00013980006681090068, + "loss": 3.170778274536133, + "step": 2486, + "token_acc": 0.28302928009777084 + }, + { + "epoch": 1.457930225740252, + "grad_norm": 0.5139727052086006, + "learning_rate": 0.0001397995540780365, + "loss": 3.2047383785247803, + "step": 2487, + "token_acc": 0.2788228302372857 + }, + { + "epoch": 1.458516564057461, + "grad_norm": 0.5400128521648689, + "learning_rate": 0.0001397990406894996, + "loss": 3.2094039916992188, + "step": 2488, + "token_acc": 0.2788425544658232 + }, + { + "epoch": 1.4591029023746702, + "grad_norm": 0.5352485067628382, + "learning_rate": 0.00013979852664529474, + "loss": 3.2051069736480713, + "step": 2489, + "token_acc": 0.2768586238727544 + }, + { + "epoch": 1.4596892406918793, + "grad_norm": 0.6868011774118865, + "learning_rate": 0.00013979801194542678, + "loss": 3.1778130531311035, + "step": 2490, + "token_acc": 0.27994833203466146 + }, + { + "epoch": 1.4602755790090882, + "grad_norm": 0.6304069717996058, + "learning_rate": 0.00013979749658990054, + "loss": 3.184601306915283, + "step": 2491, + "token_acc": 0.2794661881798811 + }, + { + "epoch": 1.4608619173262973, + "grad_norm": 0.6630661438258401, + "learning_rate": 0.0001397969805787209, + "loss": 3.2271370887756348, + "step": 2492, + "token_acc": 0.274853347202923 + }, + { + "epoch": 1.4614482556435062, + "grad_norm": 0.6019424681475308, + "learning_rate": 0.00013979646391189268, + "loss": 3.2426910400390625, + "step": 2493, + "token_acc": 0.2742297941763723 + }, + { + "epoch": 1.4620345939607153, + "grad_norm": 0.4666156274953772, + "learning_rate": 0.00013979594658942074, + "loss": 3.1951870918273926, + "step": 2494, + "token_acc": 0.278254932354771 + }, + { + "epoch": 1.4626209322779244, + "grad_norm": 0.6528939611936597, + "learning_rate": 0.0001397954286113099, + "loss": 3.22921085357666, + "step": 2495, + "token_acc": 0.27374029739815864 + }, + { + "epoch": 1.4632072705951333, + "grad_norm": 0.5292733738515044, + "learning_rate": 0.00013979490997756506, + "loss": 3.248718738555908, + "step": 2496, + "token_acc": 0.2713083369361766 + }, + { + "epoch": 1.4637936089123424, + "grad_norm": 0.7062491992807611, + "learning_rate": 0.0001397943906881911, + "loss": 3.248420238494873, + "step": 2497, + "token_acc": 0.27059643147424234 + }, + { + "epoch": 1.4643799472295513, + "grad_norm": 0.7066871850317076, + "learning_rate": 0.00013979387074319292, + "loss": 3.2616121768951416, + "step": 2498, + "token_acc": 0.2704224685292246 + }, + { + "epoch": 1.4649662855467604, + "grad_norm": 0.7268616005454495, + "learning_rate": 0.00013979335014257532, + "loss": 3.2437503337860107, + "step": 2499, + "token_acc": 0.27294004555618245 + }, + { + "epoch": 1.4655526238639696, + "grad_norm": 0.6170402127330895, + "learning_rate": 0.00013979282888634326, + "loss": 3.2574422359466553, + "step": 2500, + "token_acc": 0.2713613598421612 + }, + { + "epoch": 1.4661389621811787, + "grad_norm": 0.5382448827022154, + "learning_rate": 0.00013979230697450164, + "loss": 3.207439661026001, + "step": 2501, + "token_acc": 0.27919188230383973 + }, + { + "epoch": 1.4667253004983876, + "grad_norm": 0.5079250397474497, + "learning_rate": 0.00013979178440705535, + "loss": 3.1744720935821533, + "step": 2502, + "token_acc": 0.2840293181344636 + }, + { + "epoch": 1.4673116388155967, + "grad_norm": 0.5535709138033333, + "learning_rate": 0.00013979126118400927, + "loss": 3.255910634994507, + "step": 2503, + "token_acc": 0.2704175567150695 + }, + { + "epoch": 1.4678979771328056, + "grad_norm": 0.5204048645281864, + "learning_rate": 0.00013979073730536833, + "loss": 3.203619956970215, + "step": 2504, + "token_acc": 0.2759190856775004 + }, + { + "epoch": 1.4684843154500147, + "grad_norm": 0.7119167447735097, + "learning_rate": 0.00013979021277113748, + "loss": 3.2156076431274414, + "step": 2505, + "token_acc": 0.2758602844715386 + }, + { + "epoch": 1.4690706537672238, + "grad_norm": 0.8657641734171144, + "learning_rate": 0.0001397896875813216, + "loss": 3.2258713245391846, + "step": 2506, + "token_acc": 0.27484766822870343 + }, + { + "epoch": 1.4696569920844327, + "grad_norm": 0.5779194822953634, + "learning_rate": 0.00013978916173592565, + "loss": 3.221388816833496, + "step": 2507, + "token_acc": 0.27556908924426654 + }, + { + "epoch": 1.4702433304016418, + "grad_norm": 0.49016828665022155, + "learning_rate": 0.00013978863523495457, + "loss": 3.2357470989227295, + "step": 2508, + "token_acc": 0.27485042927195685 + }, + { + "epoch": 1.4708296687188507, + "grad_norm": 0.7461309520005489, + "learning_rate": 0.00013978810807841334, + "loss": 3.213592529296875, + "step": 2509, + "token_acc": 0.2779036452153582 + }, + { + "epoch": 1.4714160070360598, + "grad_norm": 0.7562593173002234, + "learning_rate": 0.00013978758026630681, + "loss": 3.1985175609588623, + "step": 2510, + "token_acc": 0.2805806469933731 + }, + { + "epoch": 1.472002345353269, + "grad_norm": 0.6376132801211046, + "learning_rate": 0.00013978705179864005, + "loss": 3.215303421020508, + "step": 2511, + "token_acc": 0.2771642043717421 + }, + { + "epoch": 1.4725886836704778, + "grad_norm": 0.5532629199214752, + "learning_rate": 0.00013978652267541798, + "loss": 3.2638301849365234, + "step": 2512, + "token_acc": 0.2705271676658127 + }, + { + "epoch": 1.473175021987687, + "grad_norm": 0.5069247076286572, + "learning_rate": 0.00013978599289664553, + "loss": 3.2155191898345947, + "step": 2513, + "token_acc": 0.27790124674812156 + }, + { + "epoch": 1.4737613603048958, + "grad_norm": 0.5990413151469272, + "learning_rate": 0.00013978546246232773, + "loss": 3.2099194526672363, + "step": 2514, + "token_acc": 0.2781155368303658 + }, + { + "epoch": 1.474347698622105, + "grad_norm": 0.6019848848978954, + "learning_rate": 0.00013978493137246957, + "loss": 3.219420909881592, + "step": 2515, + "token_acc": 0.2762705175304535 + }, + { + "epoch": 1.474934036939314, + "grad_norm": 0.7194820460351192, + "learning_rate": 0.000139784399627076, + "loss": 3.2263007164001465, + "step": 2516, + "token_acc": 0.27486264065836513 + }, + { + "epoch": 1.4755203752565231, + "grad_norm": 0.5047145803987207, + "learning_rate": 0.00013978386722615204, + "loss": 3.2403948307037354, + "step": 2517, + "token_acc": 0.27288338678772667 + }, + { + "epoch": 1.476106713573732, + "grad_norm": 0.6764947504908534, + "learning_rate": 0.00013978333416970266, + "loss": 3.227323055267334, + "step": 2518, + "token_acc": 0.2747396129658709 + }, + { + "epoch": 1.4766930518909411, + "grad_norm": 0.7695876560517256, + "learning_rate": 0.00013978280045773292, + "loss": 3.1751251220703125, + "step": 2519, + "token_acc": 0.281470214288288 + }, + { + "epoch": 1.47727939020815, + "grad_norm": 0.8037666210516372, + "learning_rate": 0.0001397822660902478, + "loss": 3.236229181289673, + "step": 2520, + "token_acc": 0.27324519832958427 + }, + { + "epoch": 1.4778657285253591, + "grad_norm": 0.6063796890613761, + "learning_rate": 0.0001397817310672523, + "loss": 3.237603187561035, + "step": 2521, + "token_acc": 0.2726031982438979 + }, + { + "epoch": 1.4784520668425682, + "grad_norm": 0.675771075930974, + "learning_rate": 0.0001397811953887515, + "loss": 3.1562178134918213, + "step": 2522, + "token_acc": 0.28379850415656843 + }, + { + "epoch": 1.4790384051597771, + "grad_norm": 0.5892495820450392, + "learning_rate": 0.00013978065905475036, + "loss": 3.2719407081604004, + "step": 2523, + "token_acc": 0.2670912962455904 + }, + { + "epoch": 1.4796247434769862, + "grad_norm": 0.6090631597146475, + "learning_rate": 0.00013978012206525398, + "loss": 3.2301523685455322, + "step": 2524, + "token_acc": 0.27433591901401205 + }, + { + "epoch": 1.4802110817941951, + "grad_norm": 0.5986708011967483, + "learning_rate": 0.00013977958442026737, + "loss": 3.1978931427001953, + "step": 2525, + "token_acc": 0.2784000125317209 + }, + { + "epoch": 1.4807974201114043, + "grad_norm": 0.5931181978951686, + "learning_rate": 0.00013977904611979562, + "loss": 3.2427494525909424, + "step": 2526, + "token_acc": 0.2746901311747913 + }, + { + "epoch": 1.4813837584286134, + "grad_norm": 0.5895789615773188, + "learning_rate": 0.00013977850716384373, + "loss": 3.2395873069763184, + "step": 2527, + "token_acc": 0.2732250815900701 + }, + { + "epoch": 1.4819700967458225, + "grad_norm": 0.573596649649976, + "learning_rate": 0.00013977796755241682, + "loss": 3.2323994636535645, + "step": 2528, + "token_acc": 0.2738754014129737 + }, + { + "epoch": 1.4825564350630314, + "grad_norm": 0.5647731156625904, + "learning_rate": 0.00013977742728551993, + "loss": 3.2399463653564453, + "step": 2529, + "token_acc": 0.2724099852494297 + }, + { + "epoch": 1.4831427733802405, + "grad_norm": 0.5331510568883826, + "learning_rate": 0.00013977688636315812, + "loss": 3.203166961669922, + "step": 2530, + "token_acc": 0.27739621626468214 + }, + { + "epoch": 1.4837291116974494, + "grad_norm": 0.494995542538171, + "learning_rate": 0.00013977634478533647, + "loss": 3.221698760986328, + "step": 2531, + "token_acc": 0.27680370584829184 + }, + { + "epoch": 1.4843154500146585, + "grad_norm": 0.4839130654360426, + "learning_rate": 0.00013977580255206012, + "loss": 3.2197742462158203, + "step": 2532, + "token_acc": 0.27495191801540164 + }, + { + "epoch": 1.4849017883318676, + "grad_norm": 0.5315838931611194, + "learning_rate": 0.00013977525966333412, + "loss": 3.2125704288482666, + "step": 2533, + "token_acc": 0.27576937832351367 + }, + { + "epoch": 1.4854881266490765, + "grad_norm": 0.5340332320376607, + "learning_rate": 0.0001397747161191636, + "loss": 3.2286314964294434, + "step": 2534, + "token_acc": 0.275389078711613 + }, + { + "epoch": 1.4860744649662856, + "grad_norm": 0.5202661657795244, + "learning_rate": 0.0001397741719195536, + "loss": 3.2081031799316406, + "step": 2535, + "token_acc": 0.2769927679393088 + }, + { + "epoch": 1.4866608032834945, + "grad_norm": 0.47925131304338453, + "learning_rate": 0.0001397736270645093, + "loss": 3.240163564682007, + "step": 2536, + "token_acc": 0.2730591898646211 + }, + { + "epoch": 1.4872471416007036, + "grad_norm": 0.45236753685903575, + "learning_rate": 0.00013977308155403581, + "loss": 3.2147579193115234, + "step": 2537, + "token_acc": 0.2794105957106291 + }, + { + "epoch": 1.4878334799179127, + "grad_norm": 0.4634027635638455, + "learning_rate": 0.0001397725353881382, + "loss": 3.2071025371551514, + "step": 2538, + "token_acc": 0.2786465723040396 + }, + { + "epoch": 1.4884198182351216, + "grad_norm": 0.4087393809612668, + "learning_rate": 0.00013977198856682168, + "loss": 3.203801393508911, + "step": 2539, + "token_acc": 0.27953028281715436 + }, + { + "epoch": 1.4890061565523307, + "grad_norm": 0.46582580326654033, + "learning_rate": 0.00013977144109009133, + "loss": 3.2602481842041016, + "step": 2540, + "token_acc": 0.2701335676350733 + }, + { + "epoch": 1.4895924948695396, + "grad_norm": 0.4712333528120587, + "learning_rate": 0.00013977089295795232, + "loss": 3.245431900024414, + "step": 2541, + "token_acc": 0.27227248426495315 + }, + { + "epoch": 1.4901788331867487, + "grad_norm": 0.44486816320629674, + "learning_rate": 0.00013977034417040975, + "loss": 3.196918487548828, + "step": 2542, + "token_acc": 0.28048146319427664 + }, + { + "epoch": 1.4907651715039578, + "grad_norm": 0.4360632167334556, + "learning_rate": 0.00013976979472746885, + "loss": 3.240436553955078, + "step": 2543, + "token_acc": 0.27336708743581106 + }, + { + "epoch": 1.491351509821167, + "grad_norm": 0.43872272178873406, + "learning_rate": 0.00013976924462913475, + "loss": 3.211341381072998, + "step": 2544, + "token_acc": 0.27600242712757783 + }, + { + "epoch": 1.4919378481383758, + "grad_norm": 0.4862351547669937, + "learning_rate": 0.0001397686938754126, + "loss": 3.1960270404815674, + "step": 2545, + "token_acc": 0.2789416641829463 + }, + { + "epoch": 1.492524186455585, + "grad_norm": 0.5103539037336897, + "learning_rate": 0.00013976814246630757, + "loss": 3.250208854675293, + "step": 2546, + "token_acc": 0.27065510266342957 + }, + { + "epoch": 1.4931105247727938, + "grad_norm": 0.5342999525575971, + "learning_rate": 0.00013976759040182487, + "loss": 3.2290921211242676, + "step": 2547, + "token_acc": 0.2732762067827268 + }, + { + "epoch": 1.493696863090003, + "grad_norm": 0.5803633655434483, + "learning_rate": 0.00013976703768196966, + "loss": 3.2352733612060547, + "step": 2548, + "token_acc": 0.2729484084583218 + }, + { + "epoch": 1.494283201407212, + "grad_norm": 0.703839718059476, + "learning_rate": 0.00013976648430674718, + "loss": 3.2148866653442383, + "step": 2549, + "token_acc": 0.27710088606452504 + }, + { + "epoch": 1.494869539724421, + "grad_norm": 0.8732420956696452, + "learning_rate": 0.00013976593027616255, + "loss": 3.207047462463379, + "step": 2550, + "token_acc": 0.273959138177811 + }, + { + "epoch": 1.49545587804163, + "grad_norm": 0.9287503036010925, + "learning_rate": 0.00013976537559022103, + "loss": 3.1876115798950195, + "step": 2551, + "token_acc": 0.279940076481333 + }, + { + "epoch": 1.496042216358839, + "grad_norm": 0.9104196927077189, + "learning_rate": 0.00013976482024892782, + "loss": 3.2507317066192627, + "step": 2552, + "token_acc": 0.2715939221533291 + }, + { + "epoch": 1.496628554676048, + "grad_norm": 0.9164196052930418, + "learning_rate": 0.00013976426425228814, + "loss": 3.1873207092285156, + "step": 2553, + "token_acc": 0.28021149752672125 + }, + { + "epoch": 1.4972148929932572, + "grad_norm": 0.9170878486243227, + "learning_rate": 0.0001397637076003072, + "loss": 3.1836254596710205, + "step": 2554, + "token_acc": 0.28118229017619883 + }, + { + "epoch": 1.4978012313104663, + "grad_norm": 0.6773100640272034, + "learning_rate": 0.0001397631502929902, + "loss": 3.2218804359436035, + "step": 2555, + "token_acc": 0.2756324551055184 + }, + { + "epoch": 1.4983875696276752, + "grad_norm": 0.6590707684674438, + "learning_rate": 0.00013976259233034244, + "loss": 3.213777542114258, + "step": 2556, + "token_acc": 0.2779460464142201 + }, + { + "epoch": 1.4989739079448843, + "grad_norm": 0.6303619828252052, + "learning_rate": 0.00013976203371236917, + "loss": 3.264509439468384, + "step": 2557, + "token_acc": 0.27112036598160505 + }, + { + "epoch": 1.4995602462620932, + "grad_norm": 0.6791792897465988, + "learning_rate": 0.00013976147443907556, + "loss": 3.222604751586914, + "step": 2558, + "token_acc": 0.27526599575244787 + }, + { + "epoch": 1.5001465845793023, + "grad_norm": 0.5363781061411426, + "learning_rate": 0.00013976091451046687, + "loss": 3.199427604675293, + "step": 2559, + "token_acc": 0.2784718699000058 + }, + { + "epoch": 1.5007329228965114, + "grad_norm": 0.4913536477762996, + "learning_rate": 0.00013976035392654842, + "loss": 3.2020766735076904, + "step": 2560, + "token_acc": 0.278448969902151 + }, + { + "epoch": 1.5013192612137203, + "grad_norm": 0.7390118155711631, + "learning_rate": 0.0001397597926873255, + "loss": 3.1984505653381348, + "step": 2561, + "token_acc": 0.2784253175764316 + }, + { + "epoch": 1.5019055995309294, + "grad_norm": 0.5557912274998127, + "learning_rate": 0.00013975923079280326, + "loss": 3.2683253288269043, + "step": 2562, + "token_acc": 0.26881237170844274 + }, + { + "epoch": 1.5024919378481383, + "grad_norm": 0.718210430407268, + "learning_rate": 0.00013975866824298707, + "loss": 3.190762519836426, + "step": 2563, + "token_acc": 0.27946874983875714 + }, + { + "epoch": 1.5030782761653474, + "grad_norm": 0.8341838804040289, + "learning_rate": 0.00013975810503788217, + "loss": 3.218371629714966, + "step": 2564, + "token_acc": 0.277456997955713 + }, + { + "epoch": 1.5036646144825565, + "grad_norm": 0.6672144367073933, + "learning_rate": 0.00013975754117749391, + "loss": 3.208378791809082, + "step": 2565, + "token_acc": 0.27657754578900534 + }, + { + "epoch": 1.5042509527997656, + "grad_norm": 0.7602384122378035, + "learning_rate": 0.00013975697666182752, + "loss": 3.20758056640625, + "step": 2566, + "token_acc": 0.27617974237344867 + }, + { + "epoch": 1.5048372911169745, + "grad_norm": 0.5737244827267608, + "learning_rate": 0.00013975641149088837, + "loss": 3.2075133323669434, + "step": 2567, + "token_acc": 0.2775379136120207 + }, + { + "epoch": 1.5054236294341834, + "grad_norm": 0.5355928937599357, + "learning_rate": 0.0001397558456646817, + "loss": 3.2195966243743896, + "step": 2568, + "token_acc": 0.27568037934774386 + }, + { + "epoch": 1.5060099677513925, + "grad_norm": 0.5474300730251878, + "learning_rate": 0.00013975527918321288, + "loss": 3.2337286472320557, + "step": 2569, + "token_acc": 0.2724674299249901 + }, + { + "epoch": 1.5065963060686016, + "grad_norm": 0.5552195678643156, + "learning_rate": 0.0001397547120464872, + "loss": 3.1930017471313477, + "step": 2570, + "token_acc": 0.2784622825996505 + }, + { + "epoch": 1.5071826443858107, + "grad_norm": 0.5622014804370437, + "learning_rate": 0.00013975414425451, + "loss": 3.231153964996338, + "step": 2571, + "token_acc": 0.27402338180781294 + }, + { + "epoch": 1.5077689827030196, + "grad_norm": 0.5328862852165819, + "learning_rate": 0.0001397535758072866, + "loss": 3.2259230613708496, + "step": 2572, + "token_acc": 0.27617590723862656 + }, + { + "epoch": 1.5083553210202285, + "grad_norm": 0.5921937301181955, + "learning_rate": 0.00013975300670482235, + "loss": 3.2031431198120117, + "step": 2573, + "token_acc": 0.2786465179075032 + }, + { + "epoch": 1.5089416593374376, + "grad_norm": 0.5282550134232902, + "learning_rate": 0.0001397524369471226, + "loss": 3.2419960498809814, + "step": 2574, + "token_acc": 0.27262604849312555 + }, + { + "epoch": 1.5095279976546467, + "grad_norm": 0.4408835654341376, + "learning_rate": 0.0001397518665341927, + "loss": 3.238924026489258, + "step": 2575, + "token_acc": 0.2741587826383922 + }, + { + "epoch": 1.5101143359718558, + "grad_norm": 0.5485542825061888, + "learning_rate": 0.000139751295466038, + "loss": 3.20003604888916, + "step": 2576, + "token_acc": 0.2815849854340148 + }, + { + "epoch": 1.5107006742890647, + "grad_norm": 0.5138402363938792, + "learning_rate": 0.00013975072374266387, + "loss": 3.20914888381958, + "step": 2577, + "token_acc": 0.2778364342690868 + }, + { + "epoch": 1.5112870126062738, + "grad_norm": 0.4831710307680923, + "learning_rate": 0.0001397501513640757, + "loss": 3.266451597213745, + "step": 2578, + "token_acc": 0.26808576318183786 + }, + { + "epoch": 1.5118733509234827, + "grad_norm": 0.4123326723912222, + "learning_rate": 0.00013974957833027881, + "loss": 3.2247297763824463, + "step": 2579, + "token_acc": 0.27471056052011494 + }, + { + "epoch": 1.5124596892406919, + "grad_norm": 0.3966383392726758, + "learning_rate": 0.00013974900464127865, + "loss": 3.2108302116394043, + "step": 2580, + "token_acc": 0.2760295129636624 + }, + { + "epoch": 1.513046027557901, + "grad_norm": 0.4637874302659689, + "learning_rate": 0.00013974843029708058, + "loss": 3.260533094406128, + "step": 2581, + "token_acc": 0.2710347431013111 + }, + { + "epoch": 1.51363236587511, + "grad_norm": 0.5033068607067129, + "learning_rate": 0.00013974785529768997, + "loss": 3.264974594116211, + "step": 2582, + "token_acc": 0.26873702483826467 + }, + { + "epoch": 1.514218704192319, + "grad_norm": 0.5479386848991177, + "learning_rate": 0.00013974727964311226, + "loss": 3.247617721557617, + "step": 2583, + "token_acc": 0.27269674182833337 + }, + { + "epoch": 1.5148050425095279, + "grad_norm": 0.5511319581522282, + "learning_rate": 0.00013974670333335285, + "loss": 3.2106881141662598, + "step": 2584, + "token_acc": 0.2779150413512032 + }, + { + "epoch": 1.515391380826737, + "grad_norm": 0.42797504203201625, + "learning_rate": 0.00013974612636841714, + "loss": 3.2237043380737305, + "step": 2585, + "token_acc": 0.2761071667146331 + }, + { + "epoch": 1.515977719143946, + "grad_norm": 0.5094174097076807, + "learning_rate": 0.00013974554874831053, + "loss": 3.242367744445801, + "step": 2586, + "token_acc": 0.27059423683834916 + }, + { + "epoch": 1.5165640574611552, + "grad_norm": 0.475937285256912, + "learning_rate": 0.00013974497047303851, + "loss": 3.202700614929199, + "step": 2587, + "token_acc": 0.2780571784896095 + }, + { + "epoch": 1.517150395778364, + "grad_norm": 0.49121387181486104, + "learning_rate": 0.00013974439154260647, + "loss": 3.2149429321289062, + "step": 2588, + "token_acc": 0.27425128193927745 + }, + { + "epoch": 1.5177367340955732, + "grad_norm": 0.5629072001881763, + "learning_rate": 0.00013974381195701986, + "loss": 3.2113990783691406, + "step": 2589, + "token_acc": 0.27589473821955274 + }, + { + "epoch": 1.518323072412782, + "grad_norm": 0.5148768215410927, + "learning_rate": 0.00013974323171628408, + "loss": 3.206923007965088, + "step": 2590, + "token_acc": 0.2772624850729868 + }, + { + "epoch": 1.5189094107299912, + "grad_norm": 0.3961598795941667, + "learning_rate": 0.00013974265082040467, + "loss": 3.2035489082336426, + "step": 2591, + "token_acc": 0.27713900622708143 + }, + { + "epoch": 1.5194957490472003, + "grad_norm": 0.49056788178072985, + "learning_rate": 0.000139742069269387, + "loss": 3.22664213180542, + "step": 2592, + "token_acc": 0.27438000737030316 + }, + { + "epoch": 1.5200820873644094, + "grad_norm": 0.4650880503826025, + "learning_rate": 0.0001397414870632366, + "loss": 3.23069167137146, + "step": 2593, + "token_acc": 0.27507264060221903 + }, + { + "epoch": 1.5206684256816183, + "grad_norm": 0.5172017760352943, + "learning_rate": 0.00013974090420195887, + "loss": 3.2184481620788574, + "step": 2594, + "token_acc": 0.2763294718598924 + }, + { + "epoch": 1.5212547639988272, + "grad_norm": 0.48141327061611217, + "learning_rate": 0.00013974032068555934, + "loss": 3.244694232940674, + "step": 2595, + "token_acc": 0.2756314080179723 + }, + { + "epoch": 1.5218411023160363, + "grad_norm": 0.6805649871045893, + "learning_rate": 0.00013973973651404346, + "loss": 3.168689250946045, + "step": 2596, + "token_acc": 0.28162553431728365 + }, + { + "epoch": 1.5224274406332454, + "grad_norm": 0.7452226258591964, + "learning_rate": 0.00013973915168741675, + "loss": 3.232879877090454, + "step": 2597, + "token_acc": 0.27488484470178004 + }, + { + "epoch": 1.5230137789504545, + "grad_norm": 0.7406056835949393, + "learning_rate": 0.00013973856620568467, + "loss": 3.210618495941162, + "step": 2598, + "token_acc": 0.2747979555941398 + }, + { + "epoch": 1.5236001172676634, + "grad_norm": 0.802318184778563, + "learning_rate": 0.00013973798006885276, + "loss": 3.20450496673584, + "step": 2599, + "token_acc": 0.27701186277653095 + }, + { + "epoch": 1.5241864555848723, + "grad_norm": 0.7665844554070165, + "learning_rate": 0.00013973739327692645, + "loss": 3.254924774169922, + "step": 2600, + "token_acc": 0.26971113422559795 + }, + { + "epoch": 1.5247727939020814, + "grad_norm": 0.5736590224490088, + "learning_rate": 0.00013973680582991135, + "loss": 3.1640803813934326, + "step": 2601, + "token_acc": 0.28265301662634557 + }, + { + "epoch": 1.5253591322192905, + "grad_norm": 0.6473824667132135, + "learning_rate": 0.0001397362177278129, + "loss": 3.167708396911621, + "step": 2602, + "token_acc": 0.2832496297979363 + }, + { + "epoch": 1.5259454705364996, + "grad_norm": 0.6092904724266451, + "learning_rate": 0.00013973562897063666, + "loss": 3.194204568862915, + "step": 2603, + "token_acc": 0.27825271196966295 + }, + { + "epoch": 1.5265318088537085, + "grad_norm": 0.48113946267623403, + "learning_rate": 0.00013973503955838816, + "loss": 3.2055134773254395, + "step": 2604, + "token_acc": 0.2767244471010161 + }, + { + "epoch": 1.5271181471709177, + "grad_norm": 0.5905113277618578, + "learning_rate": 0.00013973444949107294, + "loss": 3.213676929473877, + "step": 2605, + "token_acc": 0.27729837995156503 + }, + { + "epoch": 1.5277044854881265, + "grad_norm": 0.5243213524984546, + "learning_rate": 0.00013973385876869655, + "loss": 3.2026448249816895, + "step": 2606, + "token_acc": 0.27694738281003556 + }, + { + "epoch": 1.5282908238053357, + "grad_norm": 0.5018075340712047, + "learning_rate": 0.0001397332673912645, + "loss": 3.2417831420898438, + "step": 2607, + "token_acc": 0.27177275534676787 + }, + { + "epoch": 1.5288771621225448, + "grad_norm": 0.5453507701816291, + "learning_rate": 0.00013973267535878238, + "loss": 3.208800792694092, + "step": 2608, + "token_acc": 0.2770779104389769 + }, + { + "epoch": 1.5294635004397539, + "grad_norm": 0.5521765681122655, + "learning_rate": 0.00013973208267125572, + "loss": 3.174708604812622, + "step": 2609, + "token_acc": 0.28094055611760016 + }, + { + "epoch": 1.5300498387569628, + "grad_norm": 0.6253174657455257, + "learning_rate": 0.00013973148932869015, + "loss": 3.2505578994750977, + "step": 2610, + "token_acc": 0.27095122784061143 + }, + { + "epoch": 1.5306361770741717, + "grad_norm": 0.5013950607113463, + "learning_rate": 0.00013973089533109116, + "loss": 3.2260923385620117, + "step": 2611, + "token_acc": 0.2761453060970745 + }, + { + "epoch": 1.5312225153913808, + "grad_norm": 0.616590712216024, + "learning_rate": 0.00013973030067846438, + "loss": 3.2257394790649414, + "step": 2612, + "token_acc": 0.2743145766431603 + }, + { + "epoch": 1.5318088537085899, + "grad_norm": 0.7497297528940498, + "learning_rate": 0.00013972970537081542, + "loss": 3.167748212814331, + "step": 2613, + "token_acc": 0.2811995696611081 + }, + { + "epoch": 1.532395192025799, + "grad_norm": 0.6103722596840169, + "learning_rate": 0.0001397291094081498, + "loss": 3.2516868114471436, + "step": 2614, + "token_acc": 0.2732351499046027 + }, + { + "epoch": 1.5329815303430079, + "grad_norm": 0.6428805358346728, + "learning_rate": 0.00013972851279047318, + "loss": 3.1654186248779297, + "step": 2615, + "token_acc": 0.28194262338071013 + }, + { + "epoch": 1.533567868660217, + "grad_norm": 0.6544186066295447, + "learning_rate": 0.00013972791551779113, + "loss": 3.1971282958984375, + "step": 2616, + "token_acc": 0.27876981489261005 + }, + { + "epoch": 1.5341542069774259, + "grad_norm": 0.4607028201270057, + "learning_rate": 0.00013972731759010927, + "loss": 3.247673988342285, + "step": 2617, + "token_acc": 0.27348627903902134 + }, + { + "epoch": 1.534740545294635, + "grad_norm": 0.5504314125793278, + "learning_rate": 0.00013972671900743325, + "loss": 3.2553300857543945, + "step": 2618, + "token_acc": 0.271510496671787 + }, + { + "epoch": 1.535326883611844, + "grad_norm": 0.5726110299754713, + "learning_rate": 0.00013972611976976866, + "loss": 3.2078166007995605, + "step": 2619, + "token_acc": 0.27762963040349775 + }, + { + "epoch": 1.5359132219290532, + "grad_norm": 0.48802102804155106, + "learning_rate": 0.0001397255198771211, + "loss": 3.193302631378174, + "step": 2620, + "token_acc": 0.27857141038599315 + }, + { + "epoch": 1.536499560246262, + "grad_norm": 0.5389231663669414, + "learning_rate": 0.00013972491932949627, + "loss": 3.2187061309814453, + "step": 2621, + "token_acc": 0.2755332529401849 + }, + { + "epoch": 1.537085898563471, + "grad_norm": 0.48219155120755386, + "learning_rate": 0.00013972431812689975, + "loss": 3.209031581878662, + "step": 2622, + "token_acc": 0.27597583041702933 + }, + { + "epoch": 1.53767223688068, + "grad_norm": 0.5451966788804595, + "learning_rate": 0.0001397237162693372, + "loss": 3.236480712890625, + "step": 2623, + "token_acc": 0.274015594687919 + }, + { + "epoch": 1.5382585751978892, + "grad_norm": 0.5544783305451163, + "learning_rate": 0.00013972311375681434, + "loss": 3.2720773220062256, + "step": 2624, + "token_acc": 0.2698395964345392 + }, + { + "epoch": 1.5388449135150983, + "grad_norm": 0.5261476409920377, + "learning_rate": 0.00013972251058933676, + "loss": 3.211249828338623, + "step": 2625, + "token_acc": 0.27527843642566313 + }, + { + "epoch": 1.5394312518323072, + "grad_norm": 0.5044610104357428, + "learning_rate": 0.00013972190676691016, + "loss": 3.2361197471618652, + "step": 2626, + "token_acc": 0.27251663804353427 + }, + { + "epoch": 1.5400175901495161, + "grad_norm": 0.48800613665888454, + "learning_rate": 0.00013972130228954017, + "loss": 3.193969488143921, + "step": 2627, + "token_acc": 0.27747698770618395 + }, + { + "epoch": 1.5406039284667252, + "grad_norm": 0.5805291189691962, + "learning_rate": 0.0001397206971572325, + "loss": 3.1746785640716553, + "step": 2628, + "token_acc": 0.28094163662317473 + }, + { + "epoch": 1.5411902667839343, + "grad_norm": 0.6026930720886005, + "learning_rate": 0.00013972009136999284, + "loss": 3.2027015686035156, + "step": 2629, + "token_acc": 0.2782227337884141 + }, + { + "epoch": 1.5417766051011434, + "grad_norm": 0.4738573533417063, + "learning_rate": 0.00013971948492782685, + "loss": 3.1654419898986816, + "step": 2630, + "token_acc": 0.2825279422787062 + }, + { + "epoch": 1.5423629434183523, + "grad_norm": 0.5022218588875001, + "learning_rate": 0.00013971887783074027, + "loss": 3.1926217079162598, + "step": 2631, + "token_acc": 0.27997435518595126 + }, + { + "epoch": 1.5429492817355615, + "grad_norm": 0.5382075467289994, + "learning_rate": 0.00013971827007873877, + "loss": 3.161971092224121, + "step": 2632, + "token_acc": 0.28338649356472473 + }, + { + "epoch": 1.5435356200527703, + "grad_norm": 0.5324507414149843, + "learning_rate": 0.00013971766167182806, + "loss": 3.1841793060302734, + "step": 2633, + "token_acc": 0.28069247660515734 + }, + { + "epoch": 1.5441219583699795, + "grad_norm": 0.5934542366965995, + "learning_rate": 0.00013971705261001387, + "loss": 3.2289042472839355, + "step": 2634, + "token_acc": 0.27506286156849097 + }, + { + "epoch": 1.5447082966871886, + "grad_norm": 0.5736611516335997, + "learning_rate": 0.00013971644289330194, + "loss": 3.2282755374908447, + "step": 2635, + "token_acc": 0.27538602327768397 + }, + { + "epoch": 1.5452946350043977, + "grad_norm": 0.5263259124731735, + "learning_rate": 0.00013971583252169796, + "loss": 3.1658763885498047, + "step": 2636, + "token_acc": 0.28384548932494136 + }, + { + "epoch": 1.5458809733216066, + "grad_norm": 0.5208041852420772, + "learning_rate": 0.00013971522149520767, + "loss": 3.219151020050049, + "step": 2637, + "token_acc": 0.2759978507630051 + }, + { + "epoch": 1.5464673116388155, + "grad_norm": 0.5038314288809179, + "learning_rate": 0.0001397146098138368, + "loss": 3.238635778427124, + "step": 2638, + "token_acc": 0.27330765952745467 + }, + { + "epoch": 1.5470536499560246, + "grad_norm": 0.5125176924383831, + "learning_rate": 0.00013971399747759113, + "loss": 3.188901424407959, + "step": 2639, + "token_acc": 0.2791226151112927 + }, + { + "epoch": 1.5476399882732337, + "grad_norm": 0.5283162790329297, + "learning_rate": 0.0001397133844864764, + "loss": 3.195950508117676, + "step": 2640, + "token_acc": 0.27902536666261757 + }, + { + "epoch": 1.5482263265904428, + "grad_norm": 0.6558315475509426, + "learning_rate": 0.00013971277084049837, + "loss": 3.274303913116455, + "step": 2641, + "token_acc": 0.2694335294941482 + }, + { + "epoch": 1.5488126649076517, + "grad_norm": 0.7085779800656861, + "learning_rate": 0.00013971215653966278, + "loss": 3.1904537677764893, + "step": 2642, + "token_acc": 0.280076866126644 + }, + { + "epoch": 1.5493990032248608, + "grad_norm": 0.6524284319210493, + "learning_rate": 0.0001397115415839754, + "loss": 3.215273857116699, + "step": 2643, + "token_acc": 0.276452617355032 + }, + { + "epoch": 1.5499853415420697, + "grad_norm": 0.6589293839963091, + "learning_rate": 0.00013971092597344208, + "loss": 3.208240032196045, + "step": 2644, + "token_acc": 0.2785690608879405 + }, + { + "epoch": 1.5505716798592788, + "grad_norm": 0.5590672785062665, + "learning_rate": 0.00013971030970806852, + "loss": 3.189129114151001, + "step": 2645, + "token_acc": 0.2780788712885639 + }, + { + "epoch": 1.551158018176488, + "grad_norm": 0.45446240869728055, + "learning_rate": 0.0001397096927878605, + "loss": 3.2364468574523926, + "step": 2646, + "token_acc": 0.2731970589768994 + }, + { + "epoch": 1.551744356493697, + "grad_norm": 0.4913708214508635, + "learning_rate": 0.0001397090752128239, + "loss": 3.19134259223938, + "step": 2647, + "token_acc": 0.27929070809222734 + }, + { + "epoch": 1.552330694810906, + "grad_norm": 0.5200967765371806, + "learning_rate": 0.00013970845698296443, + "loss": 3.1862666606903076, + "step": 2648, + "token_acc": 0.2810164661596496 + }, + { + "epoch": 1.5529170331281148, + "grad_norm": 0.46779491552111524, + "learning_rate": 0.000139707838098288, + "loss": 3.2281904220581055, + "step": 2649, + "token_acc": 0.2748163999805457 + }, + { + "epoch": 1.553503371445324, + "grad_norm": 0.49990038498996026, + "learning_rate": 0.0001397072185588003, + "loss": 3.2237701416015625, + "step": 2650, + "token_acc": 0.2756789928548511 + }, + { + "epoch": 1.554089709762533, + "grad_norm": 0.6189599194610861, + "learning_rate": 0.00013970659836450724, + "loss": 3.256709098815918, + "step": 2651, + "token_acc": 0.2712075950403077 + }, + { + "epoch": 1.5546760480797421, + "grad_norm": 0.5301405773276009, + "learning_rate": 0.00013970597751541462, + "loss": 3.193275213241577, + "step": 2652, + "token_acc": 0.27916180505187477 + }, + { + "epoch": 1.555262386396951, + "grad_norm": 0.5801232337212008, + "learning_rate": 0.0001397053560115283, + "loss": 3.2288732528686523, + "step": 2653, + "token_acc": 0.27589025498532344 + }, + { + "epoch": 1.55584872471416, + "grad_norm": 0.445167932547961, + "learning_rate": 0.00013970473385285404, + "loss": 3.2229137420654297, + "step": 2654, + "token_acc": 0.2759732479938658 + }, + { + "epoch": 1.556435063031369, + "grad_norm": 0.5181136068088518, + "learning_rate": 0.00013970411103939775, + "loss": 3.2555737495422363, + "step": 2655, + "token_acc": 0.27262459638741404 + }, + { + "epoch": 1.5570214013485781, + "grad_norm": 0.5451827853320111, + "learning_rate": 0.00013970348757116527, + "loss": 3.234294891357422, + "step": 2656, + "token_acc": 0.2733629562846809 + }, + { + "epoch": 1.5576077396657872, + "grad_norm": 0.4737621942648672, + "learning_rate": 0.00013970286344816245, + "loss": 3.2374162673950195, + "step": 2657, + "token_acc": 0.2737173777204621 + }, + { + "epoch": 1.5581940779829961, + "grad_norm": 0.5586768672720697, + "learning_rate": 0.00013970223867039517, + "loss": 3.209918737411499, + "step": 2658, + "token_acc": 0.2752484803679974 + }, + { + "epoch": 1.5587804163002053, + "grad_norm": 0.5367479916347963, + "learning_rate": 0.00013970161323786925, + "loss": 3.198747158050537, + "step": 2659, + "token_acc": 0.27792302879085673 + }, + { + "epoch": 1.5593667546174141, + "grad_norm": 0.47471411690306475, + "learning_rate": 0.0001397009871505906, + "loss": 3.206704616546631, + "step": 2660, + "token_acc": 0.2767899079009169 + }, + { + "epoch": 1.5599530929346233, + "grad_norm": 0.42285735979847067, + "learning_rate": 0.00013970036040856511, + "loss": 3.2120614051818848, + "step": 2661, + "token_acc": 0.27517074017191856 + }, + { + "epoch": 1.5605394312518324, + "grad_norm": 0.45244278774427144, + "learning_rate": 0.00013969973301179868, + "loss": 3.189943790435791, + "step": 2662, + "token_acc": 0.2794773841847151 + }, + { + "epoch": 1.5611257695690415, + "grad_norm": 0.5391173159679198, + "learning_rate": 0.00013969910496029715, + "loss": 3.17856502532959, + "step": 2663, + "token_acc": 0.2789332906470585 + }, + { + "epoch": 1.5617121078862504, + "grad_norm": 0.46566225921400406, + "learning_rate": 0.00013969847625406646, + "loss": 3.1868972778320312, + "step": 2664, + "token_acc": 0.2787448227234567 + }, + { + "epoch": 1.5622984462034593, + "grad_norm": 0.46469903409964847, + "learning_rate": 0.0001396978468931125, + "loss": 3.2437336444854736, + "step": 2665, + "token_acc": 0.2732312344994297 + }, + { + "epoch": 1.5628847845206684, + "grad_norm": 0.533421489289303, + "learning_rate": 0.0001396972168774412, + "loss": 3.1971049308776855, + "step": 2666, + "token_acc": 0.2775082155811092 + }, + { + "epoch": 1.5634711228378775, + "grad_norm": 0.5469994761503765, + "learning_rate": 0.00013969658620705845, + "loss": 3.2324881553649902, + "step": 2667, + "token_acc": 0.27489355727927817 + }, + { + "epoch": 1.5640574611550866, + "grad_norm": 0.621228929752348, + "learning_rate": 0.00013969595488197022, + "loss": 3.284426689147949, + "step": 2668, + "token_acc": 0.2679566467090722 + }, + { + "epoch": 1.5646437994722955, + "grad_norm": 0.5602255281147202, + "learning_rate": 0.00013969532290218235, + "loss": 3.2318644523620605, + "step": 2669, + "token_acc": 0.27433683044476587 + }, + { + "epoch": 1.5652301377895046, + "grad_norm": 0.5654545428151967, + "learning_rate": 0.0001396946902677009, + "loss": 3.1959280967712402, + "step": 2670, + "token_acc": 0.27546345860527577 + }, + { + "epoch": 1.5658164761067135, + "grad_norm": 0.7558832029167432, + "learning_rate": 0.00013969405697853172, + "loss": 3.2658772468566895, + "step": 2671, + "token_acc": 0.2701158276552468 + }, + { + "epoch": 1.5664028144239226, + "grad_norm": 0.7655953713312325, + "learning_rate": 0.00013969342303468078, + "loss": 3.1789746284484863, + "step": 2672, + "token_acc": 0.28080791386659343 + }, + { + "epoch": 1.5669891527411317, + "grad_norm": 0.6751482667912649, + "learning_rate": 0.00013969278843615406, + "loss": 3.2215323448181152, + "step": 2673, + "token_acc": 0.27552628356035763 + }, + { + "epoch": 1.5675754910583408, + "grad_norm": 0.5774194415395052, + "learning_rate": 0.00013969215318295752, + "loss": 3.1797993183135986, + "step": 2674, + "token_acc": 0.2818518250126815 + }, + { + "epoch": 1.5681618293755497, + "grad_norm": 0.5120096625019415, + "learning_rate": 0.00013969151727509708, + "loss": 3.1890034675598145, + "step": 2675, + "token_acc": 0.27998902003842985 + }, + { + "epoch": 1.5687481676927586, + "grad_norm": 0.5689822047980303, + "learning_rate": 0.00013969088071257875, + "loss": 3.2042322158813477, + "step": 2676, + "token_acc": 0.2760792999527977 + }, + { + "epoch": 1.5693345060099677, + "grad_norm": 0.5593549234821941, + "learning_rate": 0.00013969024349540853, + "loss": 3.229390859603882, + "step": 2677, + "token_acc": 0.2735966472268539 + }, + { + "epoch": 1.5699208443271768, + "grad_norm": 0.44162606885340083, + "learning_rate": 0.00013968960562359234, + "loss": 3.188323736190796, + "step": 2678, + "token_acc": 0.27845438075370155 + }, + { + "epoch": 1.570507182644386, + "grad_norm": 0.4570523309689249, + "learning_rate": 0.00013968896709713624, + "loss": 3.1839938163757324, + "step": 2679, + "token_acc": 0.28123097329738195 + }, + { + "epoch": 1.5710935209615948, + "grad_norm": 0.557876063071313, + "learning_rate": 0.0001396883279160462, + "loss": 3.171645164489746, + "step": 2680, + "token_acc": 0.2812261709003574 + }, + { + "epoch": 1.5716798592788037, + "grad_norm": 0.5182152893644817, + "learning_rate": 0.00013968768808032825, + "loss": 3.1851248741149902, + "step": 2681, + "token_acc": 0.28130034614218946 + }, + { + "epoch": 1.5722661975960128, + "grad_norm": 0.5579671628290019, + "learning_rate": 0.00013968704758998834, + "loss": 3.195298910140991, + "step": 2682, + "token_acc": 0.27879305586584835 + }, + { + "epoch": 1.572852535913222, + "grad_norm": 0.48531072530361874, + "learning_rate": 0.00013968640644503253, + "loss": 3.2068495750427246, + "step": 2683, + "token_acc": 0.2770602456143915 + }, + { + "epoch": 1.573438874230431, + "grad_norm": 0.4190035322296846, + "learning_rate": 0.00013968576464546683, + "loss": 3.203878879547119, + "step": 2684, + "token_acc": 0.27686847764423395 + }, + { + "epoch": 1.57402521254764, + "grad_norm": 0.5272757450008215, + "learning_rate": 0.00013968512219129727, + "loss": 3.218924045562744, + "step": 2685, + "token_acc": 0.27657591658275066 + }, + { + "epoch": 1.574611550864849, + "grad_norm": 0.5439370644627942, + "learning_rate": 0.0001396844790825299, + "loss": 3.212144136428833, + "step": 2686, + "token_acc": 0.2770561746358467 + }, + { + "epoch": 1.575197889182058, + "grad_norm": 0.4276587152425814, + "learning_rate": 0.00013968383531917078, + "loss": 3.232213020324707, + "step": 2687, + "token_acc": 0.27350318302034743 + }, + { + "epoch": 1.575784227499267, + "grad_norm": 0.4253823130056882, + "learning_rate": 0.00013968319090122588, + "loss": 3.1527292728424072, + "step": 2688, + "token_acc": 0.28541354601028296 + }, + { + "epoch": 1.5763705658164762, + "grad_norm": 0.5334675391936817, + "learning_rate": 0.00013968254582870132, + "loss": 3.2525062561035156, + "step": 2689, + "token_acc": 0.26966163247535696 + }, + { + "epoch": 1.5769569041336853, + "grad_norm": 0.45331283165405445, + "learning_rate": 0.00013968190010160315, + "loss": 3.2137365341186523, + "step": 2690, + "token_acc": 0.27467417023725776 + }, + { + "epoch": 1.5775432424508942, + "grad_norm": 0.40249759904470883, + "learning_rate": 0.0001396812537199374, + "loss": 3.222071647644043, + "step": 2691, + "token_acc": 0.27452846955594457 + }, + { + "epoch": 1.578129580768103, + "grad_norm": 0.446160344775638, + "learning_rate": 0.00013968060668371018, + "loss": 3.1708972454071045, + "step": 2692, + "token_acc": 0.2817640237499191 + }, + { + "epoch": 1.5787159190853122, + "grad_norm": 0.551596544767136, + "learning_rate": 0.00013967995899292758, + "loss": 3.18241810798645, + "step": 2693, + "token_acc": 0.2796375059655232 + }, + { + "epoch": 1.5793022574025213, + "grad_norm": 0.5779992223814839, + "learning_rate": 0.00013967931064759565, + "loss": 3.1835827827453613, + "step": 2694, + "token_acc": 0.2802087429719302 + }, + { + "epoch": 1.5798885957197304, + "grad_norm": 0.5832957485550658, + "learning_rate": 0.00013967866164772046, + "loss": 3.22414493560791, + "step": 2695, + "token_acc": 0.27510367998320123 + }, + { + "epoch": 1.5804749340369393, + "grad_norm": 0.5158200234322589, + "learning_rate": 0.00013967801199330816, + "loss": 3.184892177581787, + "step": 2696, + "token_acc": 0.2784660582735479 + }, + { + "epoch": 1.5810612723541484, + "grad_norm": 0.482295869367091, + "learning_rate": 0.00013967736168436483, + "loss": 3.223694324493408, + "step": 2697, + "token_acc": 0.27423646322420064 + }, + { + "epoch": 1.5816476106713573, + "grad_norm": 0.5342339275734188, + "learning_rate": 0.0001396767107208966, + "loss": 3.2016854286193848, + "step": 2698, + "token_acc": 0.2786818905251036 + }, + { + "epoch": 1.5822339489885664, + "grad_norm": 0.4936678068087708, + "learning_rate": 0.0001396760591029095, + "loss": 3.1807003021240234, + "step": 2699, + "token_acc": 0.2796874307191128 + }, + { + "epoch": 1.5828202873057755, + "grad_norm": 0.5479229909323583, + "learning_rate": 0.00013967540683040977, + "loss": 3.2324390411376953, + "step": 2700, + "token_acc": 0.2740212313781052 + }, + { + "epoch": 1.5834066256229846, + "grad_norm": 0.5662327287115109, + "learning_rate": 0.00013967475390340344, + "loss": 3.219440460205078, + "step": 2701, + "token_acc": 0.2752865225979476 + }, + { + "epoch": 1.5839929639401935, + "grad_norm": 0.5067514797158786, + "learning_rate": 0.0001396741003218967, + "loss": 3.204756259918213, + "step": 2702, + "token_acc": 0.2786231329296044 + }, + { + "epoch": 1.5845793022574024, + "grad_norm": 0.4636219209374442, + "learning_rate": 0.00013967344608589572, + "loss": 3.2148818969726562, + "step": 2703, + "token_acc": 0.27683304279768045 + }, + { + "epoch": 1.5851656405746115, + "grad_norm": 0.4306301730326108, + "learning_rate": 0.00013967279119540655, + "loss": 3.1739554405212402, + "step": 2704, + "token_acc": 0.2810820494446435 + }, + { + "epoch": 1.5857519788918206, + "grad_norm": 0.5202596264646511, + "learning_rate": 0.0001396721356504354, + "loss": 3.199911594390869, + "step": 2705, + "token_acc": 0.2771548372181004 + }, + { + "epoch": 1.5863383172090297, + "grad_norm": 0.46489748586407503, + "learning_rate": 0.00013967147945098844, + "loss": 3.1799440383911133, + "step": 2706, + "token_acc": 0.27870855101894165 + }, + { + "epoch": 1.5869246555262386, + "grad_norm": 0.43485151992400345, + "learning_rate": 0.0001396708225970718, + "loss": 3.2201790809631348, + "step": 2707, + "token_acc": 0.27671206055266157 + }, + { + "epoch": 1.5875109938434475, + "grad_norm": 0.4861233043403749, + "learning_rate": 0.00013967016508869166, + "loss": 3.150667905807495, + "step": 2708, + "token_acc": 0.2831345717977504 + }, + { + "epoch": 1.5880973321606566, + "grad_norm": 0.4539914368961883, + "learning_rate": 0.00013966950692585422, + "loss": 3.1869277954101562, + "step": 2709, + "token_acc": 0.2790298133023703 + }, + { + "epoch": 1.5886836704778657, + "grad_norm": 0.48651449268368896, + "learning_rate": 0.00013966884810856563, + "loss": 3.193302869796753, + "step": 2710, + "token_acc": 0.280412287901826 + }, + { + "epoch": 1.5892700087950749, + "grad_norm": 0.45521869240156493, + "learning_rate": 0.00013966818863683208, + "loss": 3.2114064693450928, + "step": 2711, + "token_acc": 0.276478556914192 + }, + { + "epoch": 1.5898563471122837, + "grad_norm": 0.45024023826380916, + "learning_rate": 0.00013966752851065977, + "loss": 3.203510046005249, + "step": 2712, + "token_acc": 0.2784533698723987 + }, + { + "epoch": 1.5904426854294929, + "grad_norm": 0.49262720463122334, + "learning_rate": 0.00013966686773005495, + "loss": 3.147939920425415, + "step": 2713, + "token_acc": 0.28395912450487976 + }, + { + "epoch": 1.5910290237467017, + "grad_norm": 0.495082891432347, + "learning_rate": 0.00013966620629502375, + "loss": 3.203338861465454, + "step": 2714, + "token_acc": 0.2771842173364247 + }, + { + "epoch": 1.5916153620639109, + "grad_norm": 0.47087702278270255, + "learning_rate": 0.00013966554420557242, + "loss": 3.256025552749634, + "step": 2715, + "token_acc": 0.27033430051127116 + }, + { + "epoch": 1.59220170038112, + "grad_norm": 0.5172837062159663, + "learning_rate": 0.00013966488146170718, + "loss": 3.243058204650879, + "step": 2716, + "token_acc": 0.27293518034500786 + }, + { + "epoch": 1.592788038698329, + "grad_norm": 0.5399900968566896, + "learning_rate": 0.00013966421806343426, + "loss": 3.2418084144592285, + "step": 2717, + "token_acc": 0.27424793621598453 + }, + { + "epoch": 1.593374377015538, + "grad_norm": 0.5258309131959893, + "learning_rate": 0.0001396635540107599, + "loss": 3.1864821910858154, + "step": 2718, + "token_acc": 0.28033251371904055 + }, + { + "epoch": 1.5939607153327469, + "grad_norm": 0.5997194349395937, + "learning_rate": 0.0001396628893036903, + "loss": 3.1837596893310547, + "step": 2719, + "token_acc": 0.27872559911125216 + }, + { + "epoch": 1.594547053649956, + "grad_norm": 0.675793619010458, + "learning_rate": 0.00013966222394223173, + "loss": 3.2070446014404297, + "step": 2720, + "token_acc": 0.2770892645655716 + }, + { + "epoch": 1.595133391967165, + "grad_norm": 0.522843521991948, + "learning_rate": 0.00013966155792639046, + "loss": 3.1932358741760254, + "step": 2721, + "token_acc": 0.27688507513485083 + }, + { + "epoch": 1.5957197302843742, + "grad_norm": 0.4270349720290733, + "learning_rate": 0.0001396608912561727, + "loss": 3.196841239929199, + "step": 2722, + "token_acc": 0.27971137628872517 + }, + { + "epoch": 1.596306068601583, + "grad_norm": 0.5833833956154665, + "learning_rate": 0.00013966022393158473, + "loss": 3.2072596549987793, + "step": 2723, + "token_acc": 0.2758335107404888 + }, + { + "epoch": 1.5968924069187922, + "grad_norm": 0.682242413805912, + "learning_rate": 0.00013965955595263285, + "loss": 3.222935438156128, + "step": 2724, + "token_acc": 0.2738472950076335 + }, + { + "epoch": 1.597478745236001, + "grad_norm": 0.4612932820499706, + "learning_rate": 0.0001396588873193233, + "loss": 3.172611951828003, + "step": 2725, + "token_acc": 0.28070929806188155 + }, + { + "epoch": 1.5980650835532102, + "grad_norm": 0.520011141771683, + "learning_rate": 0.00013965821803166233, + "loss": 3.2100110054016113, + "step": 2726, + "token_acc": 0.27539065550979175 + }, + { + "epoch": 1.5986514218704193, + "grad_norm": 0.5972673251339681, + "learning_rate": 0.0001396575480896563, + "loss": 3.2253332138061523, + "step": 2727, + "token_acc": 0.2758635004981734 + }, + { + "epoch": 1.5992377601876284, + "grad_norm": 0.6352499263432669, + "learning_rate": 0.00013965687749331149, + "loss": 3.191011905670166, + "step": 2728, + "token_acc": 0.2788634131472889 + }, + { + "epoch": 1.5998240985048373, + "grad_norm": 0.7112319344339046, + "learning_rate": 0.00013965620624263416, + "loss": 3.3048880100250244, + "step": 2729, + "token_acc": 0.2640569734487061 + }, + { + "epoch": 1.6004104368220462, + "grad_norm": 0.4849191580174677, + "learning_rate": 0.00013965553433763065, + "loss": 3.164024591445923, + "step": 2730, + "token_acc": 0.2824449357242283 + }, + { + "epoch": 1.6009967751392553, + "grad_norm": 0.7342063589068669, + "learning_rate": 0.00013965486177830723, + "loss": 3.197573661804199, + "step": 2731, + "token_acc": 0.2776699758720548 + }, + { + "epoch": 1.6015831134564644, + "grad_norm": 0.7611103716885684, + "learning_rate": 0.00013965418856467027, + "loss": 3.235909938812256, + "step": 2732, + "token_acc": 0.274509753348482 + }, + { + "epoch": 1.6021694517736735, + "grad_norm": 0.5904509818956125, + "learning_rate": 0.00013965351469672605, + "loss": 3.190512180328369, + "step": 2733, + "token_acc": 0.2791747194076582 + }, + { + "epoch": 1.6027557900908824, + "grad_norm": 0.5907846175429431, + "learning_rate": 0.00013965284017448094, + "loss": 3.2232768535614014, + "step": 2734, + "token_acc": 0.27475108556798056 + }, + { + "epoch": 1.6033421284080913, + "grad_norm": 0.4636028391864659, + "learning_rate": 0.00013965216499794124, + "loss": 3.239224910736084, + "step": 2735, + "token_acc": 0.2703204493239914 + }, + { + "epoch": 1.6039284667253004, + "grad_norm": 0.5382538201530409, + "learning_rate": 0.00013965148916711328, + "loss": 3.229917049407959, + "step": 2736, + "token_acc": 0.2745889590592335 + }, + { + "epoch": 1.6045148050425095, + "grad_norm": 0.5707164978923229, + "learning_rate": 0.00013965081268200346, + "loss": 3.2606852054595947, + "step": 2737, + "token_acc": 0.2694431726039591 + }, + { + "epoch": 1.6051011433597187, + "grad_norm": 0.5338354633061114, + "learning_rate": 0.00013965013554261812, + "loss": 3.186835765838623, + "step": 2738, + "token_acc": 0.2792937442137283 + }, + { + "epoch": 1.6056874816769275, + "grad_norm": 0.5919801764184374, + "learning_rate": 0.00013964945774896362, + "loss": 3.17567777633667, + "step": 2739, + "token_acc": 0.2811807808186103 + }, + { + "epoch": 1.6062738199941367, + "grad_norm": 0.6276235174683067, + "learning_rate": 0.00013964877930104628, + "loss": 3.1805691719055176, + "step": 2740, + "token_acc": 0.27817791094754146 + }, + { + "epoch": 1.6068601583113455, + "grad_norm": 0.5645733494323174, + "learning_rate": 0.00013964810019887256, + "loss": 3.190262794494629, + "step": 2741, + "token_acc": 0.28169470180804174 + }, + { + "epoch": 1.6074464966285547, + "grad_norm": 0.5558738467254616, + "learning_rate": 0.00013964742044244877, + "loss": 3.171491861343384, + "step": 2742, + "token_acc": 0.28051672066620953 + }, + { + "epoch": 1.6080328349457638, + "grad_norm": 0.5455773732472631, + "learning_rate": 0.0001396467400317813, + "loss": 3.182415723800659, + "step": 2743, + "token_acc": 0.2800098489740233 + }, + { + "epoch": 1.6086191732629729, + "grad_norm": 0.4797316762069997, + "learning_rate": 0.00013964605896687657, + "loss": 3.1327149868011475, + "step": 2744, + "token_acc": 0.28742276102744013 + }, + { + "epoch": 1.6092055115801818, + "grad_norm": 0.6799038648856757, + "learning_rate": 0.00013964537724774098, + "loss": 3.1731319427490234, + "step": 2745, + "token_acc": 0.28145694499912555 + }, + { + "epoch": 1.6097918498973907, + "grad_norm": 0.532791765186325, + "learning_rate": 0.0001396446948743809, + "loss": 3.2059335708618164, + "step": 2746, + "token_acc": 0.2765414092126548 + }, + { + "epoch": 1.6103781882145998, + "grad_norm": 0.4482968429900987, + "learning_rate": 0.00013964401184680275, + "loss": 3.2309787273406982, + "step": 2747, + "token_acc": 0.2733296241970563 + }, + { + "epoch": 1.6109645265318089, + "grad_norm": 0.41854285620645476, + "learning_rate": 0.00013964332816501296, + "loss": 3.2101831436157227, + "step": 2748, + "token_acc": 0.27553718695107327 + }, + { + "epoch": 1.611550864849018, + "grad_norm": 0.4597137933538926, + "learning_rate": 0.00013964264382901795, + "loss": 3.2345402240753174, + "step": 2749, + "token_acc": 0.27235935487762974 + }, + { + "epoch": 1.6121372031662269, + "grad_norm": 0.40995225982007355, + "learning_rate": 0.00013964195883882418, + "loss": 3.2008018493652344, + "step": 2750, + "token_acc": 0.27811980313867724 + }, + { + "epoch": 1.612723541483436, + "grad_norm": 0.46317095467920927, + "learning_rate": 0.00013964127319443802, + "loss": 3.16324520111084, + "step": 2751, + "token_acc": 0.28194677153394954 + }, + { + "epoch": 1.6133098798006449, + "grad_norm": 0.4601293041107578, + "learning_rate": 0.00013964058689586593, + "loss": 3.1708736419677734, + "step": 2752, + "token_acc": 0.28267571144412657 + }, + { + "epoch": 1.613896218117854, + "grad_norm": 0.3530017130378245, + "learning_rate": 0.00013963989994311438, + "loss": 3.181173801422119, + "step": 2753, + "token_acc": 0.2804247194987021 + }, + { + "epoch": 1.614482556435063, + "grad_norm": 0.437318710448947, + "learning_rate": 0.00013963921233618983, + "loss": 3.175046920776367, + "step": 2754, + "token_acc": 0.2804499210781128 + }, + { + "epoch": 1.6150688947522722, + "grad_norm": 0.4850054206775922, + "learning_rate": 0.00013963852407509867, + "loss": 3.214479923248291, + "step": 2755, + "token_acc": 0.2767477937307782 + }, + { + "epoch": 1.6156552330694811, + "grad_norm": 0.42455507234917195, + "learning_rate": 0.00013963783515984747, + "loss": 3.2068638801574707, + "step": 2756, + "token_acc": 0.27822030329627956 + }, + { + "epoch": 1.61624157138669, + "grad_norm": 0.37455412404072197, + "learning_rate": 0.0001396371455904426, + "loss": 3.2219433784484863, + "step": 2757, + "token_acc": 0.2760048091627206 + }, + { + "epoch": 1.6168279097038991, + "grad_norm": 0.3535944796059334, + "learning_rate": 0.00013963645536689063, + "loss": 3.196207284927368, + "step": 2758, + "token_acc": 0.27640335572573316 + }, + { + "epoch": 1.6174142480211082, + "grad_norm": 0.5448609349252622, + "learning_rate": 0.00013963576448919798, + "loss": 3.2348995208740234, + "step": 2759, + "token_acc": 0.270530565313946 + }, + { + "epoch": 1.6180005863383173, + "grad_norm": 0.6827737165246361, + "learning_rate": 0.00013963507295737114, + "loss": 3.218559741973877, + "step": 2760, + "token_acc": 0.2762573284579086 + }, + { + "epoch": 1.6185869246555262, + "grad_norm": 0.6984321318031556, + "learning_rate": 0.00013963438077141665, + "loss": 3.219879150390625, + "step": 2761, + "token_acc": 0.27504082316243433 + }, + { + "epoch": 1.6191732629727351, + "grad_norm": 0.6278640522164307, + "learning_rate": 0.00013963368793134097, + "loss": 3.1340649127960205, + "step": 2762, + "token_acc": 0.2865340444582863 + }, + { + "epoch": 1.6197596012899442, + "grad_norm": 0.8259758537444716, + "learning_rate": 0.00013963299443715065, + "loss": 3.167959213256836, + "step": 2763, + "token_acc": 0.2819945812415332 + }, + { + "epoch": 1.6203459396071533, + "grad_norm": 0.8187990397862871, + "learning_rate": 0.00013963230028885215, + "loss": 3.2088723182678223, + "step": 2764, + "token_acc": 0.2753867070261396 + }, + { + "epoch": 1.6209322779243625, + "grad_norm": 0.5543251553641797, + "learning_rate": 0.00013963160548645202, + "loss": 3.1469736099243164, + "step": 2765, + "token_acc": 0.28422297736525326 + }, + { + "epoch": 1.6215186162415713, + "grad_norm": 0.573506057465699, + "learning_rate": 0.0001396309100299568, + "loss": 3.178621768951416, + "step": 2766, + "token_acc": 0.2806689445290533 + }, + { + "epoch": 1.6221049545587805, + "grad_norm": 0.6534757843808497, + "learning_rate": 0.00013963021391937298, + "loss": 3.1959521770477295, + "step": 2767, + "token_acc": 0.2804754450664615 + }, + { + "epoch": 1.6226912928759893, + "grad_norm": 0.4852435553240719, + "learning_rate": 0.00013962951715470716, + "loss": 3.1643478870391846, + "step": 2768, + "token_acc": 0.28307289744366415 + }, + { + "epoch": 1.6232776311931985, + "grad_norm": 0.505429628143384, + "learning_rate": 0.00013962881973596586, + "loss": 3.221673011779785, + "step": 2769, + "token_acc": 0.27435543831729553 + }, + { + "epoch": 1.6238639695104076, + "grad_norm": 0.490235153647356, + "learning_rate": 0.0001396281216631556, + "loss": 3.1926498413085938, + "step": 2770, + "token_acc": 0.2793360322581512 + }, + { + "epoch": 1.6244503078276167, + "grad_norm": 0.5032803750847454, + "learning_rate": 0.00013962742293628297, + "loss": 3.1885128021240234, + "step": 2771, + "token_acc": 0.279042662763593 + }, + { + "epoch": 1.6250366461448256, + "grad_norm": 0.494799300505114, + "learning_rate": 0.00013962672355535453, + "loss": 3.1932482719421387, + "step": 2772, + "token_acc": 0.2792116365208841 + }, + { + "epoch": 1.6256229844620345, + "grad_norm": 0.5800864261851814, + "learning_rate": 0.00013962602352037684, + "loss": 3.2201380729675293, + "step": 2773, + "token_acc": 0.2740198767751184 + }, + { + "epoch": 1.6262093227792436, + "grad_norm": 0.6734912719486764, + "learning_rate": 0.00013962532283135647, + "loss": 3.264524459838867, + "step": 2774, + "token_acc": 0.26961632819042064 + }, + { + "epoch": 1.6267956610964527, + "grad_norm": 0.5918088352228305, + "learning_rate": 0.00013962462148830006, + "loss": 3.212893009185791, + "step": 2775, + "token_acc": 0.27533666374236276 + }, + { + "epoch": 1.6273819994136618, + "grad_norm": 0.4830231820932037, + "learning_rate": 0.00013962391949121409, + "loss": 3.2309601306915283, + "step": 2776, + "token_acc": 0.2750350481570782 + }, + { + "epoch": 1.6279683377308707, + "grad_norm": 0.5080155525980855, + "learning_rate": 0.00013962321684010524, + "loss": 3.171574831008911, + "step": 2777, + "token_acc": 0.2811130440169109 + }, + { + "epoch": 1.6285546760480798, + "grad_norm": 0.5375617014935056, + "learning_rate": 0.0001396225135349801, + "loss": 3.1722097396850586, + "step": 2778, + "token_acc": 0.2818057170807348 + }, + { + "epoch": 1.6291410143652887, + "grad_norm": 0.4939016112059976, + "learning_rate": 0.00013962180957584526, + "loss": 3.18725323677063, + "step": 2779, + "token_acc": 0.2779995923721329 + }, + { + "epoch": 1.6297273526824978, + "grad_norm": 0.4674234132395943, + "learning_rate": 0.00013962110496270735, + "loss": 3.2355170249938965, + "step": 2780, + "token_acc": 0.27360591200698287 + }, + { + "epoch": 1.630313690999707, + "grad_norm": 0.4171044963550156, + "learning_rate": 0.00013962039969557294, + "loss": 3.182727813720703, + "step": 2781, + "token_acc": 0.2794037531427519 + }, + { + "epoch": 1.630900029316916, + "grad_norm": 0.467334039402825, + "learning_rate": 0.0001396196937744487, + "loss": 3.1953125, + "step": 2782, + "token_acc": 0.2793755454783246 + }, + { + "epoch": 1.631486367634125, + "grad_norm": 0.49378748855939913, + "learning_rate": 0.00013961898719934125, + "loss": 3.1549072265625, + "step": 2783, + "token_acc": 0.284544796260944 + }, + { + "epoch": 1.6320727059513338, + "grad_norm": 0.5011754345291495, + "learning_rate": 0.00013961827997025723, + "loss": 3.1880993843078613, + "step": 2784, + "token_acc": 0.2799037819197672 + }, + { + "epoch": 1.632659044268543, + "grad_norm": 0.3785572311228852, + "learning_rate": 0.0001396175720872033, + "loss": 3.19978928565979, + "step": 2785, + "token_acc": 0.2790436165967965 + }, + { + "epoch": 1.633245382585752, + "grad_norm": 0.444949052672144, + "learning_rate": 0.00013961686355018604, + "loss": 3.1619184017181396, + "step": 2786, + "token_acc": 0.28072792893542936 + }, + { + "epoch": 1.6338317209029611, + "grad_norm": 0.5981178007039663, + "learning_rate": 0.0001396161543592122, + "loss": 3.180338144302368, + "step": 2787, + "token_acc": 0.2810062286142695 + }, + { + "epoch": 1.63441805922017, + "grad_norm": 0.565508354408448, + "learning_rate": 0.0001396154445142884, + "loss": 3.2334165573120117, + "step": 2788, + "token_acc": 0.2735622078086173 + }, + { + "epoch": 1.635004397537379, + "grad_norm": 0.35395689179785456, + "learning_rate": 0.0001396147340154213, + "loss": 3.231954574584961, + "step": 2789, + "token_acc": 0.2735847583352854 + }, + { + "epoch": 1.635590735854588, + "grad_norm": 0.3909312353810073, + "learning_rate": 0.00013961402286261757, + "loss": 3.2122340202331543, + "step": 2790, + "token_acc": 0.2757272923176382 + }, + { + "epoch": 1.6361770741717971, + "grad_norm": 0.44224363044343873, + "learning_rate": 0.0001396133110558839, + "loss": 3.2273662090301514, + "step": 2791, + "token_acc": 0.27367477305838844 + }, + { + "epoch": 1.6367634124890063, + "grad_norm": 0.44102929664807083, + "learning_rate": 0.000139612598595227, + "loss": 3.1619436740875244, + "step": 2792, + "token_acc": 0.28070656502743374 + }, + { + "epoch": 1.6373497508062151, + "grad_norm": 0.44576533621555076, + "learning_rate": 0.0001396118854806535, + "loss": 3.276240825653076, + "step": 2793, + "token_acc": 0.26777507753370716 + }, + { + "epoch": 1.6379360891234243, + "grad_norm": 0.554241648591905, + "learning_rate": 0.0001396111717121702, + "loss": 3.2208337783813477, + "step": 2794, + "token_acc": 0.2754424352127777 + }, + { + "epoch": 1.6385224274406331, + "grad_norm": 0.5942441086993988, + "learning_rate": 0.0001396104572897837, + "loss": 3.173607110977173, + "step": 2795, + "token_acc": 0.28088986200216265 + }, + { + "epoch": 1.6391087657578423, + "grad_norm": 0.5742152802886762, + "learning_rate": 0.00013960974221350077, + "loss": 3.2082912921905518, + "step": 2796, + "token_acc": 0.2778953488056202 + }, + { + "epoch": 1.6396951040750514, + "grad_norm": 0.41530914580527023, + "learning_rate": 0.0001396090264833281, + "loss": 3.246802806854248, + "step": 2797, + "token_acc": 0.2713307925643618 + }, + { + "epoch": 1.6402814423922605, + "grad_norm": 0.4250795269189028, + "learning_rate": 0.00013960831009927243, + "loss": 3.2449562549591064, + "step": 2798, + "token_acc": 0.27141191951282617 + }, + { + "epoch": 1.6408677807094694, + "grad_norm": 0.4908870937666687, + "learning_rate": 0.0001396075930613405, + "loss": 3.2376413345336914, + "step": 2799, + "token_acc": 0.2719256857120746 + }, + { + "epoch": 1.6414541190266783, + "grad_norm": 0.4515410434006724, + "learning_rate": 0.000139606875369539, + "loss": 3.2563443183898926, + "step": 2800, + "token_acc": 0.2697288735675021 + }, + { + "epoch": 1.6420404573438874, + "grad_norm": 0.4513458647871023, + "learning_rate": 0.00013960615702387472, + "loss": 3.190453052520752, + "step": 2801, + "token_acc": 0.27799955306354623 + }, + { + "epoch": 1.6426267956610965, + "grad_norm": 0.48235937748981295, + "learning_rate": 0.0001396054380243544, + "loss": 3.134134292602539, + "step": 2802, + "token_acc": 0.28566873366792483 + }, + { + "epoch": 1.6432131339783056, + "grad_norm": 0.3907157073921604, + "learning_rate": 0.00013960471837098478, + "loss": 3.1899261474609375, + "step": 2803, + "token_acc": 0.27903039085203063 + }, + { + "epoch": 1.6437994722955145, + "grad_norm": 0.49388729068348364, + "learning_rate": 0.0001396039980637726, + "loss": 3.210207462310791, + "step": 2804, + "token_acc": 0.27620284617122204 + }, + { + "epoch": 1.6443858106127234, + "grad_norm": 0.5658747770635152, + "learning_rate": 0.0001396032771027247, + "loss": 3.200040340423584, + "step": 2805, + "token_acc": 0.27748167803571294 + }, + { + "epoch": 1.6449721489299325, + "grad_norm": 0.6253117280581331, + "learning_rate": 0.00013960255548784776, + "loss": 3.2142438888549805, + "step": 2806, + "token_acc": 0.27707643763281603 + }, + { + "epoch": 1.6455584872471416, + "grad_norm": 0.5587258123217902, + "learning_rate": 0.00013960183321914862, + "loss": 3.1908645629882812, + "step": 2807, + "token_acc": 0.2785951883654367 + }, + { + "epoch": 1.6461448255643507, + "grad_norm": 0.5929474032280909, + "learning_rate": 0.00013960111029663402, + "loss": 3.201730966567993, + "step": 2808, + "token_acc": 0.2761325750147088 + }, + { + "epoch": 1.6467311638815598, + "grad_norm": 0.5180526507361575, + "learning_rate": 0.0001396003867203108, + "loss": 3.177999496459961, + "step": 2809, + "token_acc": 0.2808789462202508 + }, + { + "epoch": 1.6473175021987687, + "grad_norm": 0.5157969139759756, + "learning_rate": 0.00013959966249018575, + "loss": 3.1972265243530273, + "step": 2810, + "token_acc": 0.2783909713184379 + }, + { + "epoch": 1.6479038405159776, + "grad_norm": 0.6545660161880537, + "learning_rate": 0.00013959893760626563, + "loss": 3.125580310821533, + "step": 2811, + "token_acc": 0.28896460141379726 + }, + { + "epoch": 1.6484901788331867, + "grad_norm": 0.7082665301387975, + "learning_rate": 0.0001395982120685573, + "loss": 3.1641459465026855, + "step": 2812, + "token_acc": 0.28295544526272753 + }, + { + "epoch": 1.6490765171503958, + "grad_norm": 0.5126528682643482, + "learning_rate": 0.00013959748587706754, + "loss": 3.224123001098633, + "step": 2813, + "token_acc": 0.2772183822406136 + }, + { + "epoch": 1.649662855467605, + "grad_norm": 0.41304218403096193, + "learning_rate": 0.0001395967590318032, + "loss": 3.1737208366394043, + "step": 2814, + "token_acc": 0.27829346072240785 + }, + { + "epoch": 1.6502491937848138, + "grad_norm": 0.4673682467824826, + "learning_rate": 0.00013959603153277105, + "loss": 3.2188334465026855, + "step": 2815, + "token_acc": 0.2758300101588595 + }, + { + "epoch": 1.6508355321020227, + "grad_norm": 0.42444108445130196, + "learning_rate": 0.000139595303379978, + "loss": 3.1897037029266357, + "step": 2816, + "token_acc": 0.2789581259911065 + }, + { + "epoch": 1.6514218704192318, + "grad_norm": 0.45261859045936914, + "learning_rate": 0.00013959457457343085, + "loss": 3.1412386894226074, + "step": 2817, + "token_acc": 0.2850506843412062 + }, + { + "epoch": 1.652008208736441, + "grad_norm": 0.41976741896377545, + "learning_rate": 0.00013959384511313643, + "loss": 3.198582649230957, + "step": 2818, + "token_acc": 0.27758918716287984 + }, + { + "epoch": 1.65259454705365, + "grad_norm": 0.4586116146240429, + "learning_rate": 0.00013959311499910163, + "loss": 3.136387348175049, + "step": 2819, + "token_acc": 0.28622609214131145 + }, + { + "epoch": 1.653180885370859, + "grad_norm": 0.532615142147196, + "learning_rate": 0.00013959238423133332, + "loss": 3.2432801723480225, + "step": 2820, + "token_acc": 0.2705881419076246 + }, + { + "epoch": 1.653767223688068, + "grad_norm": 0.41890660933425833, + "learning_rate": 0.0001395916528098383, + "loss": 3.201712131500244, + "step": 2821, + "token_acc": 0.2751431868984326 + }, + { + "epoch": 1.654353562005277, + "grad_norm": 0.42273798633213167, + "learning_rate": 0.00013959092073462348, + "loss": 3.155172824859619, + "step": 2822, + "token_acc": 0.2840702908862005 + }, + { + "epoch": 1.654939900322486, + "grad_norm": 0.4290900407598739, + "learning_rate": 0.00013959018800569577, + "loss": 3.1677842140197754, + "step": 2823, + "token_acc": 0.28307488934051894 + }, + { + "epoch": 1.6555262386396952, + "grad_norm": 0.45353305655906834, + "learning_rate": 0.00013958945462306198, + "loss": 3.190258026123047, + "step": 2824, + "token_acc": 0.2787606199461607 + }, + { + "epoch": 1.6561125769569043, + "grad_norm": 0.4220465906693771, + "learning_rate": 0.00013958872058672905, + "loss": 3.206526756286621, + "step": 2825, + "token_acc": 0.27541651064547945 + }, + { + "epoch": 1.6566989152741132, + "grad_norm": 0.5002023750301355, + "learning_rate": 0.00013958798589670387, + "loss": 3.1819491386413574, + "step": 2826, + "token_acc": 0.2785203823556115 + }, + { + "epoch": 1.657285253591322, + "grad_norm": 0.6574782883823983, + "learning_rate": 0.00013958725055299333, + "loss": 3.194183349609375, + "step": 2827, + "token_acc": 0.27834771781582895 + }, + { + "epoch": 1.6578715919085312, + "grad_norm": 0.6000243826460515, + "learning_rate": 0.00013958651455560434, + "loss": 3.181319236755371, + "step": 2828, + "token_acc": 0.2798694733684817 + }, + { + "epoch": 1.6584579302257403, + "grad_norm": 0.5457597968862198, + "learning_rate": 0.0001395857779045438, + "loss": 3.1828832626342773, + "step": 2829, + "token_acc": 0.280551688963033 + }, + { + "epoch": 1.6590442685429494, + "grad_norm": 0.4815398414469595, + "learning_rate": 0.00013958504059981866, + "loss": 3.2162904739379883, + "step": 2830, + "token_acc": 0.27401734731336364 + }, + { + "epoch": 1.6596306068601583, + "grad_norm": 0.45103237904296917, + "learning_rate": 0.0001395843026414358, + "loss": 3.1280031204223633, + "step": 2831, + "token_acc": 0.28766479467877487 + }, + { + "epoch": 1.6602169451773672, + "grad_norm": 0.5525690725721473, + "learning_rate": 0.00013958356402940224, + "loss": 3.175048828125, + "step": 2832, + "token_acc": 0.2814473398618541 + }, + { + "epoch": 1.6608032834945763, + "grad_norm": 0.4197294575570815, + "learning_rate": 0.0001395828247637248, + "loss": 3.1838550567626953, + "step": 2833, + "token_acc": 0.2806887666308205 + }, + { + "epoch": 1.6613896218117854, + "grad_norm": 0.5041845872963586, + "learning_rate": 0.00013958208484441054, + "loss": 3.1758861541748047, + "step": 2834, + "token_acc": 0.2798607440351421 + }, + { + "epoch": 1.6619759601289945, + "grad_norm": 0.5176367818538399, + "learning_rate": 0.00013958134427146632, + "loss": 3.2046308517456055, + "step": 2835, + "token_acc": 0.2771461434947238 + }, + { + "epoch": 1.6625622984462036, + "grad_norm": 0.5413695710973725, + "learning_rate": 0.00013958060304489916, + "loss": 3.1764774322509766, + "step": 2836, + "token_acc": 0.2796540653868833 + }, + { + "epoch": 1.6631486367634125, + "grad_norm": 0.436111820619208, + "learning_rate": 0.00013957986116471595, + "loss": 3.1729514598846436, + "step": 2837, + "token_acc": 0.2802992815894671 + }, + { + "epoch": 1.6637349750806214, + "grad_norm": 0.4158011340181606, + "learning_rate": 0.00013957911863092374, + "loss": 3.2421836853027344, + "step": 2838, + "token_acc": 0.27218788890333573 + }, + { + "epoch": 1.6643213133978305, + "grad_norm": 0.4770905298981569, + "learning_rate": 0.00013957837544352947, + "loss": 3.204979658126831, + "step": 2839, + "token_acc": 0.2788817638932935 + }, + { + "epoch": 1.6649076517150396, + "grad_norm": 0.5109351220372688, + "learning_rate": 0.00013957763160254012, + "loss": 3.1848835945129395, + "step": 2840, + "token_acc": 0.27939617008133427 + }, + { + "epoch": 1.6654939900322487, + "grad_norm": 0.4953433308211593, + "learning_rate": 0.00013957688710796267, + "loss": 3.150716781616211, + "step": 2841, + "token_acc": 0.2833807326075835 + }, + { + "epoch": 1.6660803283494576, + "grad_norm": 0.48260382194700224, + "learning_rate": 0.0001395761419598041, + "loss": 3.1916308403015137, + "step": 2842, + "token_acc": 0.2792819662784928 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.5321098351866895, + "learning_rate": 0.00013957539615807148, + "loss": 3.201857566833496, + "step": 2843, + "token_acc": 0.2777333688604672 + }, + { + "epoch": 1.6672530049838756, + "grad_norm": 0.6769952248120007, + "learning_rate": 0.00013957464970277173, + "loss": 3.209993362426758, + "step": 2844, + "token_acc": 0.2771785037853946 + }, + { + "epoch": 1.6678393433010847, + "grad_norm": 0.6247904783730667, + "learning_rate": 0.00013957390259391192, + "loss": 3.2478137016296387, + "step": 2845, + "token_acc": 0.2704606028945226 + }, + { + "epoch": 1.6684256816182939, + "grad_norm": 0.4865061364277161, + "learning_rate": 0.00013957315483149904, + "loss": 3.1949496269226074, + "step": 2846, + "token_acc": 0.27928944103289166 + }, + { + "epoch": 1.6690120199355027, + "grad_norm": 0.5200610537075193, + "learning_rate": 0.00013957240641554014, + "loss": 3.2192654609680176, + "step": 2847, + "token_acc": 0.2751930834810365 + }, + { + "epoch": 1.6695983582527119, + "grad_norm": 0.5952264104454312, + "learning_rate": 0.0001395716573460422, + "loss": 3.1681227684020996, + "step": 2848, + "token_acc": 0.28217689127771 + }, + { + "epoch": 1.6701846965699207, + "grad_norm": 0.5103471191897048, + "learning_rate": 0.0001395709076230123, + "loss": 3.1720833778381348, + "step": 2849, + "token_acc": 0.2811926652786081 + }, + { + "epoch": 1.6707710348871299, + "grad_norm": 0.43647430855270536, + "learning_rate": 0.00013957015724645747, + "loss": 3.162659168243408, + "step": 2850, + "token_acc": 0.28165469295553824 + }, + { + "epoch": 1.671357373204339, + "grad_norm": 0.5900512499680479, + "learning_rate": 0.00013956940621638475, + "loss": 3.199403762817383, + "step": 2851, + "token_acc": 0.27671144490470106 + }, + { + "epoch": 1.671943711521548, + "grad_norm": 0.4557668446225046, + "learning_rate": 0.0001395686545328012, + "loss": 3.25905704498291, + "step": 2852, + "token_acc": 0.26825369280684874 + }, + { + "epoch": 1.672530049838757, + "grad_norm": 0.45188940155679713, + "learning_rate": 0.00013956790219571392, + "loss": 3.210366725921631, + "step": 2853, + "token_acc": 0.2767666036122412 + }, + { + "epoch": 1.6731163881559659, + "grad_norm": 0.6357963831783037, + "learning_rate": 0.00013956714920512991, + "loss": 3.197892665863037, + "step": 2854, + "token_acc": 0.27773870230349923 + }, + { + "epoch": 1.673702726473175, + "grad_norm": 0.5131130881000916, + "learning_rate": 0.0001395663955610563, + "loss": 3.172527551651001, + "step": 2855, + "token_acc": 0.28195824829466515 + }, + { + "epoch": 1.674289064790384, + "grad_norm": 0.5389225955579078, + "learning_rate": 0.00013956564126350011, + "loss": 3.216134548187256, + "step": 2856, + "token_acc": 0.27647101513551 + }, + { + "epoch": 1.6748754031075932, + "grad_norm": 0.6669937565185201, + "learning_rate": 0.0001395648863124685, + "loss": 3.2324583530426025, + "step": 2857, + "token_acc": 0.2731597230490009 + }, + { + "epoch": 1.675461741424802, + "grad_norm": 0.5443823672929545, + "learning_rate": 0.00013956413070796852, + "loss": 3.1986217498779297, + "step": 2858, + "token_acc": 0.27671375199395515 + }, + { + "epoch": 1.676048079742011, + "grad_norm": 0.5074711439197201, + "learning_rate": 0.00013956337445000726, + "loss": 3.181631326675415, + "step": 2859, + "token_acc": 0.280343556190192 + }, + { + "epoch": 1.67663441805922, + "grad_norm": 0.6022047213146416, + "learning_rate": 0.00013956261753859183, + "loss": 3.174705982208252, + "step": 2860, + "token_acc": 0.279127960785658 + }, + { + "epoch": 1.6772207563764292, + "grad_norm": 0.44771633588894383, + "learning_rate": 0.00013956185997372934, + "loss": 3.1722846031188965, + "step": 2861, + "token_acc": 0.2809243925577745 + }, + { + "epoch": 1.6778070946936383, + "grad_norm": 0.5434841368538486, + "learning_rate": 0.00013956110175542693, + "loss": 3.2112722396850586, + "step": 2862, + "token_acc": 0.2757631284007533 + }, + { + "epoch": 1.6783934330108474, + "grad_norm": 0.5576693480976403, + "learning_rate": 0.00013956034288369168, + "loss": 3.179370403289795, + "step": 2863, + "token_acc": 0.2806193801019351 + }, + { + "epoch": 1.6789797713280563, + "grad_norm": 0.4765049603529197, + "learning_rate": 0.00013955958335853076, + "loss": 3.220949411392212, + "step": 2864, + "token_acc": 0.27347386901364834 + }, + { + "epoch": 1.6795661096452652, + "grad_norm": 0.539187467647457, + "learning_rate": 0.00013955882317995128, + "loss": 3.200206756591797, + "step": 2865, + "token_acc": 0.2786758172657356 + }, + { + "epoch": 1.6801524479624743, + "grad_norm": 0.43785284937773744, + "learning_rate": 0.0001395580623479604, + "loss": 3.176138401031494, + "step": 2866, + "token_acc": 0.28006461242980313 + }, + { + "epoch": 1.6807387862796834, + "grad_norm": 0.5549124981230663, + "learning_rate": 0.00013955730086256525, + "loss": 3.2419896125793457, + "step": 2867, + "token_acc": 0.2722278194493589 + }, + { + "epoch": 1.6813251245968925, + "grad_norm": 0.45518441574360713, + "learning_rate": 0.000139556538723773, + "loss": 3.207124948501587, + "step": 2868, + "token_acc": 0.2775053152288597 + }, + { + "epoch": 1.6819114629141014, + "grad_norm": 0.46256694489936956, + "learning_rate": 0.0001395557759315908, + "loss": 3.153456211090088, + "step": 2869, + "token_acc": 0.2822733824929877 + }, + { + "epoch": 1.6824978012313103, + "grad_norm": 0.46829319016249255, + "learning_rate": 0.00013955501248602575, + "loss": 3.240262031555176, + "step": 2870, + "token_acc": 0.27202005562301695 + }, + { + "epoch": 1.6830841395485194, + "grad_norm": 0.5865728864018644, + "learning_rate": 0.00013955424838708514, + "loss": 3.18632435798645, + "step": 2871, + "token_acc": 0.2772832721398916 + }, + { + "epoch": 1.6836704778657285, + "grad_norm": 0.6486525746288135, + "learning_rate": 0.00013955348363477608, + "loss": 3.1936917304992676, + "step": 2872, + "token_acc": 0.27675760479294115 + }, + { + "epoch": 1.6842568161829377, + "grad_norm": 0.49420738072095316, + "learning_rate": 0.00013955271822910576, + "loss": 3.1764581203460693, + "step": 2873, + "token_acc": 0.27913451106425663 + }, + { + "epoch": 1.6848431545001465, + "grad_norm": 0.5068494526181408, + "learning_rate": 0.00013955195217008138, + "loss": 3.197007179260254, + "step": 2874, + "token_acc": 0.2773357746815887 + }, + { + "epoch": 1.6854294928173557, + "grad_norm": 0.5318251249586854, + "learning_rate": 0.00013955118545771014, + "loss": 3.192561626434326, + "step": 2875, + "token_acc": 0.27919389694586927 + }, + { + "epoch": 1.6860158311345645, + "grad_norm": 0.5794675902339713, + "learning_rate": 0.00013955041809199923, + "loss": 3.2102742195129395, + "step": 2876, + "token_acc": 0.27673261222594464 + }, + { + "epoch": 1.6866021694517737, + "grad_norm": 0.5757665589101576, + "learning_rate": 0.00013954965007295588, + "loss": 3.2481653690338135, + "step": 2877, + "token_acc": 0.2705823846522585 + }, + { + "epoch": 1.6871885077689828, + "grad_norm": 0.6058025866406351, + "learning_rate": 0.00013954888140058725, + "loss": 3.1760916709899902, + "step": 2878, + "token_acc": 0.279160861156777 + }, + { + "epoch": 1.6877748460861919, + "grad_norm": 0.6610942810111985, + "learning_rate": 0.00013954811207490063, + "loss": 3.1604933738708496, + "step": 2879, + "token_acc": 0.28304454758020936 + }, + { + "epoch": 1.6883611844034008, + "grad_norm": 0.5414882859518526, + "learning_rate": 0.00013954734209590318, + "loss": 3.190805673599243, + "step": 2880, + "token_acc": 0.2784567913602757 + }, + { + "epoch": 1.6889475227206097, + "grad_norm": 0.4637886839462668, + "learning_rate": 0.00013954657146360218, + "loss": 3.1659021377563477, + "step": 2881, + "token_acc": 0.2838228980934497 + }, + { + "epoch": 1.6895338610378188, + "grad_norm": 0.6234768544578989, + "learning_rate": 0.0001395458001780049, + "loss": 3.219486951828003, + "step": 2882, + "token_acc": 0.2742258761715868 + }, + { + "epoch": 1.6901201993550279, + "grad_norm": 0.5641465734382756, + "learning_rate": 0.0001395450282391185, + "loss": 3.1865673065185547, + "step": 2883, + "token_acc": 0.27912131690101283 + }, + { + "epoch": 1.690706537672237, + "grad_norm": 0.4670298810627386, + "learning_rate": 0.00013954425564695027, + "loss": 3.1794309616088867, + "step": 2884, + "token_acc": 0.27910618973040074 + }, + { + "epoch": 1.6912928759894459, + "grad_norm": 0.47803505756969444, + "learning_rate": 0.00013954348240150747, + "loss": 3.2148330211639404, + "step": 2885, + "token_acc": 0.2742665479185221 + }, + { + "epoch": 1.6918792143066548, + "grad_norm": 0.44426121021753384, + "learning_rate": 0.00013954270850279735, + "loss": 3.1834311485290527, + "step": 2886, + "token_acc": 0.27809286779779996 + }, + { + "epoch": 1.692465552623864, + "grad_norm": 0.521505522671949, + "learning_rate": 0.00013954193395082724, + "loss": 3.164466619491577, + "step": 2887, + "token_acc": 0.28132161803183997 + }, + { + "epoch": 1.693051890941073, + "grad_norm": 0.4922144177860282, + "learning_rate": 0.00013954115874560433, + "loss": 3.1448230743408203, + "step": 2888, + "token_acc": 0.28569908297838653 + }, + { + "epoch": 1.6936382292582821, + "grad_norm": 0.41341266771283247, + "learning_rate": 0.00013954038288713596, + "loss": 3.1173534393310547, + "step": 2889, + "token_acc": 0.28761181175998185 + }, + { + "epoch": 1.6942245675754912, + "grad_norm": 0.3961812581171599, + "learning_rate": 0.0001395396063754294, + "loss": 3.176060438156128, + "step": 2890, + "token_acc": 0.27926382420166534 + }, + { + "epoch": 1.6948109058927001, + "grad_norm": 0.3699812721394128, + "learning_rate": 0.00013953882921049194, + "loss": 3.2018561363220215, + "step": 2891, + "token_acc": 0.27722891074145556 + }, + { + "epoch": 1.695397244209909, + "grad_norm": 0.45204004423481264, + "learning_rate": 0.00013953805139233088, + "loss": 3.2166409492492676, + "step": 2892, + "token_acc": 0.27446675505769785 + }, + { + "epoch": 1.6959835825271181, + "grad_norm": 0.5329460584295618, + "learning_rate": 0.00013953727292095354, + "loss": 3.2047715187072754, + "step": 2893, + "token_acc": 0.2760543403468903 + }, + { + "epoch": 1.6965699208443272, + "grad_norm": 0.4508953580350445, + "learning_rate": 0.0001395364937963672, + "loss": 3.195629119873047, + "step": 2894, + "token_acc": 0.276438663748726 + }, + { + "epoch": 1.6971562591615363, + "grad_norm": 0.44147930439888555, + "learning_rate": 0.00013953571401857925, + "loss": 3.17089581489563, + "step": 2895, + "token_acc": 0.2804166494397817 + }, + { + "epoch": 1.6977425974787452, + "grad_norm": 0.4524174428338693, + "learning_rate": 0.00013953493358759693, + "loss": 3.175448417663574, + "step": 2896, + "token_acc": 0.27918275915340646 + }, + { + "epoch": 1.6983289357959541, + "grad_norm": 0.39442655294684786, + "learning_rate": 0.0001395341525034276, + "loss": 3.1915037631988525, + "step": 2897, + "token_acc": 0.27754377626196486 + }, + { + "epoch": 1.6989152741131632, + "grad_norm": 0.5189595783498936, + "learning_rate": 0.00013953337076607863, + "loss": 3.1603307723999023, + "step": 2898, + "token_acc": 0.28175767589006967 + }, + { + "epoch": 1.6995016124303723, + "grad_norm": 0.42544850144807234, + "learning_rate": 0.00013953258837555733, + "loss": 3.2102208137512207, + "step": 2899, + "token_acc": 0.27531493541155116 + }, + { + "epoch": 1.7000879507475815, + "grad_norm": 0.4737847539077121, + "learning_rate": 0.00013953180533187107, + "loss": 3.2122111320495605, + "step": 2900, + "token_acc": 0.27554507323479355 + }, + { + "epoch": 1.7006742890647903, + "grad_norm": 0.4175068425662153, + "learning_rate": 0.0001395310216350272, + "loss": 3.193995475769043, + "step": 2901, + "token_acc": 0.2781991379242496 + }, + { + "epoch": 1.7012606273819995, + "grad_norm": 0.4786782901048263, + "learning_rate": 0.00013953023728503303, + "loss": 3.2242555618286133, + "step": 2902, + "token_acc": 0.2742550568371782 + }, + { + "epoch": 1.7018469656992083, + "grad_norm": 0.3466876758419877, + "learning_rate": 0.00013952945228189602, + "loss": 3.188281297683716, + "step": 2903, + "token_acc": 0.27830980561919944 + }, + { + "epoch": 1.7024333040164175, + "grad_norm": 0.4158824169137935, + "learning_rate": 0.00013952866662562347, + "loss": 3.164700984954834, + "step": 2904, + "token_acc": 0.2823380959266026 + }, + { + "epoch": 1.7030196423336266, + "grad_norm": 0.4693047461777087, + "learning_rate": 0.0001395278803162228, + "loss": 3.196770191192627, + "step": 2905, + "token_acc": 0.27674774090904364 + }, + { + "epoch": 1.7036059806508357, + "grad_norm": 0.4141556079474966, + "learning_rate": 0.0001395270933537014, + "loss": 3.162008762359619, + "step": 2906, + "token_acc": 0.2816121656330264 + }, + { + "epoch": 1.7041923189680446, + "grad_norm": 0.3867629567941042, + "learning_rate": 0.00013952630573806662, + "loss": 3.1594057083129883, + "step": 2907, + "token_acc": 0.2841587279258721 + }, + { + "epoch": 1.7047786572852535, + "grad_norm": 0.44393715671487, + "learning_rate": 0.0001395255174693259, + "loss": 3.1479883193969727, + "step": 2908, + "token_acc": 0.28350913572932424 + }, + { + "epoch": 1.7053649956024626, + "grad_norm": 0.42292607668602766, + "learning_rate": 0.0001395247285474866, + "loss": 3.217233180999756, + "step": 2909, + "token_acc": 0.274435907099328 + }, + { + "epoch": 1.7059513339196717, + "grad_norm": 0.4332760783092111, + "learning_rate": 0.00013952393897255621, + "loss": 3.211113452911377, + "step": 2910, + "token_acc": 0.275207556250752 + }, + { + "epoch": 1.7065376722368808, + "grad_norm": 0.4424804360715772, + "learning_rate": 0.00013952314874454206, + "loss": 3.2433886528015137, + "step": 2911, + "token_acc": 0.27105401355141984 + }, + { + "epoch": 1.7071240105540897, + "grad_norm": 0.41509452549170445, + "learning_rate": 0.00013952235786345162, + "loss": 3.1641430854797363, + "step": 2912, + "token_acc": 0.281877070067027 + }, + { + "epoch": 1.7077103488712986, + "grad_norm": 0.43893946826783337, + "learning_rate": 0.00013952156632929234, + "loss": 3.1805288791656494, + "step": 2913, + "token_acc": 0.2804994035528903 + }, + { + "epoch": 1.7082966871885077, + "grad_norm": 0.436286982701825, + "learning_rate": 0.0001395207741420716, + "loss": 3.205930233001709, + "step": 2914, + "token_acc": 0.27706742033431747 + }, + { + "epoch": 1.7088830255057168, + "grad_norm": 0.47046190282830164, + "learning_rate": 0.00013951998130179688, + "loss": 3.17008376121521, + "step": 2915, + "token_acc": 0.2816604637560894 + }, + { + "epoch": 1.709469363822926, + "grad_norm": 0.5599994404828619, + "learning_rate": 0.0001395191878084756, + "loss": 3.1617746353149414, + "step": 2916, + "token_acc": 0.2810939172014868 + }, + { + "epoch": 1.7100557021401348, + "grad_norm": 0.4988045832552724, + "learning_rate": 0.00013951839366211524, + "loss": 3.169635772705078, + "step": 2917, + "token_acc": 0.28143373659005566 + }, + { + "epoch": 1.710642040457344, + "grad_norm": 0.4080732724867815, + "learning_rate": 0.00013951759886272325, + "loss": 3.180755138397217, + "step": 2918, + "token_acc": 0.279463860236553 + }, + { + "epoch": 1.7112283787745528, + "grad_norm": 0.4625266067108884, + "learning_rate": 0.00013951680341030707, + "loss": 3.2208359241485596, + "step": 2919, + "token_acc": 0.27521771170076004 + }, + { + "epoch": 1.711814717091762, + "grad_norm": 0.5395260563757204, + "learning_rate": 0.00013951600730487422, + "loss": 3.1578030586242676, + "step": 2920, + "token_acc": 0.28264173147769134 + }, + { + "epoch": 1.712401055408971, + "grad_norm": 0.4591262318914862, + "learning_rate": 0.00013951521054643214, + "loss": 3.1954503059387207, + "step": 2921, + "token_acc": 0.2763648047815842 + }, + { + "epoch": 1.7129873937261801, + "grad_norm": 0.43640060244321155, + "learning_rate": 0.00013951441313498836, + "loss": 3.209993839263916, + "step": 2922, + "token_acc": 0.2748840943821389 + }, + { + "epoch": 1.713573732043389, + "grad_norm": 0.4616115772308012, + "learning_rate": 0.0001395136150705503, + "loss": 3.176500082015991, + "step": 2923, + "token_acc": 0.2788897791905844 + }, + { + "epoch": 1.714160070360598, + "grad_norm": 0.494265904624796, + "learning_rate": 0.00013951281635312554, + "loss": 3.2003817558288574, + "step": 2924, + "token_acc": 0.27823480956406754 + }, + { + "epoch": 1.714746408677807, + "grad_norm": 0.5635356302275609, + "learning_rate": 0.0001395120169827215, + "loss": 3.1935057640075684, + "step": 2925, + "token_acc": 0.28065587915744833 + }, + { + "epoch": 1.7153327469950161, + "grad_norm": 0.5127042386437264, + "learning_rate": 0.00013951121695934574, + "loss": 3.1515421867370605, + "step": 2926, + "token_acc": 0.2841111085085507 + }, + { + "epoch": 1.7159190853122253, + "grad_norm": 0.4841648993396093, + "learning_rate": 0.00013951041628300579, + "loss": 3.154813528060913, + "step": 2927, + "token_acc": 0.2831344999361349 + }, + { + "epoch": 1.7165054236294341, + "grad_norm": 0.48802315656140666, + "learning_rate": 0.0001395096149537091, + "loss": 3.231116771697998, + "step": 2928, + "token_acc": 0.27359472862183754 + }, + { + "epoch": 1.7170917619466433, + "grad_norm": 0.4583969628442934, + "learning_rate": 0.00013950881297146328, + "loss": 3.1783218383789062, + "step": 2929, + "token_acc": 0.2786178434564167 + }, + { + "epoch": 1.7176781002638521, + "grad_norm": 0.4457269334996295, + "learning_rate": 0.0001395080103362758, + "loss": 3.1708686351776123, + "step": 2930, + "token_acc": 0.2817298153088552 + }, + { + "epoch": 1.7182644385810613, + "grad_norm": 0.5076840065368774, + "learning_rate": 0.00013950720704815426, + "loss": 3.167386531829834, + "step": 2931, + "token_acc": 0.28267361662574625 + }, + { + "epoch": 1.7188507768982704, + "grad_norm": 0.38586640155236157, + "learning_rate": 0.00013950640310710617, + "loss": 3.2086544036865234, + "step": 2932, + "token_acc": 0.2757667722636479 + }, + { + "epoch": 1.7194371152154795, + "grad_norm": 0.47240231386336135, + "learning_rate": 0.00013950559851313906, + "loss": 3.215143918991089, + "step": 2933, + "token_acc": 0.2740819670375566 + }, + { + "epoch": 1.7200234535326884, + "grad_norm": 0.5098640838817804, + "learning_rate": 0.00013950479326626052, + "loss": 3.2364985942840576, + "step": 2934, + "token_acc": 0.272879769156781 + }, + { + "epoch": 1.7206097918498973, + "grad_norm": 0.5119325405472774, + "learning_rate": 0.0001395039873664781, + "loss": 3.208951473236084, + "step": 2935, + "token_acc": 0.277185153777741 + }, + { + "epoch": 1.7211961301671064, + "grad_norm": 0.4316573538361131, + "learning_rate": 0.00013950318081379937, + "loss": 3.143176317214966, + "step": 2936, + "token_acc": 0.2840859237904237 + }, + { + "epoch": 1.7217824684843155, + "grad_norm": 0.5975299782852675, + "learning_rate": 0.00013950237360823192, + "loss": 3.141960620880127, + "step": 2937, + "token_acc": 0.2846559847756267 + }, + { + "epoch": 1.7223688068015246, + "grad_norm": 0.5502406246931524, + "learning_rate": 0.00013950156574978336, + "loss": 3.179710865020752, + "step": 2938, + "token_acc": 0.2816832027757104 + }, + { + "epoch": 1.7229551451187335, + "grad_norm": 0.36996452104275485, + "learning_rate": 0.0001395007572384612, + "loss": 3.1909027099609375, + "step": 2939, + "token_acc": 0.27840222008619886 + }, + { + "epoch": 1.7235414834359424, + "grad_norm": 0.5222221648602318, + "learning_rate": 0.0001394999480742731, + "loss": 3.1930480003356934, + "step": 2940, + "token_acc": 0.27837943364432205 + }, + { + "epoch": 1.7241278217531515, + "grad_norm": 0.5038578229363613, + "learning_rate": 0.00013949913825722664, + "loss": 3.1991958618164062, + "step": 2941, + "token_acc": 0.275468227729711 + }, + { + "epoch": 1.7247141600703606, + "grad_norm": 0.3647906137176386, + "learning_rate": 0.0001394983277873294, + "loss": 3.1558191776275635, + "step": 2942, + "token_acc": 0.2833918374179902 + }, + { + "epoch": 1.7253004983875697, + "grad_norm": 0.45509694732618405, + "learning_rate": 0.00013949751666458905, + "loss": 3.2349202632904053, + "step": 2943, + "token_acc": 0.27253721855126095 + }, + { + "epoch": 1.7258868367047786, + "grad_norm": 0.4320660772498575, + "learning_rate": 0.00013949670488901317, + "loss": 3.1631388664245605, + "step": 2944, + "token_acc": 0.28173722578132554 + }, + { + "epoch": 1.7264731750219877, + "grad_norm": 0.4350348818562048, + "learning_rate": 0.0001394958924606094, + "loss": 3.154768228530884, + "step": 2945, + "token_acc": 0.2834255377863047 + }, + { + "epoch": 1.7270595133391966, + "grad_norm": 0.42446626514713987, + "learning_rate": 0.00013949507937938537, + "loss": 3.151427984237671, + "step": 2946, + "token_acc": 0.2828361886540859 + }, + { + "epoch": 1.7276458516564057, + "grad_norm": 0.5093761388172953, + "learning_rate": 0.0001394942656453487, + "loss": 3.2252631187438965, + "step": 2947, + "token_acc": 0.27546640077667867 + }, + { + "epoch": 1.7282321899736148, + "grad_norm": 0.5503227731438836, + "learning_rate": 0.00013949345125850707, + "loss": 3.2185187339782715, + "step": 2948, + "token_acc": 0.27485536942632044 + }, + { + "epoch": 1.728818528290824, + "grad_norm": 0.4997795171607372, + "learning_rate": 0.0001394926362188681, + "loss": 3.205934762954712, + "step": 2949, + "token_acc": 0.27448801966004505 + }, + { + "epoch": 1.7294048666080328, + "grad_norm": 0.550953038707095, + "learning_rate": 0.00013949182052643946, + "loss": 3.1781582832336426, + "step": 2950, + "token_acc": 0.28031440154329285 + }, + { + "epoch": 1.7299912049252417, + "grad_norm": 0.6086567028349118, + "learning_rate": 0.0001394910041812288, + "loss": 3.195383071899414, + "step": 2951, + "token_acc": 0.2771617887842816 + }, + { + "epoch": 1.7305775432424508, + "grad_norm": 0.5316814126561198, + "learning_rate": 0.0001394901871832438, + "loss": 3.1902315616607666, + "step": 2952, + "token_acc": 0.2773252560188636 + }, + { + "epoch": 1.73116388155966, + "grad_norm": 0.5605691783151437, + "learning_rate": 0.0001394893695324921, + "loss": 3.2069578170776367, + "step": 2953, + "token_acc": 0.27646928936812276 + }, + { + "epoch": 1.731750219876869, + "grad_norm": 0.5356918339824384, + "learning_rate": 0.00013948855122898146, + "loss": 3.163416624069214, + "step": 2954, + "token_acc": 0.2821670249815554 + }, + { + "epoch": 1.732336558194078, + "grad_norm": 0.5237445660874269, + "learning_rate": 0.00013948773227271947, + "loss": 3.1913068294525146, + "step": 2955, + "token_acc": 0.2784569291100432 + }, + { + "epoch": 1.732922896511287, + "grad_norm": 0.6240798691395383, + "learning_rate": 0.00013948691266371392, + "loss": 3.184932231903076, + "step": 2956, + "token_acc": 0.27978044917183287 + }, + { + "epoch": 1.733509234828496, + "grad_norm": 0.4967175673780042, + "learning_rate": 0.00013948609240197244, + "loss": 3.1733694076538086, + "step": 2957, + "token_acc": 0.2814316899158769 + }, + { + "epoch": 1.734095573145705, + "grad_norm": 0.5216363535558579, + "learning_rate": 0.00013948527148750276, + "loss": 3.158154010772705, + "step": 2958, + "token_acc": 0.2809403226648567 + }, + { + "epoch": 1.7346819114629142, + "grad_norm": 0.44785391395681656, + "learning_rate": 0.00013948444992031256, + "loss": 3.224536418914795, + "step": 2959, + "token_acc": 0.27393745072829534 + }, + { + "epoch": 1.7352682497801233, + "grad_norm": 0.3939378560534969, + "learning_rate": 0.00013948362770040961, + "loss": 3.218186378479004, + "step": 2960, + "token_acc": 0.2743979484664147 + }, + { + "epoch": 1.7358545880973322, + "grad_norm": 0.4097630621572904, + "learning_rate": 0.00013948280482780162, + "loss": 3.1982107162475586, + "step": 2961, + "token_acc": 0.27736682731488355 + }, + { + "epoch": 1.736440926414541, + "grad_norm": 0.4257859017118626, + "learning_rate": 0.0001394819813024963, + "loss": 3.1899404525756836, + "step": 2962, + "token_acc": 0.27826653893235404 + }, + { + "epoch": 1.7370272647317502, + "grad_norm": 0.4624072721485425, + "learning_rate": 0.00013948115712450137, + "loss": 3.1601107120513916, + "step": 2963, + "token_acc": 0.2818528379681282 + }, + { + "epoch": 1.7376136030489593, + "grad_norm": 0.4405406604915479, + "learning_rate": 0.0001394803322938246, + "loss": 3.167111396789551, + "step": 2964, + "token_acc": 0.28262688596605917 + }, + { + "epoch": 1.7381999413661684, + "grad_norm": 0.5062994772396637, + "learning_rate": 0.00013947950681047377, + "loss": 3.2587504386901855, + "step": 2965, + "token_acc": 0.26847137745736654 + }, + { + "epoch": 1.7387862796833773, + "grad_norm": 0.5400153325101194, + "learning_rate": 0.00013947868067445656, + "loss": 3.2397029399871826, + "step": 2966, + "token_acc": 0.2705121842855324 + }, + { + "epoch": 1.7393726180005862, + "grad_norm": 0.5733149632288351, + "learning_rate": 0.0001394778538857808, + "loss": 3.2459867000579834, + "step": 2967, + "token_acc": 0.270364892688701 + }, + { + "epoch": 1.7399589563177953, + "grad_norm": 0.5708080986055255, + "learning_rate": 0.00013947702644445418, + "loss": 3.160269021987915, + "step": 2968, + "token_acc": 0.2834344190023245 + }, + { + "epoch": 1.7405452946350044, + "grad_norm": 0.553060852098264, + "learning_rate": 0.00013947619835048456, + "loss": 3.182892322540283, + "step": 2969, + "token_acc": 0.2796223575289047 + }, + { + "epoch": 1.7411316329522135, + "grad_norm": 0.48805625600322816, + "learning_rate": 0.00013947536960387966, + "loss": 3.192620038986206, + "step": 2970, + "token_acc": 0.27661618739129074 + }, + { + "epoch": 1.7417179712694224, + "grad_norm": 0.5672247990786181, + "learning_rate": 0.0001394745402046473, + "loss": 3.1784310340881348, + "step": 2971, + "token_acc": 0.276687059600326 + }, + { + "epoch": 1.7423043095866315, + "grad_norm": 0.4188954713964969, + "learning_rate": 0.00013947371015279522, + "loss": 3.1774685382843018, + "step": 2972, + "token_acc": 0.2790063807998319 + }, + { + "epoch": 1.7428906479038404, + "grad_norm": 0.5455829113753253, + "learning_rate": 0.00013947287944833127, + "loss": 3.228214979171753, + "step": 2973, + "token_acc": 0.2729039841690077 + }, + { + "epoch": 1.7434769862210495, + "grad_norm": 0.415223071559554, + "learning_rate": 0.00013947204809126323, + "loss": 3.116982936859131, + "step": 2974, + "token_acc": 0.2868892478535701 + }, + { + "epoch": 1.7440633245382586, + "grad_norm": 0.5000350945195606, + "learning_rate": 0.0001394712160815989, + "loss": 3.116703510284424, + "step": 2975, + "token_acc": 0.29025964941370325 + }, + { + "epoch": 1.7446496628554677, + "grad_norm": 0.5189122809087943, + "learning_rate": 0.00013947038341934612, + "loss": 3.1880321502685547, + "step": 2976, + "token_acc": 0.2803163158855919 + }, + { + "epoch": 1.7452360011726766, + "grad_norm": 0.47862244713686014, + "learning_rate": 0.00013946955010451273, + "loss": 3.1766796112060547, + "step": 2977, + "token_acc": 0.2812102597946518 + }, + { + "epoch": 1.7458223394898855, + "grad_norm": 0.4903419555635945, + "learning_rate": 0.00013946871613710647, + "loss": 3.1763789653778076, + "step": 2978, + "token_acc": 0.2796772453359855 + }, + { + "epoch": 1.7464086778070946, + "grad_norm": 0.4768659895339211, + "learning_rate": 0.00013946788151713527, + "loss": 3.192786931991577, + "step": 2979, + "token_acc": 0.27897821179104754 + }, + { + "epoch": 1.7469950161243037, + "grad_norm": 0.48316155906323704, + "learning_rate": 0.00013946704624460694, + "loss": 3.2073707580566406, + "step": 2980, + "token_acc": 0.27564174334337593 + }, + { + "epoch": 1.7475813544415129, + "grad_norm": 0.4171233387417619, + "learning_rate": 0.0001394662103195293, + "loss": 3.199120283126831, + "step": 2981, + "token_acc": 0.2773482712762483 + }, + { + "epoch": 1.7481676927587217, + "grad_norm": 0.45221568146353547, + "learning_rate": 0.00013946537374191022, + "loss": 3.1399085521698, + "step": 2982, + "token_acc": 0.28386914107744976 + }, + { + "epoch": 1.7487540310759309, + "grad_norm": 0.4769872415687329, + "learning_rate": 0.00013946453651175758, + "loss": 3.165022850036621, + "step": 2983, + "token_acc": 0.28225883909800115 + }, + { + "epoch": 1.7493403693931397, + "grad_norm": 0.5051013868212123, + "learning_rate": 0.0001394636986290792, + "loss": 3.1957502365112305, + "step": 2984, + "token_acc": 0.275668699332427 + }, + { + "epoch": 1.7499267077103489, + "grad_norm": 0.40459847663417486, + "learning_rate": 0.00013946286009388297, + "loss": 3.159456253051758, + "step": 2985, + "token_acc": 0.2827267916069983 + }, + { + "epoch": 1.750513046027558, + "grad_norm": 0.4676264988888676, + "learning_rate": 0.0001394620209061768, + "loss": 3.1672000885009766, + "step": 2986, + "token_acc": 0.2810569637682441 + }, + { + "epoch": 1.751099384344767, + "grad_norm": 0.5396543774404716, + "learning_rate": 0.00013946118106596852, + "loss": 3.126330614089966, + "step": 2987, + "token_acc": 0.2854535028697386 + }, + { + "epoch": 1.751685722661976, + "grad_norm": 0.5863893412891895, + "learning_rate": 0.00013946034057326606, + "loss": 3.2014126777648926, + "step": 2988, + "token_acc": 0.27612037606209866 + }, + { + "epoch": 1.7522720609791849, + "grad_norm": 0.4906860736010148, + "learning_rate": 0.0001394594994280773, + "loss": 3.1664700508117676, + "step": 2989, + "token_acc": 0.28078000180675466 + }, + { + "epoch": 1.752858399296394, + "grad_norm": 0.49839301063087105, + "learning_rate": 0.00013945865763041014, + "loss": 3.1140999794006348, + "step": 2990, + "token_acc": 0.29097877821586215 + }, + { + "epoch": 1.753444737613603, + "grad_norm": 0.5304102678540147, + "learning_rate": 0.00013945781518027246, + "loss": 3.157644748687744, + "step": 2991, + "token_acc": 0.2822222625363151 + }, + { + "epoch": 1.7540310759308122, + "grad_norm": 0.5632956151026114, + "learning_rate": 0.00013945697207767222, + "loss": 3.15358829498291, + "step": 2992, + "token_acc": 0.28113072146935547 + }, + { + "epoch": 1.754617414248021, + "grad_norm": 0.4310130893586471, + "learning_rate": 0.00013945612832261733, + "loss": 3.1892380714416504, + "step": 2993, + "token_acc": 0.27931632189888167 + }, + { + "epoch": 1.75520375256523, + "grad_norm": 0.4666505466780538, + "learning_rate": 0.0001394552839151157, + "loss": 3.202209949493408, + "step": 2994, + "token_acc": 0.2760693672087078 + }, + { + "epoch": 1.755790090882439, + "grad_norm": 0.5529548083585759, + "learning_rate": 0.00013945443885517527, + "loss": 3.1285104751586914, + "step": 2995, + "token_acc": 0.2852018265987976 + }, + { + "epoch": 1.7563764291996482, + "grad_norm": 0.41934765247948497, + "learning_rate": 0.000139453593142804, + "loss": 3.1727442741394043, + "step": 2996, + "token_acc": 0.2803262238514074 + }, + { + "epoch": 1.7569627675168573, + "grad_norm": 0.5349107559300248, + "learning_rate": 0.0001394527467780098, + "loss": 3.1704483032226562, + "step": 2997, + "token_acc": 0.28015986414625826 + }, + { + "epoch": 1.7575491058340662, + "grad_norm": 0.5099112084966309, + "learning_rate": 0.0001394518997608006, + "loss": 3.235971450805664, + "step": 2998, + "token_acc": 0.270647931303669 + }, + { + "epoch": 1.7581354441512753, + "grad_norm": 0.511299699510549, + "learning_rate": 0.00013945105209118444, + "loss": 3.153714418411255, + "step": 2999, + "token_acc": 0.28118182620647914 + }, + { + "epoch": 1.7587217824684842, + "grad_norm": 0.4967359037942677, + "learning_rate": 0.0001394502037691692, + "loss": 3.1984848976135254, + "step": 3000, + "token_acc": 0.2775382521570002 + }, + { + "epoch": 1.7593081207856933, + "grad_norm": 0.4227366080248958, + "learning_rate": 0.0001394493547947629, + "loss": 3.1918160915374756, + "step": 3001, + "token_acc": 0.2781750766791329 + }, + { + "epoch": 1.7598944591029024, + "grad_norm": 0.5150412828720263, + "learning_rate": 0.00013944850516797348, + "loss": 3.182196855545044, + "step": 3002, + "token_acc": 0.28004044987105264 + }, + { + "epoch": 1.7604807974201115, + "grad_norm": 0.37625441542999466, + "learning_rate": 0.00013944765488880893, + "loss": 3.169992446899414, + "step": 3003, + "token_acc": 0.2814530872512892 + }, + { + "epoch": 1.7610671357373204, + "grad_norm": 0.434386105441463, + "learning_rate": 0.00013944680395727726, + "loss": 3.172844171524048, + "step": 3004, + "token_acc": 0.27954541152803053 + }, + { + "epoch": 1.7616534740545293, + "grad_norm": 0.3943852902861319, + "learning_rate": 0.00013944595237338646, + "loss": 3.1710798740386963, + "step": 3005, + "token_acc": 0.27874392416185956 + }, + { + "epoch": 1.7622398123717384, + "grad_norm": 0.47467158255216385, + "learning_rate": 0.00013944510013714448, + "loss": 3.1470108032226562, + "step": 3006, + "token_acc": 0.2832109614535113 + }, + { + "epoch": 1.7628261506889475, + "grad_norm": 0.39725120203159514, + "learning_rate": 0.00013944424724855937, + "loss": 3.1844558715820312, + "step": 3007, + "token_acc": 0.2771128710457774 + }, + { + "epoch": 1.7634124890061567, + "grad_norm": 0.37136013478850843, + "learning_rate": 0.00013944339370763916, + "loss": 3.1456289291381836, + "step": 3008, + "token_acc": 0.2834135860746811 + }, + { + "epoch": 1.7639988273233655, + "grad_norm": 0.43207921243122227, + "learning_rate": 0.00013944253951439183, + "loss": 3.1810903549194336, + "step": 3009, + "token_acc": 0.27998436482084693 + }, + { + "epoch": 1.7645851656405747, + "grad_norm": 0.4221732135480788, + "learning_rate": 0.00013944168466882543, + "loss": 3.1915488243103027, + "step": 3010, + "token_acc": 0.2781500864334936 + }, + { + "epoch": 1.7651715039577835, + "grad_norm": 0.47347331784116814, + "learning_rate": 0.00013944082917094795, + "loss": 3.1884632110595703, + "step": 3011, + "token_acc": 0.2793217467344373 + }, + { + "epoch": 1.7657578422749927, + "grad_norm": 0.43010201307217233, + "learning_rate": 0.00013943997302076747, + "loss": 3.1599695682525635, + "step": 3012, + "token_acc": 0.28175054704595187 + }, + { + "epoch": 1.7663441805922018, + "grad_norm": 0.3591985550750633, + "learning_rate": 0.000139439116218292, + "loss": 3.1863489151000977, + "step": 3013, + "token_acc": 0.27946587552701 + }, + { + "epoch": 1.7669305189094109, + "grad_norm": 0.5044686289104184, + "learning_rate": 0.00013943825876352962, + "loss": 3.190765857696533, + "step": 3014, + "token_acc": 0.277968006691411 + }, + { + "epoch": 1.7675168572266198, + "grad_norm": 0.5064167124304537, + "learning_rate": 0.00013943740065648836, + "loss": 3.166590690612793, + "step": 3015, + "token_acc": 0.28248198596430835 + }, + { + "epoch": 1.7681031955438287, + "grad_norm": 0.5949833317644018, + "learning_rate": 0.0001394365418971763, + "loss": 3.1810693740844727, + "step": 3016, + "token_acc": 0.28107884139788425 + }, + { + "epoch": 1.7686895338610378, + "grad_norm": 0.6401682602803276, + "learning_rate": 0.0001394356824856015, + "loss": 3.190455436706543, + "step": 3017, + "token_acc": 0.2764243300501564 + }, + { + "epoch": 1.7692758721782469, + "grad_norm": 0.511690273723341, + "learning_rate": 0.000139434822421772, + "loss": 3.1694912910461426, + "step": 3018, + "token_acc": 0.28136341278598304 + }, + { + "epoch": 1.769862210495456, + "grad_norm": 0.5526399574080202, + "learning_rate": 0.0001394339617056959, + "loss": 3.1650853157043457, + "step": 3019, + "token_acc": 0.2824990288611954 + }, + { + "epoch": 1.770448548812665, + "grad_norm": 0.605855518965393, + "learning_rate": 0.00013943310033738134, + "loss": 3.2166709899902344, + "step": 3020, + "token_acc": 0.274029546297441 + }, + { + "epoch": 1.7710348871298738, + "grad_norm": 0.48482368847720975, + "learning_rate": 0.00013943223831683633, + "loss": 3.206500768661499, + "step": 3021, + "token_acc": 0.27586553086394966 + }, + { + "epoch": 1.771621225447083, + "grad_norm": 0.4961074485617399, + "learning_rate": 0.00013943137564406902, + "loss": 3.1531145572662354, + "step": 3022, + "token_acc": 0.282132487433008 + }, + { + "epoch": 1.772207563764292, + "grad_norm": 0.48361406338130825, + "learning_rate": 0.00013943051231908747, + "loss": 3.1888012886047363, + "step": 3023, + "token_acc": 0.2773923001329244 + }, + { + "epoch": 1.7727939020815011, + "grad_norm": 0.4330257959736322, + "learning_rate": 0.00013942964834189986, + "loss": 3.1647696495056152, + "step": 3024, + "token_acc": 0.28169150673744386 + }, + { + "epoch": 1.77338024039871, + "grad_norm": 0.50388100863475, + "learning_rate": 0.00013942878371251424, + "loss": 3.159562110900879, + "step": 3025, + "token_acc": 0.28265711632308416 + }, + { + "epoch": 1.7739665787159191, + "grad_norm": 0.4441763983328152, + "learning_rate": 0.00013942791843093874, + "loss": 3.2118473052978516, + "step": 3026, + "token_acc": 0.2745061063664018 + }, + { + "epoch": 1.774552917033128, + "grad_norm": 0.458876206146385, + "learning_rate": 0.0001394270524971815, + "loss": 3.1720352172851562, + "step": 3027, + "token_acc": 0.2802787467395745 + }, + { + "epoch": 1.7751392553503371, + "grad_norm": 0.4600232339135988, + "learning_rate": 0.00013942618591125067, + "loss": 3.1900811195373535, + "step": 3028, + "token_acc": 0.2795505570715754 + }, + { + "epoch": 1.7757255936675462, + "grad_norm": 0.3889859574534732, + "learning_rate": 0.00013942531867315437, + "loss": 3.123332977294922, + "step": 3029, + "token_acc": 0.2884469449169174 + }, + { + "epoch": 1.7763119319847553, + "grad_norm": 0.48405913544379675, + "learning_rate": 0.00013942445078290078, + "loss": 3.1768553256988525, + "step": 3030, + "token_acc": 0.2789639352247728 + }, + { + "epoch": 1.7768982703019642, + "grad_norm": 0.4722057542885997, + "learning_rate": 0.00013942358224049799, + "loss": 3.175858974456787, + "step": 3031, + "token_acc": 0.2792553675459416 + }, + { + "epoch": 1.7774846086191731, + "grad_norm": 0.4350024296163273, + "learning_rate": 0.0001394227130459542, + "loss": 3.190156936645508, + "step": 3032, + "token_acc": 0.2774042337667707 + }, + { + "epoch": 1.7780709469363822, + "grad_norm": 0.5046064488540578, + "learning_rate": 0.00013942184319927758, + "loss": 3.142198324203491, + "step": 3033, + "token_acc": 0.2827607620142467 + }, + { + "epoch": 1.7786572852535913, + "grad_norm": 0.5800926879351861, + "learning_rate": 0.00013942097270047628, + "loss": 3.166781425476074, + "step": 3034, + "token_acc": 0.2803894737123989 + }, + { + "epoch": 1.7792436235708005, + "grad_norm": 0.5709123615696609, + "learning_rate": 0.0001394201015495585, + "loss": 3.206200122833252, + "step": 3035, + "token_acc": 0.2765094669325494 + }, + { + "epoch": 1.7798299618880093, + "grad_norm": 0.3972953256325521, + "learning_rate": 0.0001394192297465324, + "loss": 3.1656837463378906, + "step": 3036, + "token_acc": 0.2807137919426006 + }, + { + "epoch": 1.7804163002052185, + "grad_norm": 0.39585383171947364, + "learning_rate": 0.0001394183572914062, + "loss": 3.1558756828308105, + "step": 3037, + "token_acc": 0.28343611574075284 + }, + { + "epoch": 1.7810026385224274, + "grad_norm": 0.39699858045426384, + "learning_rate": 0.00013941748418418805, + "loss": 3.1890172958374023, + "step": 3038, + "token_acc": 0.2774416237359775 + }, + { + "epoch": 1.7815889768396365, + "grad_norm": 0.42707524167193733, + "learning_rate": 0.00013941661042488618, + "loss": 3.1542139053344727, + "step": 3039, + "token_acc": 0.2819716446668392 + }, + { + "epoch": 1.7821753151568456, + "grad_norm": 0.39691244402946485, + "learning_rate": 0.00013941573601350879, + "loss": 3.1789612770080566, + "step": 3040, + "token_acc": 0.2800704547927733 + }, + { + "epoch": 1.7827616534740547, + "grad_norm": 0.38438277205573457, + "learning_rate": 0.00013941486095006412, + "loss": 3.180464744567871, + "step": 3041, + "token_acc": 0.27963271697214465 + }, + { + "epoch": 1.7833479917912636, + "grad_norm": 0.4089080870739832, + "learning_rate": 0.00013941398523456037, + "loss": 3.1594676971435547, + "step": 3042, + "token_acc": 0.28323771278162574 + }, + { + "epoch": 1.7839343301084725, + "grad_norm": 0.3971799476222808, + "learning_rate": 0.00013941310886700576, + "loss": 3.214721202850342, + "step": 3043, + "token_acc": 0.2776881145788799 + }, + { + "epoch": 1.7845206684256816, + "grad_norm": 0.4160673802639119, + "learning_rate": 0.00013941223184740849, + "loss": 3.1675150394439697, + "step": 3044, + "token_acc": 0.2799491331879996 + }, + { + "epoch": 1.7851070067428907, + "grad_norm": 0.4317510544713243, + "learning_rate": 0.0001394113541757769, + "loss": 3.1531519889831543, + "step": 3045, + "token_acc": 0.28322325035704343 + }, + { + "epoch": 1.7856933450600998, + "grad_norm": 0.47623892992211325, + "learning_rate": 0.00013941047585211912, + "loss": 3.185746192932129, + "step": 3046, + "token_acc": 0.27996067324421753 + }, + { + "epoch": 1.7862796833773087, + "grad_norm": 0.47806636759503185, + "learning_rate": 0.00013940959687644349, + "loss": 3.2195448875427246, + "step": 3047, + "token_acc": 0.27533950478468266 + }, + { + "epoch": 1.7868660216945176, + "grad_norm": 0.37498587410266465, + "learning_rate": 0.00013940871724875818, + "loss": 3.2072198390960693, + "step": 3048, + "token_acc": 0.27563439388392835 + }, + { + "epoch": 1.7874523600117267, + "grad_norm": 0.5279993112248405, + "learning_rate": 0.00013940783696907153, + "loss": 3.138835906982422, + "step": 3049, + "token_acc": 0.2844757545889143 + }, + { + "epoch": 1.7880386983289358, + "grad_norm": 0.5394221774295284, + "learning_rate": 0.0001394069560373918, + "loss": 3.173534393310547, + "step": 3050, + "token_acc": 0.2802749729676081 + }, + { + "epoch": 1.788625036646145, + "grad_norm": 0.4729912055712406, + "learning_rate": 0.00013940607445372721, + "loss": 3.1679487228393555, + "step": 3051, + "token_acc": 0.2818582286089384 + }, + { + "epoch": 1.7892113749633538, + "grad_norm": 0.43670485127496467, + "learning_rate": 0.0001394051922180861, + "loss": 3.1542611122131348, + "step": 3052, + "token_acc": 0.28249610188527 + }, + { + "epoch": 1.789797713280563, + "grad_norm": 0.41545786021961273, + "learning_rate": 0.00013940430933047672, + "loss": 3.1426453590393066, + "step": 3053, + "token_acc": 0.28462744720250904 + }, + { + "epoch": 1.7903840515977718, + "grad_norm": 0.4025877851962264, + "learning_rate": 0.00013940342579090738, + "loss": 3.2097442150115967, + "step": 3054, + "token_acc": 0.27398770371611714 + }, + { + "epoch": 1.790970389914981, + "grad_norm": 0.33880724094291215, + "learning_rate": 0.00013940254159938638, + "loss": 3.130758285522461, + "step": 3055, + "token_acc": 0.28541543971667116 + }, + { + "epoch": 1.79155672823219, + "grad_norm": 0.3971113752701744, + "learning_rate": 0.00013940165675592201, + "loss": 3.192953586578369, + "step": 3056, + "token_acc": 0.27773112548170964 + }, + { + "epoch": 1.7921430665493991, + "grad_norm": 0.420127374757093, + "learning_rate": 0.00013940077126052262, + "loss": 3.1329848766326904, + "step": 3057, + "token_acc": 0.2867229576384336 + }, + { + "epoch": 1.792729404866608, + "grad_norm": 0.3710304145496616, + "learning_rate": 0.00013939988511319648, + "loss": 3.1280245780944824, + "step": 3058, + "token_acc": 0.2880393914660744 + }, + { + "epoch": 1.793315743183817, + "grad_norm": 0.4699728139457792, + "learning_rate": 0.00013939899831395195, + "loss": 3.1629183292388916, + "step": 3059, + "token_acc": 0.28276094480846903 + }, + { + "epoch": 1.793902081501026, + "grad_norm": 0.5094621372538382, + "learning_rate": 0.00013939811086279735, + "loss": 3.2229623794555664, + "step": 3060, + "token_acc": 0.2734696375450341 + }, + { + "epoch": 1.7944884198182351, + "grad_norm": 0.4539228788186506, + "learning_rate": 0.000139397222759741, + "loss": 3.1880416870117188, + "step": 3061, + "token_acc": 0.27749388967380656 + }, + { + "epoch": 1.7950747581354443, + "grad_norm": 0.5387703876310291, + "learning_rate": 0.00013939633400479126, + "loss": 3.174793243408203, + "step": 3062, + "token_acc": 0.2801326237255446 + }, + { + "epoch": 1.7956610964526531, + "grad_norm": 0.5736186198601464, + "learning_rate": 0.0001393954445979565, + "loss": 3.2441062927246094, + "step": 3063, + "token_acc": 0.26925142081931536 + }, + { + "epoch": 1.7962474347698623, + "grad_norm": 0.5551946202069394, + "learning_rate": 0.000139394554539245, + "loss": 3.2213916778564453, + "step": 3064, + "token_acc": 0.27582292265965314 + }, + { + "epoch": 1.7968337730870712, + "grad_norm": 0.49036338976591426, + "learning_rate": 0.00013939366382866519, + "loss": 3.1287078857421875, + "step": 3065, + "token_acc": 0.2872081311235235 + }, + { + "epoch": 1.7974201114042803, + "grad_norm": 0.46533985001211564, + "learning_rate": 0.00013939277246622543, + "loss": 3.19309139251709, + "step": 3066, + "token_acc": 0.27740108145223585 + }, + { + "epoch": 1.7980064497214894, + "grad_norm": 0.5381523928879589, + "learning_rate": 0.00013939188045193406, + "loss": 3.1795382499694824, + "step": 3067, + "token_acc": 0.2788989192359249 + }, + { + "epoch": 1.7985927880386985, + "grad_norm": 0.5659396468033879, + "learning_rate": 0.0001393909877857995, + "loss": 3.215360641479492, + "step": 3068, + "token_acc": 0.27581955342741665 + }, + { + "epoch": 1.7991791263559074, + "grad_norm": 0.39846348849663715, + "learning_rate": 0.00013939009446783013, + "loss": 3.2298998832702637, + "step": 3069, + "token_acc": 0.2732455091052682 + }, + { + "epoch": 1.7997654646731163, + "grad_norm": 0.6023182591545795, + "learning_rate": 0.00013938920049803432, + "loss": 3.1581497192382812, + "step": 3070, + "token_acc": 0.2816277031571653 + }, + { + "epoch": 1.8003518029903254, + "grad_norm": 0.5973667899463941, + "learning_rate": 0.00013938830587642044, + "loss": 3.2382943630218506, + "step": 3071, + "token_acc": 0.27295622356008137 + }, + { + "epoch": 1.8009381413075345, + "grad_norm": 0.40716133912826363, + "learning_rate": 0.00013938741060299693, + "loss": 3.1783199310302734, + "step": 3072, + "token_acc": 0.2781033493078453 + }, + { + "epoch": 1.8015244796247436, + "grad_norm": 0.5260938963136131, + "learning_rate": 0.00013938651467777224, + "loss": 3.1620545387268066, + "step": 3073, + "token_acc": 0.28243666097336956 + }, + { + "epoch": 1.8021108179419525, + "grad_norm": 0.49606053059243854, + "learning_rate": 0.00013938561810075472, + "loss": 3.1735994815826416, + "step": 3074, + "token_acc": 0.2802943378046917 + }, + { + "epoch": 1.8026971562591614, + "grad_norm": 0.4231313636004261, + "learning_rate": 0.00013938472087195283, + "loss": 3.1567726135253906, + "step": 3075, + "token_acc": 0.28127438886231465 + }, + { + "epoch": 1.8032834945763705, + "grad_norm": 0.5405233238987444, + "learning_rate": 0.00013938382299137495, + "loss": 3.158040761947632, + "step": 3076, + "token_acc": 0.2812863280584251 + }, + { + "epoch": 1.8038698328935796, + "grad_norm": 0.5304781267798307, + "learning_rate": 0.00013938292445902958, + "loss": 3.213930606842041, + "step": 3077, + "token_acc": 0.2746973863267896 + }, + { + "epoch": 1.8044561712107887, + "grad_norm": 0.44431819676251194, + "learning_rate": 0.00013938202527492513, + "loss": 3.1612045764923096, + "step": 3078, + "token_acc": 0.2809203090880267 + }, + { + "epoch": 1.8050425095279976, + "grad_norm": 0.4717217570765823, + "learning_rate": 0.00013938112543907005, + "loss": 3.175443172454834, + "step": 3079, + "token_acc": 0.2806741897838353 + }, + { + "epoch": 1.8056288478452067, + "grad_norm": 0.46229025408759195, + "learning_rate": 0.00013938022495147274, + "loss": 3.2002222537994385, + "step": 3080, + "token_acc": 0.27617259006430717 + }, + { + "epoch": 1.8062151861624156, + "grad_norm": 0.5083890099407025, + "learning_rate": 0.00013937932381214175, + "loss": 3.1847095489501953, + "step": 3081, + "token_acc": 0.2784684602430661 + }, + { + "epoch": 1.8068015244796247, + "grad_norm": 0.5533686622046305, + "learning_rate": 0.0001393784220210855, + "loss": 3.1746768951416016, + "step": 3082, + "token_acc": 0.27897653152164087 + }, + { + "epoch": 1.8073878627968338, + "grad_norm": 0.4142104940355026, + "learning_rate": 0.00013937751957831247, + "loss": 3.173844814300537, + "step": 3083, + "token_acc": 0.2810900214749703 + }, + { + "epoch": 1.807974201114043, + "grad_norm": 0.4978544453029577, + "learning_rate": 0.00013937661648383114, + "loss": 3.1158652305603027, + "step": 3084, + "token_acc": 0.2877712113934888 + }, + { + "epoch": 1.8085605394312518, + "grad_norm": 0.404520868960592, + "learning_rate": 0.00013937571273764995, + "loss": 3.187324047088623, + "step": 3085, + "token_acc": 0.2792948247078464 + }, + { + "epoch": 1.8091468777484607, + "grad_norm": 0.522672732730257, + "learning_rate": 0.00013937480833977744, + "loss": 3.159419298171997, + "step": 3086, + "token_acc": 0.2824156212272322 + }, + { + "epoch": 1.8097332160656698, + "grad_norm": 0.415376939666003, + "learning_rate": 0.00013937390329022206, + "loss": 3.1354565620422363, + "step": 3087, + "token_acc": 0.28479857895070737 + }, + { + "epoch": 1.810319554382879, + "grad_norm": 0.4642213383858837, + "learning_rate": 0.0001393729975889924, + "loss": 3.205690383911133, + "step": 3088, + "token_acc": 0.2772781288465066 + }, + { + "epoch": 1.810905892700088, + "grad_norm": 0.49813799232559103, + "learning_rate": 0.00013937209123609688, + "loss": 3.117408514022827, + "step": 3089, + "token_acc": 0.28825134026694493 + }, + { + "epoch": 1.811492231017297, + "grad_norm": 0.47138060972549517, + "learning_rate": 0.00013937118423154403, + "loss": 3.1561031341552734, + "step": 3090, + "token_acc": 0.2823397551123988 + }, + { + "epoch": 1.812078569334506, + "grad_norm": 0.44806060526564306, + "learning_rate": 0.0001393702765753424, + "loss": 3.219968795776367, + "step": 3091, + "token_acc": 0.27336022516504754 + }, + { + "epoch": 1.812664907651715, + "grad_norm": 0.35604252760771127, + "learning_rate": 0.00013936936826750048, + "loss": 3.159332036972046, + "step": 3092, + "token_acc": 0.2835389278910419 + }, + { + "epoch": 1.813251245968924, + "grad_norm": 0.43141545658571695, + "learning_rate": 0.00013936845930802685, + "loss": 3.113966464996338, + "step": 3093, + "token_acc": 0.2870039080453933 + }, + { + "epoch": 1.8138375842861332, + "grad_norm": 0.3702649756938618, + "learning_rate": 0.00013936754969693, + "loss": 3.1587395668029785, + "step": 3094, + "token_acc": 0.28325766620373 + }, + { + "epoch": 1.8144239226033423, + "grad_norm": 0.3448607896054455, + "learning_rate": 0.0001393666394342185, + "loss": 3.1702499389648438, + "step": 3095, + "token_acc": 0.2804072404243649 + }, + { + "epoch": 1.8150102609205512, + "grad_norm": 0.3900493020572125, + "learning_rate": 0.0001393657285199009, + "loss": 3.1721315383911133, + "step": 3096, + "token_acc": 0.28116943302154135 + }, + { + "epoch": 1.81559659923776, + "grad_norm": 0.3957939271157844, + "learning_rate": 0.00013936481695398572, + "loss": 3.139617919921875, + "step": 3097, + "token_acc": 0.2866223370152589 + }, + { + "epoch": 1.8161829375549692, + "grad_norm": 0.4577551980560736, + "learning_rate": 0.00013936390473648157, + "loss": 3.1384427547454834, + "step": 3098, + "token_acc": 0.28515393626683166 + }, + { + "epoch": 1.8167692758721783, + "grad_norm": 0.4177773953964753, + "learning_rate": 0.00013936299186739702, + "loss": 3.2024097442626953, + "step": 3099, + "token_acc": 0.2757726054828157 + }, + { + "epoch": 1.8173556141893874, + "grad_norm": 0.5427176212870789, + "learning_rate": 0.00013936207834674063, + "loss": 3.1971421241760254, + "step": 3100, + "token_acc": 0.2774537474175481 + }, + { + "epoch": 1.8179419525065963, + "grad_norm": 0.5671069807234658, + "learning_rate": 0.000139361164174521, + "loss": 3.2120771408081055, + "step": 3101, + "token_acc": 0.274823438187648 + }, + { + "epoch": 1.8185282908238052, + "grad_norm": 0.47576236019797347, + "learning_rate": 0.00013936024935074667, + "loss": 3.1772897243499756, + "step": 3102, + "token_acc": 0.2794671620745376 + }, + { + "epoch": 1.8191146291410143, + "grad_norm": 0.38272089477308774, + "learning_rate": 0.00013935933387542625, + "loss": 3.1722116470336914, + "step": 3103, + "token_acc": 0.28010020602613195 + }, + { + "epoch": 1.8197009674582234, + "grad_norm": 0.38335341776311815, + "learning_rate": 0.00013935841774856837, + "loss": 3.166057586669922, + "step": 3104, + "token_acc": 0.28029580340741556 + }, + { + "epoch": 1.8202873057754325, + "grad_norm": 0.3951982151991867, + "learning_rate": 0.0001393575009701816, + "loss": 3.131580352783203, + "step": 3105, + "token_acc": 0.2866240309235814 + }, + { + "epoch": 1.8208736440926414, + "grad_norm": 0.37854925932490785, + "learning_rate": 0.0001393565835402746, + "loss": 3.1695775985717773, + "step": 3106, + "token_acc": 0.2811941346726824 + }, + { + "epoch": 1.8214599824098505, + "grad_norm": 0.380870748421684, + "learning_rate": 0.00013935566545885593, + "loss": 3.1792449951171875, + "step": 3107, + "token_acc": 0.2798451775068595 + }, + { + "epoch": 1.8220463207270594, + "grad_norm": 0.37751737685559617, + "learning_rate": 0.00013935474672593424, + "loss": 3.1232857704162598, + "step": 3108, + "token_acc": 0.2874499431770081 + }, + { + "epoch": 1.8226326590442685, + "grad_norm": 0.4675124369873627, + "learning_rate": 0.00013935382734151818, + "loss": 3.152430534362793, + "step": 3109, + "token_acc": 0.2824377167182058 + }, + { + "epoch": 1.8232189973614776, + "grad_norm": 0.47256996763411707, + "learning_rate": 0.00013935290730561636, + "loss": 3.1945910453796387, + "step": 3110, + "token_acc": 0.2794085098959242 + }, + { + "epoch": 1.8238053356786867, + "grad_norm": 0.5349976757661753, + "learning_rate": 0.00013935198661823743, + "loss": 3.1444740295410156, + "step": 3111, + "token_acc": 0.28274354771317456 + }, + { + "epoch": 1.8243916739958956, + "grad_norm": 0.5885466420989289, + "learning_rate": 0.00013935106527939004, + "loss": 3.1792304515838623, + "step": 3112, + "token_acc": 0.2787217847628339 + }, + { + "epoch": 1.8249780123131045, + "grad_norm": 0.394028174009091, + "learning_rate": 0.00013935014328908283, + "loss": 3.175494432449341, + "step": 3113, + "token_acc": 0.28276836837478725 + }, + { + "epoch": 1.8255643506303136, + "grad_norm": 0.42584787829155984, + "learning_rate": 0.00013934922064732448, + "loss": 3.1496691703796387, + "step": 3114, + "token_acc": 0.28202434044452057 + }, + { + "epoch": 1.8261506889475227, + "grad_norm": 0.5159808963665635, + "learning_rate": 0.00013934829735412366, + "loss": 3.166433334350586, + "step": 3115, + "token_acc": 0.28145037902030823 + }, + { + "epoch": 1.8267370272647319, + "grad_norm": 0.5305346169379062, + "learning_rate": 0.00013934737340948905, + "loss": 3.1666316986083984, + "step": 3116, + "token_acc": 0.28240817254059414 + }, + { + "epoch": 1.8273233655819408, + "grad_norm": 0.44382065124134346, + "learning_rate": 0.00013934644881342928, + "loss": 3.1748905181884766, + "step": 3117, + "token_acc": 0.2804614817851157 + }, + { + "epoch": 1.8279097038991499, + "grad_norm": 0.3736691576577399, + "learning_rate": 0.00013934552356595307, + "loss": 3.132499933242798, + "step": 3118, + "token_acc": 0.2841951227187735 + }, + { + "epoch": 1.8284960422163588, + "grad_norm": 0.41432946873712084, + "learning_rate": 0.00013934459766706914, + "loss": 3.117541790008545, + "step": 3119, + "token_acc": 0.286692869174621 + }, + { + "epoch": 1.8290823805335679, + "grad_norm": 0.3962454139031622, + "learning_rate": 0.0001393436711167861, + "loss": 3.1921987533569336, + "step": 3120, + "token_acc": 0.27692729307339203 + }, + { + "epoch": 1.829668718850777, + "grad_norm": 0.49240817896377026, + "learning_rate": 0.00013934274391511276, + "loss": 3.1450676918029785, + "step": 3121, + "token_acc": 0.2848091244477988 + }, + { + "epoch": 1.830255057167986, + "grad_norm": 0.4981208730177976, + "learning_rate": 0.0001393418160620578, + "loss": 3.2035679817199707, + "step": 3122, + "token_acc": 0.2743361426813617 + }, + { + "epoch": 1.830841395485195, + "grad_norm": 0.4730297181081027, + "learning_rate": 0.00013934088755762988, + "loss": 3.1686644554138184, + "step": 3123, + "token_acc": 0.28061125882125565 + }, + { + "epoch": 1.8314277338024039, + "grad_norm": 0.49353159956789816, + "learning_rate": 0.00013933995840183778, + "loss": 3.096209764480591, + "step": 3124, + "token_acc": 0.28976383232791636 + }, + { + "epoch": 1.832014072119613, + "grad_norm": 0.5098706284039898, + "learning_rate": 0.0001393390285946902, + "loss": 3.1716156005859375, + "step": 3125, + "token_acc": 0.2803987204855593 + }, + { + "epoch": 1.832600410436822, + "grad_norm": 0.3983377277263639, + "learning_rate": 0.0001393380981361959, + "loss": 3.181307792663574, + "step": 3126, + "token_acc": 0.27931707624250246 + }, + { + "epoch": 1.8331867487540312, + "grad_norm": 0.4685289541200483, + "learning_rate": 0.00013933716702636354, + "loss": 3.1936874389648438, + "step": 3127, + "token_acc": 0.27790426311713406 + }, + { + "epoch": 1.83377308707124, + "grad_norm": 0.4850728489300537, + "learning_rate": 0.00013933623526520198, + "loss": 3.216696262359619, + "step": 3128, + "token_acc": 0.27244670394131887 + }, + { + "epoch": 1.834359425388449, + "grad_norm": 0.4442784435578466, + "learning_rate": 0.0001393353028527199, + "loss": 3.152035713195801, + "step": 3129, + "token_acc": 0.28279672085886315 + }, + { + "epoch": 1.834945763705658, + "grad_norm": 0.3398012479935769, + "learning_rate": 0.00013933436978892611, + "loss": 3.134960651397705, + "step": 3130, + "token_acc": 0.2829116580117453 + }, + { + "epoch": 1.8355321020228672, + "grad_norm": 0.4807958262620541, + "learning_rate": 0.00013933343607382934, + "loss": 3.1734981536865234, + "step": 3131, + "token_acc": 0.2793506831734474 + }, + { + "epoch": 1.8361184403400763, + "grad_norm": 0.479538214670079, + "learning_rate": 0.00013933250170743836, + "loss": 3.1691830158233643, + "step": 3132, + "token_acc": 0.28349071587397845 + }, + { + "epoch": 1.8367047786572852, + "grad_norm": 0.41577059028038776, + "learning_rate": 0.00013933156668976193, + "loss": 3.190570831298828, + "step": 3133, + "token_acc": 0.2777780676126878 + }, + { + "epoch": 1.8372911169744943, + "grad_norm": 0.42132792114377215, + "learning_rate": 0.00013933063102080888, + "loss": 3.1849000453948975, + "step": 3134, + "token_acc": 0.2796674071205853 + }, + { + "epoch": 1.8378774552917032, + "grad_norm": 0.4553283949667844, + "learning_rate": 0.00013932969470058796, + "loss": 3.182206153869629, + "step": 3135, + "token_acc": 0.2787902522103891 + }, + { + "epoch": 1.8384637936089123, + "grad_norm": 0.5656155033583683, + "learning_rate": 0.00013932875772910798, + "loss": 3.164655923843384, + "step": 3136, + "token_acc": 0.28017347693825534 + }, + { + "epoch": 1.8390501319261214, + "grad_norm": 0.5555682484932005, + "learning_rate": 0.00013932782010637776, + "loss": 3.1872076988220215, + "step": 3137, + "token_acc": 0.2785982675461254 + }, + { + "epoch": 1.8396364702433305, + "grad_norm": 0.41618370745012034, + "learning_rate": 0.0001393268818324061, + "loss": 3.156522274017334, + "step": 3138, + "token_acc": 0.2824550642700944 + }, + { + "epoch": 1.8402228085605394, + "grad_norm": 0.5558849173494799, + "learning_rate": 0.00013932594290720177, + "loss": 3.1615028381347656, + "step": 3139, + "token_acc": 0.2817987218868425 + }, + { + "epoch": 1.8408091468777483, + "grad_norm": 0.4390032545505644, + "learning_rate": 0.00013932500333077363, + "loss": 3.1430325508117676, + "step": 3140, + "token_acc": 0.2852734937745245 + }, + { + "epoch": 1.8413954851949574, + "grad_norm": 0.4319138272206243, + "learning_rate": 0.00013932406310313052, + "loss": 3.185291290283203, + "step": 3141, + "token_acc": 0.27892822863013883 + }, + { + "epoch": 1.8419818235121665, + "grad_norm": 0.4538933071360432, + "learning_rate": 0.00013932312222428127, + "loss": 3.1958789825439453, + "step": 3142, + "token_acc": 0.27818737141865674 + }, + { + "epoch": 1.8425681618293757, + "grad_norm": 0.3800487750062563, + "learning_rate": 0.00013932218069423466, + "loss": 3.1312551498413086, + "step": 3143, + "token_acc": 0.2856742920095911 + }, + { + "epoch": 1.8431545001465846, + "grad_norm": 0.43832071318606297, + "learning_rate": 0.00013932123851299958, + "loss": 3.1768927574157715, + "step": 3144, + "token_acc": 0.27985662361315905 + }, + { + "epoch": 1.8437408384637937, + "grad_norm": 0.41746990910120824, + "learning_rate": 0.0001393202956805849, + "loss": 3.174783229827881, + "step": 3145, + "token_acc": 0.27985528934086384 + }, + { + "epoch": 1.8443271767810026, + "grad_norm": 0.4379011159990005, + "learning_rate": 0.00013931935219699943, + "loss": 3.1576087474823, + "step": 3146, + "token_acc": 0.2816806261582582 + }, + { + "epoch": 1.8449135150982117, + "grad_norm": 0.4210637540099297, + "learning_rate": 0.00013931840806225208, + "loss": 3.1915342807769775, + "step": 3147, + "token_acc": 0.2771453010600491 + }, + { + "epoch": 1.8454998534154208, + "grad_norm": 0.49838357316306653, + "learning_rate": 0.00013931746327635166, + "loss": 3.147874355316162, + "step": 3148, + "token_acc": 0.28525543381202006 + }, + { + "epoch": 1.8460861917326299, + "grad_norm": 0.4679593221950554, + "learning_rate": 0.0001393165178393071, + "loss": 3.163898229598999, + "step": 3149, + "token_acc": 0.2820557675041997 + }, + { + "epoch": 1.8466725300498388, + "grad_norm": 0.47297095236215914, + "learning_rate": 0.00013931557175112728, + "loss": 3.1384949684143066, + "step": 3150, + "token_acc": 0.2856092373299035 + }, + { + "epoch": 1.8472588683670477, + "grad_norm": 0.5010868354231831, + "learning_rate": 0.00013931462501182103, + "loss": 3.1679749488830566, + "step": 3151, + "token_acc": 0.2815936718739666 + }, + { + "epoch": 1.8478452066842568, + "grad_norm": 0.4474457647663326, + "learning_rate": 0.0001393136776213973, + "loss": 3.1668801307678223, + "step": 3152, + "token_acc": 0.2814158541305466 + }, + { + "epoch": 1.848431545001466, + "grad_norm": 0.4426601992627646, + "learning_rate": 0.00013931272957986497, + "loss": 3.1903252601623535, + "step": 3153, + "token_acc": 0.27782354228054873 + }, + { + "epoch": 1.849017883318675, + "grad_norm": 0.4661470586098385, + "learning_rate": 0.00013931178088723292, + "loss": 3.1708498001098633, + "step": 3154, + "token_acc": 0.27983128653205636 + }, + { + "epoch": 1.849604221635884, + "grad_norm": 0.3996256301843792, + "learning_rate": 0.00013931083154351012, + "loss": 3.1814990043640137, + "step": 3155, + "token_acc": 0.278026604327858 + }, + { + "epoch": 1.8501905599530928, + "grad_norm": 0.37533383920605207, + "learning_rate": 0.00013930988154870543, + "loss": 3.16428804397583, + "step": 3156, + "token_acc": 0.27941226065368363 + }, + { + "epoch": 1.850776898270302, + "grad_norm": 0.4246557418440358, + "learning_rate": 0.00013930893090282782, + "loss": 3.176447868347168, + "step": 3157, + "token_acc": 0.28066625680614893 + }, + { + "epoch": 1.851363236587511, + "grad_norm": 0.408151642288997, + "learning_rate": 0.00013930797960588618, + "loss": 3.1404223442077637, + "step": 3158, + "token_acc": 0.28213637167146094 + }, + { + "epoch": 1.8519495749047201, + "grad_norm": 0.4028073887243625, + "learning_rate": 0.0001393070276578895, + "loss": 3.1863162517547607, + "step": 3159, + "token_acc": 0.2778931168401917 + }, + { + "epoch": 1.852535913221929, + "grad_norm": 0.3869435824996025, + "learning_rate": 0.00013930607505884665, + "loss": 3.1493799686431885, + "step": 3160, + "token_acc": 0.28486303708517363 + }, + { + "epoch": 1.8531222515391381, + "grad_norm": 0.3994008424027253, + "learning_rate": 0.00013930512180876663, + "loss": 3.195265293121338, + "step": 3161, + "token_acc": 0.27772275372483096 + }, + { + "epoch": 1.853708589856347, + "grad_norm": 0.438679374910476, + "learning_rate": 0.00013930416790765838, + "loss": 3.1393446922302246, + "step": 3162, + "token_acc": 0.28480518238245045 + }, + { + "epoch": 1.8542949281735561, + "grad_norm": 0.35642283953295867, + "learning_rate": 0.00013930321335553085, + "loss": 3.1942851543426514, + "step": 3163, + "token_acc": 0.2790757272729675 + }, + { + "epoch": 1.8548812664907652, + "grad_norm": 0.36933449704949317, + "learning_rate": 0.00013930225815239305, + "loss": 3.217050075531006, + "step": 3164, + "token_acc": 0.2745982572471888 + }, + { + "epoch": 1.8554676048079743, + "grad_norm": 0.38915299533280945, + "learning_rate": 0.0001393013022982539, + "loss": 3.1804919242858887, + "step": 3165, + "token_acc": 0.278897950119047 + }, + { + "epoch": 1.8560539431251832, + "grad_norm": 0.4343528363117653, + "learning_rate": 0.0001393003457931224, + "loss": 3.136134624481201, + "step": 3166, + "token_acc": 0.2850956709681521 + }, + { + "epoch": 1.8566402814423921, + "grad_norm": 0.5016250329219616, + "learning_rate": 0.00013929938863700754, + "loss": 3.1551413536071777, + "step": 3167, + "token_acc": 0.2822694998556 + }, + { + "epoch": 1.8572266197596012, + "grad_norm": 0.4248031394944281, + "learning_rate": 0.00013929843082991828, + "loss": 3.1437995433807373, + "step": 3168, + "token_acc": 0.2851472519429076 + }, + { + "epoch": 1.8578129580768104, + "grad_norm": 0.4040668038723804, + "learning_rate": 0.00013929747237186366, + "loss": 3.1509008407592773, + "step": 3169, + "token_acc": 0.2834553724402382 + }, + { + "epoch": 1.8583992963940195, + "grad_norm": 0.31637779847916714, + "learning_rate": 0.00013929651326285267, + "loss": 3.1650071144104004, + "step": 3170, + "token_acc": 0.2813639007625848 + }, + { + "epoch": 1.8589856347112284, + "grad_norm": 0.44539377667623115, + "learning_rate": 0.00013929555350289432, + "loss": 3.1280226707458496, + "step": 3171, + "token_acc": 0.28717283980252367 + }, + { + "epoch": 1.8595719730284375, + "grad_norm": 0.4582479016867659, + "learning_rate": 0.00013929459309199762, + "loss": 3.1732263565063477, + "step": 3172, + "token_acc": 0.2801640531269985 + }, + { + "epoch": 1.8601583113456464, + "grad_norm": 0.45264600986543596, + "learning_rate": 0.0001392936320301716, + "loss": 3.16690993309021, + "step": 3173, + "token_acc": 0.2807256500217292 + }, + { + "epoch": 1.8607446496628555, + "grad_norm": 0.46004897631861874, + "learning_rate": 0.00013929267031742527, + "loss": 3.169872760772705, + "step": 3174, + "token_acc": 0.28023176388987786 + }, + { + "epoch": 1.8613309879800646, + "grad_norm": 0.38790402064626045, + "learning_rate": 0.00013929170795376768, + "loss": 3.1654186248779297, + "step": 3175, + "token_acc": 0.280828693661595 + }, + { + "epoch": 1.8619173262972737, + "grad_norm": 0.47520985549150313, + "learning_rate": 0.00013929074493920787, + "loss": 3.1156277656555176, + "step": 3176, + "token_acc": 0.2891200693541396 + }, + { + "epoch": 1.8625036646144826, + "grad_norm": 0.5006846008239417, + "learning_rate": 0.00013928978127375488, + "loss": 3.197873592376709, + "step": 3177, + "token_acc": 0.2771237267979939 + }, + { + "epoch": 1.8630900029316915, + "grad_norm": 0.43510501661829165, + "learning_rate": 0.00013928881695741773, + "loss": 3.1668832302093506, + "step": 3178, + "token_acc": 0.2804988527522332 + }, + { + "epoch": 1.8636763412489006, + "grad_norm": 0.42840004165706247, + "learning_rate": 0.00013928785199020556, + "loss": 3.2006068229675293, + "step": 3179, + "token_acc": 0.2771065186972507 + }, + { + "epoch": 1.8642626795661097, + "grad_norm": 0.5227774874469465, + "learning_rate": 0.00013928688637212736, + "loss": 3.147080659866333, + "step": 3180, + "token_acc": 0.2832838859282804 + }, + { + "epoch": 1.8648490178833188, + "grad_norm": 0.6243791859283473, + "learning_rate": 0.00013928592010319224, + "loss": 3.1829938888549805, + "step": 3181, + "token_acc": 0.2796941193741873 + }, + { + "epoch": 1.8654353562005277, + "grad_norm": 0.5694853019269515, + "learning_rate": 0.00013928495318340925, + "loss": 3.188652276992798, + "step": 3182, + "token_acc": 0.2783027453359624 + }, + { + "epoch": 1.8660216945177366, + "grad_norm": 0.47143333055759234, + "learning_rate": 0.0001392839856127875, + "loss": 3.1679487228393555, + "step": 3183, + "token_acc": 0.27880433957201656 + }, + { + "epoch": 1.8666080328349457, + "grad_norm": 0.4723448652573872, + "learning_rate": 0.0001392830173913361, + "loss": 3.146944999694824, + "step": 3184, + "token_acc": 0.28359464466812195 + }, + { + "epoch": 1.8671943711521548, + "grad_norm": 0.4800870738967834, + "learning_rate": 0.00013928204851906408, + "loss": 3.157043933868408, + "step": 3185, + "token_acc": 0.28231543517269936 + }, + { + "epoch": 1.867780709469364, + "grad_norm": 0.5061764019688572, + "learning_rate": 0.00013928107899598054, + "loss": 3.1980795860290527, + "step": 3186, + "token_acc": 0.27643952278045597 + }, + { + "epoch": 1.8683670477865728, + "grad_norm": 0.5514986383159833, + "learning_rate": 0.00013928010882209468, + "loss": 3.191307783126831, + "step": 3187, + "token_acc": 0.2783021634842221 + }, + { + "epoch": 1.868953386103782, + "grad_norm": 0.43693590323965487, + "learning_rate": 0.00013927913799741552, + "loss": 3.135305881500244, + "step": 3188, + "token_acc": 0.28483880611104995 + }, + { + "epoch": 1.8695397244209908, + "grad_norm": 0.4443579387867809, + "learning_rate": 0.0001392781665219522, + "loss": 3.1411781311035156, + "step": 3189, + "token_acc": 0.28354050072090575 + }, + { + "epoch": 1.8701260627382, + "grad_norm": 0.42033430316866605, + "learning_rate": 0.00013927719439571387, + "loss": 3.1558656692504883, + "step": 3190, + "token_acc": 0.2805567791384019 + }, + { + "epoch": 1.870712401055409, + "grad_norm": 0.4561818756032974, + "learning_rate": 0.00013927622161870966, + "loss": 3.164008140563965, + "step": 3191, + "token_acc": 0.2834006810683134 + }, + { + "epoch": 1.8712987393726181, + "grad_norm": 0.40538453492410115, + "learning_rate": 0.0001392752481909487, + "loss": 3.1403615474700928, + "step": 3192, + "token_acc": 0.2836600090423127 + }, + { + "epoch": 1.871885077689827, + "grad_norm": 0.37927015827380234, + "learning_rate": 0.00013927427411244013, + "loss": 3.1708359718322754, + "step": 3193, + "token_acc": 0.27973665836450984 + }, + { + "epoch": 1.872471416007036, + "grad_norm": 0.396055522820391, + "learning_rate": 0.0001392732993831931, + "loss": 3.149012565612793, + "step": 3194, + "token_acc": 0.2835968616057532 + }, + { + "epoch": 1.873057754324245, + "grad_norm": 0.3699032754830193, + "learning_rate": 0.00013927232400321677, + "loss": 3.138062000274658, + "step": 3195, + "token_acc": 0.28395361655551465 + }, + { + "epoch": 1.8736440926414542, + "grad_norm": 0.4216996926870833, + "learning_rate": 0.0001392713479725203, + "loss": 3.140707015991211, + "step": 3196, + "token_acc": 0.2831224327204627 + }, + { + "epoch": 1.8742304309586633, + "grad_norm": 0.47806893854204235, + "learning_rate": 0.00013927037129111289, + "loss": 3.167876720428467, + "step": 3197, + "token_acc": 0.28166361777682125 + }, + { + "epoch": 1.8748167692758722, + "grad_norm": 0.40773732353368186, + "learning_rate": 0.00013926939395900363, + "loss": 3.168194532394409, + "step": 3198, + "token_acc": 0.2792231536664702 + }, + { + "epoch": 1.875403107593081, + "grad_norm": 0.49929190999179485, + "learning_rate": 0.0001392684159762018, + "loss": 3.137099266052246, + "step": 3199, + "token_acc": 0.28466421209601855 + }, + { + "epoch": 1.8759894459102902, + "grad_norm": 0.3939360740493163, + "learning_rate": 0.0001392674373427165, + "loss": 3.1563568115234375, + "step": 3200, + "token_acc": 0.2837894725967784 + }, + { + "epoch": 1.8765757842274993, + "grad_norm": 0.44235563129322336, + "learning_rate": 0.000139266458058557, + "loss": 3.1277499198913574, + "step": 3201, + "token_acc": 0.28550824086901355 + }, + { + "epoch": 1.8771621225447084, + "grad_norm": 0.3699179531293641, + "learning_rate": 0.00013926547812373246, + "loss": 3.0988478660583496, + "step": 3202, + "token_acc": 0.28933124035833174 + }, + { + "epoch": 1.8777484608619175, + "grad_norm": 0.38433421388597616, + "learning_rate": 0.00013926449753825208, + "loss": 3.174110174179077, + "step": 3203, + "token_acc": 0.27928009149545613 + }, + { + "epoch": 1.8783347991791264, + "grad_norm": 0.35849157503337414, + "learning_rate": 0.0001392635163021251, + "loss": 3.1657798290252686, + "step": 3204, + "token_acc": 0.28283916928462777 + }, + { + "epoch": 1.8789211374963353, + "grad_norm": 0.45925139587220076, + "learning_rate": 0.0001392625344153607, + "loss": 3.170564651489258, + "step": 3205, + "token_acc": 0.2785159679870972 + }, + { + "epoch": 1.8795074758135444, + "grad_norm": 0.3627784208088287, + "learning_rate": 0.0001392615518779681, + "loss": 3.1906752586364746, + "step": 3206, + "token_acc": 0.2794275919799099 + }, + { + "epoch": 1.8800938141307535, + "grad_norm": 0.38103563519565514, + "learning_rate": 0.00013926056868995658, + "loss": 3.133211135864258, + "step": 3207, + "token_acc": 0.2842498812896897 + }, + { + "epoch": 1.8806801524479626, + "grad_norm": 0.36515337250559926, + "learning_rate": 0.00013925958485133536, + "loss": 3.157428503036499, + "step": 3208, + "token_acc": 0.2828138787344006 + }, + { + "epoch": 1.8812664907651715, + "grad_norm": 0.3807589227799556, + "learning_rate": 0.00013925860036211366, + "loss": 3.154573917388916, + "step": 3209, + "token_acc": 0.28180255499719653 + }, + { + "epoch": 1.8818528290823804, + "grad_norm": 0.4306466813557704, + "learning_rate": 0.0001392576152223007, + "loss": 3.1789722442626953, + "step": 3210, + "token_acc": 0.2789900445842593 + }, + { + "epoch": 1.8824391673995895, + "grad_norm": 0.39154732834637207, + "learning_rate": 0.0001392566294319058, + "loss": 3.1569294929504395, + "step": 3211, + "token_acc": 0.28095053160368794 + }, + { + "epoch": 1.8830255057167986, + "grad_norm": 0.421677878031165, + "learning_rate": 0.0001392556429909382, + "loss": 3.17936372756958, + "step": 3212, + "token_acc": 0.278695482908155 + }, + { + "epoch": 1.8836118440340077, + "grad_norm": 0.4372732286099418, + "learning_rate": 0.00013925465589940714, + "loss": 3.176467180252075, + "step": 3213, + "token_acc": 0.27942310618182964 + }, + { + "epoch": 1.8841981823512166, + "grad_norm": 0.4140991247351585, + "learning_rate": 0.00013925366815732194, + "loss": 3.1532692909240723, + "step": 3214, + "token_acc": 0.28295799433803465 + }, + { + "epoch": 1.8847845206684257, + "grad_norm": 0.4508223053126904, + "learning_rate": 0.0001392526797646918, + "loss": 3.2087998390197754, + "step": 3215, + "token_acc": 0.2774270002036773 + }, + { + "epoch": 1.8853708589856346, + "grad_norm": 0.5012089860045833, + "learning_rate": 0.00013925169072152608, + "loss": 3.1582112312316895, + "step": 3216, + "token_acc": 0.27963699829031996 + }, + { + "epoch": 1.8859571973028437, + "grad_norm": 0.46565720167085073, + "learning_rate": 0.00013925070102783406, + "loss": 3.1830849647521973, + "step": 3217, + "token_acc": 0.2777029480618817 + }, + { + "epoch": 1.8865435356200528, + "grad_norm": 0.4211693954561409, + "learning_rate": 0.000139249710683625, + "loss": 3.1555519104003906, + "step": 3218, + "token_acc": 0.28383141480056123 + }, + { + "epoch": 1.887129873937262, + "grad_norm": 0.436791452156444, + "learning_rate": 0.0001392487196889082, + "loss": 3.1979591846466064, + "step": 3219, + "token_acc": 0.27799840939333026 + }, + { + "epoch": 1.8877162122544708, + "grad_norm": 0.47964134596519054, + "learning_rate": 0.00013924772804369302, + "loss": 3.169191598892212, + "step": 3220, + "token_acc": 0.2798651810937302 + }, + { + "epoch": 1.8883025505716797, + "grad_norm": 0.5242219429844494, + "learning_rate": 0.00013924673574798875, + "loss": 3.178762912750244, + "step": 3221, + "token_acc": 0.28010421790048784 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.5259833030520964, + "learning_rate": 0.0001392457428018047, + "loss": 3.1936635971069336, + "step": 3222, + "token_acc": 0.277296928048222 + }, + { + "epoch": 1.889475227206098, + "grad_norm": 0.5260827468692263, + "learning_rate": 0.00013924474920515021, + "loss": 3.190840244293213, + "step": 3223, + "token_acc": 0.2771848775567145 + }, + { + "epoch": 1.890061565523307, + "grad_norm": 0.47998355469632054, + "learning_rate": 0.00013924375495803464, + "loss": 3.1654210090637207, + "step": 3224, + "token_acc": 0.2803497821683053 + }, + { + "epoch": 1.890647903840516, + "grad_norm": 0.42930586885380156, + "learning_rate": 0.00013924276006046726, + "loss": 3.163123607635498, + "step": 3225, + "token_acc": 0.28264162684062616 + }, + { + "epoch": 1.8912342421577248, + "grad_norm": 0.5900401610410939, + "learning_rate": 0.00013924176451245745, + "loss": 3.209183692932129, + "step": 3226, + "token_acc": 0.27388793164972525 + }, + { + "epoch": 1.891820580474934, + "grad_norm": 0.3710060984014983, + "learning_rate": 0.0001392407683140146, + "loss": 3.2026174068450928, + "step": 3227, + "token_acc": 0.2759523853002789 + }, + { + "epoch": 1.892406918792143, + "grad_norm": 0.45531589579279325, + "learning_rate": 0.00013923977146514802, + "loss": 3.1405768394470215, + "step": 3228, + "token_acc": 0.28363631721391 + }, + { + "epoch": 1.8929932571093522, + "grad_norm": 0.3929252099195596, + "learning_rate": 0.00013923877396586706, + "loss": 3.132526397705078, + "step": 3229, + "token_acc": 0.28557895270549744 + }, + { + "epoch": 1.8935795954265613, + "grad_norm": 0.3819646646849763, + "learning_rate": 0.00013923777581618114, + "loss": 3.0966849327087402, + "step": 3230, + "token_acc": 0.28866570635936833 + }, + { + "epoch": 1.8941659337437702, + "grad_norm": 0.41937028930101194, + "learning_rate": 0.00013923677701609962, + "loss": 3.1625704765319824, + "step": 3231, + "token_acc": 0.2815315972120965 + }, + { + "epoch": 1.894752272060979, + "grad_norm": 0.48558169730096007, + "learning_rate": 0.00013923577756563187, + "loss": 3.2017264366149902, + "step": 3232, + "token_acc": 0.27718883013205114 + }, + { + "epoch": 1.8953386103781882, + "grad_norm": 0.4044825468728186, + "learning_rate": 0.0001392347774647873, + "loss": 3.195657730102539, + "step": 3233, + "token_acc": 0.27687345480231856 + }, + { + "epoch": 1.8959249486953973, + "grad_norm": 0.3636799516545554, + "learning_rate": 0.00013923377671357527, + "loss": 3.1659018993377686, + "step": 3234, + "token_acc": 0.2814515510589405 + }, + { + "epoch": 1.8965112870126064, + "grad_norm": 0.43506610084262193, + "learning_rate": 0.00013923277531200525, + "loss": 3.1271121501922607, + "step": 3235, + "token_acc": 0.287426762744966 + }, + { + "epoch": 1.8970976253298153, + "grad_norm": 0.39835074128630904, + "learning_rate": 0.00013923177326008655, + "loss": 3.164862632751465, + "step": 3236, + "token_acc": 0.27948463059870615 + }, + { + "epoch": 1.8976839636470242, + "grad_norm": 0.398647678967304, + "learning_rate": 0.00013923077055782862, + "loss": 3.1710309982299805, + "step": 3237, + "token_acc": 0.27940895912209096 + }, + { + "epoch": 1.8982703019642333, + "grad_norm": 0.43493695556800605, + "learning_rate": 0.00013922976720524092, + "loss": 3.1558518409729004, + "step": 3238, + "token_acc": 0.2812582835195153 + }, + { + "epoch": 1.8988566402814424, + "grad_norm": 0.47465953459841326, + "learning_rate": 0.00013922876320233285, + "loss": 3.1219582557678223, + "step": 3239, + "token_acc": 0.28650379629151385 + }, + { + "epoch": 1.8994429785986515, + "grad_norm": 0.39889173750064055, + "learning_rate": 0.00013922775854911384, + "loss": 3.159717559814453, + "step": 3240, + "token_acc": 0.28235672306922555 + }, + { + "epoch": 1.9000293169158604, + "grad_norm": 0.3882967828517272, + "learning_rate": 0.00013922675324559328, + "loss": 3.1364872455596924, + "step": 3241, + "token_acc": 0.28430565659032864 + }, + { + "epoch": 1.9006156552330695, + "grad_norm": 0.386053519364716, + "learning_rate": 0.0001392257472917807, + "loss": 3.1468558311462402, + "step": 3242, + "token_acc": 0.28261187384537684 + }, + { + "epoch": 1.9012019935502784, + "grad_norm": 0.3535354835029746, + "learning_rate": 0.0001392247406876855, + "loss": 3.1676108837127686, + "step": 3243, + "token_acc": 0.28007552837906935 + }, + { + "epoch": 1.9017883318674875, + "grad_norm": 0.3647392294318364, + "learning_rate": 0.00013922373343331715, + "loss": 3.1686136722564697, + "step": 3244, + "token_acc": 0.2785155376704673 + }, + { + "epoch": 1.9023746701846966, + "grad_norm": 0.3295374863320513, + "learning_rate": 0.00013922272552868508, + "loss": 3.190880298614502, + "step": 3245, + "token_acc": 0.2781064935468662 + }, + { + "epoch": 1.9029610085019057, + "grad_norm": 0.3716464974255558, + "learning_rate": 0.0001392217169737988, + "loss": 3.132190465927124, + "step": 3246, + "token_acc": 0.28722933698293673 + }, + { + "epoch": 1.9035473468191146, + "grad_norm": 0.3723810947186311, + "learning_rate": 0.00013922070776866774, + "loss": 3.123586654663086, + "step": 3247, + "token_acc": 0.28549912584545756 + }, + { + "epoch": 1.9041336851363235, + "grad_norm": 0.3314649450858669, + "learning_rate": 0.00013921969791330145, + "loss": 3.1549324989318848, + "step": 3248, + "token_acc": 0.2806800601690515 + }, + { + "epoch": 1.9047200234535326, + "grad_norm": 0.35860992804761826, + "learning_rate": 0.00013921868740770935, + "loss": 3.1849069595336914, + "step": 3249, + "token_acc": 0.27700294177448453 + }, + { + "epoch": 1.9053063617707418, + "grad_norm": 0.4300130726346776, + "learning_rate": 0.00013921767625190096, + "loss": 3.1899471282958984, + "step": 3250, + "token_acc": 0.27732966468521814 + }, + { + "epoch": 1.9058927000879509, + "grad_norm": 0.4630545263141743, + "learning_rate": 0.00013921666444588577, + "loss": 3.1187996864318848, + "step": 3251, + "token_acc": 0.2866724923207287 + }, + { + "epoch": 1.9064790384051598, + "grad_norm": 0.40185097618652516, + "learning_rate": 0.00013921565198967328, + "loss": 3.203143835067749, + "step": 3252, + "token_acc": 0.27640630589064746 + }, + { + "epoch": 1.9070653767223686, + "grad_norm": 0.5057966622113312, + "learning_rate": 0.00013921463888327303, + "loss": 3.180072069168091, + "step": 3253, + "token_acc": 0.2780814131432819 + }, + { + "epoch": 1.9076517150395778, + "grad_norm": 0.6114914422264391, + "learning_rate": 0.00013921362512669448, + "loss": 3.1662051677703857, + "step": 3254, + "token_acc": 0.2801447278453474 + }, + { + "epoch": 1.9082380533567869, + "grad_norm": 0.4777811847193061, + "learning_rate": 0.0001392126107199472, + "loss": 3.1516036987304688, + "step": 3255, + "token_acc": 0.2816334235197886 + }, + { + "epoch": 1.908824391673996, + "grad_norm": 0.39579552402232804, + "learning_rate": 0.00013921159566304074, + "loss": 3.1827802658081055, + "step": 3256, + "token_acc": 0.27978707658476765 + }, + { + "epoch": 1.909410729991205, + "grad_norm": 0.41516264424378074, + "learning_rate": 0.00013921057995598457, + "loss": 3.157862424850464, + "step": 3257, + "token_acc": 0.2819306851676474 + }, + { + "epoch": 1.909997068308414, + "grad_norm": 0.42452023110846204, + "learning_rate": 0.00013920956359878827, + "loss": 3.1134355068206787, + "step": 3258, + "token_acc": 0.2886879107671642 + }, + { + "epoch": 1.9105834066256229, + "grad_norm": 0.43351651653653855, + "learning_rate": 0.00013920854659146137, + "loss": 3.146402359008789, + "step": 3259, + "token_acc": 0.2827022319617334 + }, + { + "epoch": 1.911169744942832, + "grad_norm": 0.5159748991774191, + "learning_rate": 0.00013920752893401347, + "loss": 3.1085493564605713, + "step": 3260, + "token_acc": 0.28829555092812637 + }, + { + "epoch": 1.911756083260041, + "grad_norm": 0.47485556005404345, + "learning_rate": 0.00013920651062645408, + "loss": 3.1511948108673096, + "step": 3261, + "token_acc": 0.2816732024717888 + }, + { + "epoch": 1.9123424215772502, + "grad_norm": 0.5364071028164041, + "learning_rate": 0.00013920549166879278, + "loss": 3.191603660583496, + "step": 3262, + "token_acc": 0.27772889615460705 + }, + { + "epoch": 1.912928759894459, + "grad_norm": 0.47463936261131123, + "learning_rate": 0.0001392044720610391, + "loss": 3.135042190551758, + "step": 3263, + "token_acc": 0.28402370029558494 + }, + { + "epoch": 1.913515098211668, + "grad_norm": 0.49892655898849975, + "learning_rate": 0.00013920345180320272, + "loss": 3.132401466369629, + "step": 3264, + "token_acc": 0.2864922204616688 + }, + { + "epoch": 1.914101436528877, + "grad_norm": 0.48148554776460956, + "learning_rate": 0.00013920243089529313, + "loss": 3.1199803352355957, + "step": 3265, + "token_acc": 0.28821712727451937 + }, + { + "epoch": 1.9146877748460862, + "grad_norm": 0.48642291445733177, + "learning_rate": 0.00013920140933731996, + "loss": 3.1713573932647705, + "step": 3266, + "token_acc": 0.27960152593781595 + }, + { + "epoch": 1.9152741131632953, + "grad_norm": 0.424112545953526, + "learning_rate": 0.0001392003871292928, + "loss": 3.1691460609436035, + "step": 3267, + "token_acc": 0.2814102202562718 + }, + { + "epoch": 1.9158604514805042, + "grad_norm": 0.5505198714378431, + "learning_rate": 0.00013919936427122127, + "loss": 3.163496971130371, + "step": 3268, + "token_acc": 0.2814821116209333 + }, + { + "epoch": 1.9164467897977133, + "grad_norm": 0.5247317034728544, + "learning_rate": 0.00013919834076311493, + "loss": 3.1559205055236816, + "step": 3269, + "token_acc": 0.28346764437661687 + }, + { + "epoch": 1.9170331281149222, + "grad_norm": 0.40305588343407733, + "learning_rate": 0.00013919731660498342, + "loss": 3.1450202465057373, + "step": 3270, + "token_acc": 0.2829689843960355 + }, + { + "epoch": 1.9176194664321313, + "grad_norm": 0.43542750350141607, + "learning_rate": 0.00013919629179683638, + "loss": 3.1738524436950684, + "step": 3271, + "token_acc": 0.2793582447791656 + }, + { + "epoch": 1.9182058047493404, + "grad_norm": 0.41845032690860806, + "learning_rate": 0.00013919526633868342, + "loss": 3.1482574939727783, + "step": 3272, + "token_acc": 0.2841056976561606 + }, + { + "epoch": 1.9187921430665495, + "grad_norm": 0.42860755232729386, + "learning_rate": 0.00013919424023053418, + "loss": 3.126512050628662, + "step": 3273, + "token_acc": 0.28690584313278983 + }, + { + "epoch": 1.9193784813837584, + "grad_norm": 0.4244661969338137, + "learning_rate": 0.00013919321347239828, + "loss": 3.119539499282837, + "step": 3274, + "token_acc": 0.28811144996880667 + }, + { + "epoch": 1.9199648197009673, + "grad_norm": 0.5076369354379808, + "learning_rate": 0.0001391921860642854, + "loss": 3.123659610748291, + "step": 3275, + "token_acc": 0.2858278598975952 + }, + { + "epoch": 1.9205511580181764, + "grad_norm": 0.41525805077185407, + "learning_rate": 0.00013919115800620517, + "loss": 3.1809468269348145, + "step": 3276, + "token_acc": 0.28006693214688755 + }, + { + "epoch": 1.9211374963353856, + "grad_norm": 0.4149317804297892, + "learning_rate": 0.00013919012929816723, + "loss": 3.1191177368164062, + "step": 3277, + "token_acc": 0.28689471859470705 + }, + { + "epoch": 1.9217238346525947, + "grad_norm": 0.5536141576094403, + "learning_rate": 0.00013918909994018125, + "loss": 3.1598801612854004, + "step": 3278, + "token_acc": 0.2796153381155799 + }, + { + "epoch": 1.9223101729698036, + "grad_norm": 0.4419464666248388, + "learning_rate": 0.00013918806993225695, + "loss": 3.167031764984131, + "step": 3279, + "token_acc": 0.2798491915896873 + }, + { + "epoch": 1.9228965112870124, + "grad_norm": 0.5302034122116437, + "learning_rate": 0.0001391870392744039, + "loss": 3.1663784980773926, + "step": 3280, + "token_acc": 0.2823344383973931 + }, + { + "epoch": 1.9234828496042216, + "grad_norm": 0.48053295401526613, + "learning_rate": 0.0001391860079666319, + "loss": 3.1544718742370605, + "step": 3281, + "token_acc": 0.2801588885059765 + }, + { + "epoch": 1.9240691879214307, + "grad_norm": 0.4241278880950176, + "learning_rate": 0.0001391849760089506, + "loss": 3.1778786182403564, + "step": 3282, + "token_acc": 0.2807347838950001 + }, + { + "epoch": 1.9246555262386398, + "grad_norm": 0.33828674699365086, + "learning_rate": 0.00013918394340136964, + "loss": 3.138490676879883, + "step": 3283, + "token_acc": 0.2830146094326697 + }, + { + "epoch": 1.9252418645558487, + "grad_norm": 0.39755368308773864, + "learning_rate": 0.00013918291014389876, + "loss": 3.175848960876465, + "step": 3284, + "token_acc": 0.2785844516999787 + }, + { + "epoch": 1.9258282028730578, + "grad_norm": 0.38604145871464735, + "learning_rate": 0.00013918187623654767, + "loss": 3.171117067337036, + "step": 3285, + "token_acc": 0.2802097013457254 + }, + { + "epoch": 1.9264145411902667, + "grad_norm": 0.4116837400507202, + "learning_rate": 0.0001391808416793261, + "loss": 3.1548666954040527, + "step": 3286, + "token_acc": 0.28296070929997125 + }, + { + "epoch": 1.9270008795074758, + "grad_norm": 0.47925215408818306, + "learning_rate": 0.00013917980647224369, + "loss": 3.126538038253784, + "step": 3287, + "token_acc": 0.2863094284941763 + }, + { + "epoch": 1.927587217824685, + "grad_norm": 0.40561069819793283, + "learning_rate": 0.00013917877061531025, + "loss": 3.130110502243042, + "step": 3288, + "token_acc": 0.28508181734137295 + }, + { + "epoch": 1.928173556141894, + "grad_norm": 0.40045760768739375, + "learning_rate": 0.0001391777341085355, + "loss": 3.0992798805236816, + "step": 3289, + "token_acc": 0.2909193402388192 + }, + { + "epoch": 1.928759894459103, + "grad_norm": 0.46216424801239697, + "learning_rate": 0.00013917669695192914, + "loss": 3.197916030883789, + "step": 3290, + "token_acc": 0.27465419039869815 + }, + { + "epoch": 1.9293462327763118, + "grad_norm": 0.49159086503092136, + "learning_rate": 0.0001391756591455009, + "loss": 3.1789021492004395, + "step": 3291, + "token_acc": 0.27898984248338593 + }, + { + "epoch": 1.929932571093521, + "grad_norm": 0.416132083820143, + "learning_rate": 0.0001391746206892606, + "loss": 3.1742429733276367, + "step": 3292, + "token_acc": 0.2802672264094932 + }, + { + "epoch": 1.93051890941073, + "grad_norm": 0.40880889684883, + "learning_rate": 0.00013917358158321795, + "loss": 3.1671812534332275, + "step": 3293, + "token_acc": 0.28001754540839974 + }, + { + "epoch": 1.9311052477279391, + "grad_norm": 0.42395819801369944, + "learning_rate": 0.0001391725418273827, + "loss": 3.1937198638916016, + "step": 3294, + "token_acc": 0.2752581537832047 + }, + { + "epoch": 1.931691586045148, + "grad_norm": 0.4761573407644075, + "learning_rate": 0.00013917150142176462, + "loss": 3.1711363792419434, + "step": 3295, + "token_acc": 0.2802139722604824 + }, + { + "epoch": 1.9322779243623571, + "grad_norm": 0.5038722462137357, + "learning_rate": 0.0001391704603663735, + "loss": 3.1723551750183105, + "step": 3296, + "token_acc": 0.2794002681811828 + }, + { + "epoch": 1.932864262679566, + "grad_norm": 0.4791020318949933, + "learning_rate": 0.0001391694186612191, + "loss": 3.1585421562194824, + "step": 3297, + "token_acc": 0.28360109560955293 + }, + { + "epoch": 1.9334506009967751, + "grad_norm": 0.4443220857718283, + "learning_rate": 0.00013916837630631126, + "loss": 3.156704902648926, + "step": 3298, + "token_acc": 0.2818789814817265 + }, + { + "epoch": 1.9340369393139842, + "grad_norm": 0.39094139863593524, + "learning_rate": 0.0001391673333016597, + "loss": 3.1366348266601562, + "step": 3299, + "token_acc": 0.28436502208808995 + }, + { + "epoch": 1.9346232776311933, + "grad_norm": 0.4443070477594585, + "learning_rate": 0.00013916628964727427, + "loss": 3.148031711578369, + "step": 3300, + "token_acc": 0.2835102051368225 + }, + { + "epoch": 1.9352096159484022, + "grad_norm": 0.35307826443732476, + "learning_rate": 0.00013916524534316472, + "loss": 3.178837776184082, + "step": 3301, + "token_acc": 0.28024760687899924 + }, + { + "epoch": 1.9357959542656111, + "grad_norm": 0.3964903356877723, + "learning_rate": 0.00013916420038934094, + "loss": 3.147000312805176, + "step": 3302, + "token_acc": 0.2851044529228142 + }, + { + "epoch": 1.9363822925828202, + "grad_norm": 0.3631784184651412, + "learning_rate": 0.00013916315478581265, + "loss": 3.1657633781433105, + "step": 3303, + "token_acc": 0.2800417862010086 + }, + { + "epoch": 1.9369686309000294, + "grad_norm": 0.43325326832762157, + "learning_rate": 0.00013916210853258973, + "loss": 3.1562752723693848, + "step": 3304, + "token_acc": 0.28346237119233086 + }, + { + "epoch": 1.9375549692172385, + "grad_norm": 0.4253623584273094, + "learning_rate": 0.000139161061629682, + "loss": 3.15427827835083, + "step": 3305, + "token_acc": 0.2819169161396049 + }, + { + "epoch": 1.9381413075344474, + "grad_norm": 0.3793038725521701, + "learning_rate": 0.00013916001407709928, + "loss": 3.151179075241089, + "step": 3306, + "token_acc": 0.28333660898079094 + }, + { + "epoch": 1.9387276458516562, + "grad_norm": 0.3051401964319418, + "learning_rate": 0.00013915896587485147, + "loss": 3.152759552001953, + "step": 3307, + "token_acc": 0.28244004624835156 + }, + { + "epoch": 1.9393139841688654, + "grad_norm": 0.3482694610961628, + "learning_rate": 0.00013915791702294832, + "loss": 3.1778111457824707, + "step": 3308, + "token_acc": 0.2780676358419388 + }, + { + "epoch": 1.9399003224860745, + "grad_norm": 0.3646645050090139, + "learning_rate": 0.00013915686752139975, + "loss": 3.1917200088500977, + "step": 3309, + "token_acc": 0.2775195205132174 + }, + { + "epoch": 1.9404866608032836, + "grad_norm": 0.3494030587340589, + "learning_rate": 0.00013915581737021558, + "loss": 3.1626620292663574, + "step": 3310, + "token_acc": 0.28083597491239704 + }, + { + "epoch": 1.9410729991204925, + "grad_norm": 0.36655816396878566, + "learning_rate": 0.00013915476656940572, + "loss": 3.1225814819335938, + "step": 3311, + "token_acc": 0.286216775425752 + }, + { + "epoch": 1.9416593374377016, + "grad_norm": 0.39445667455175293, + "learning_rate": 0.00013915371511898, + "loss": 3.1573116779327393, + "step": 3312, + "token_acc": 0.2815383588168525 + }, + { + "epoch": 1.9422456757549105, + "grad_norm": 0.4746391513523968, + "learning_rate": 0.00013915266301894834, + "loss": 3.2001843452453613, + "step": 3313, + "token_acc": 0.27618760720892077 + }, + { + "epoch": 1.9428320140721196, + "grad_norm": 0.38343661437801124, + "learning_rate": 0.00013915161026932055, + "loss": 3.1935229301452637, + "step": 3314, + "token_acc": 0.2758296615133841 + }, + { + "epoch": 1.9434183523893287, + "grad_norm": 0.4441171186075295, + "learning_rate": 0.00013915055687010658, + "loss": 3.1794397830963135, + "step": 3315, + "token_acc": 0.2792302577375575 + }, + { + "epoch": 1.9440046907065378, + "grad_norm": 0.5073690761317307, + "learning_rate": 0.00013914950282131633, + "loss": 3.1501359939575195, + "step": 3316, + "token_acc": 0.2815280370741534 + }, + { + "epoch": 1.9445910290237467, + "grad_norm": 0.4383736735160132, + "learning_rate": 0.00013914844812295966, + "loss": 3.1700868606567383, + "step": 3317, + "token_acc": 0.2775822979008603 + }, + { + "epoch": 1.9451773673409556, + "grad_norm": 0.37026449453039234, + "learning_rate": 0.0001391473927750465, + "loss": 3.1751694679260254, + "step": 3318, + "token_acc": 0.27938548444573497 + }, + { + "epoch": 1.9457637056581647, + "grad_norm": 0.4752379004371895, + "learning_rate": 0.0001391463367775868, + "loss": 3.151136875152588, + "step": 3319, + "token_acc": 0.2839719688530707 + }, + { + "epoch": 1.9463500439753738, + "grad_norm": 0.3752967177179502, + "learning_rate": 0.00013914528013059038, + "loss": 3.1590545177459717, + "step": 3320, + "token_acc": 0.2810434689878564 + }, + { + "epoch": 1.946936382292583, + "grad_norm": 0.3418851457918463, + "learning_rate": 0.00013914422283406726, + "loss": 3.09909725189209, + "step": 3321, + "token_acc": 0.2896089031755309 + }, + { + "epoch": 1.9475227206097918, + "grad_norm": 0.3777334735992987, + "learning_rate": 0.00013914316488802735, + "loss": 3.158996105194092, + "step": 3322, + "token_acc": 0.2797934545688525 + }, + { + "epoch": 1.948109058927001, + "grad_norm": 0.35720934881011457, + "learning_rate": 0.00013914210629248057, + "loss": 3.1173105239868164, + "step": 3323, + "token_acc": 0.2873663315617596 + }, + { + "epoch": 1.9486953972442098, + "grad_norm": 0.3415138545348308, + "learning_rate": 0.00013914104704743684, + "loss": 3.2053141593933105, + "step": 3324, + "token_acc": 0.2748350126143588 + }, + { + "epoch": 1.949281735561419, + "grad_norm": 0.37447937251340824, + "learning_rate": 0.0001391399871529062, + "loss": 3.108191967010498, + "step": 3325, + "token_acc": 0.2892518934174117 + }, + { + "epoch": 1.949868073878628, + "grad_norm": 0.46367929391170054, + "learning_rate": 0.0001391389266088985, + "loss": 3.1765410900115967, + "step": 3326, + "token_acc": 0.27848255112045006 + }, + { + "epoch": 1.9504544121958372, + "grad_norm": 0.5380620646381263, + "learning_rate": 0.00013913786541542376, + "loss": 3.1276051998138428, + "step": 3327, + "token_acc": 0.2853071229706956 + }, + { + "epoch": 1.951040750513046, + "grad_norm": 0.42845334335984475, + "learning_rate": 0.00013913680357249196, + "loss": 3.164273262023926, + "step": 3328, + "token_acc": 0.28181472852512157 + }, + { + "epoch": 1.951627088830255, + "grad_norm": 0.37886830214080564, + "learning_rate": 0.00013913574108011302, + "loss": 3.163154125213623, + "step": 3329, + "token_acc": 0.282292258485365 + }, + { + "epoch": 1.952213427147464, + "grad_norm": 0.5329529024726649, + "learning_rate": 0.00013913467793829696, + "loss": 3.174312114715576, + "step": 3330, + "token_acc": 0.27895905889005795 + }, + { + "epoch": 1.9527997654646732, + "grad_norm": 0.47476736074615217, + "learning_rate": 0.0001391336141470538, + "loss": 3.166505813598633, + "step": 3331, + "token_acc": 0.2800059025458612 + }, + { + "epoch": 1.9533861037818823, + "grad_norm": 0.5498986721983645, + "learning_rate": 0.00013913254970639345, + "loss": 3.110722064971924, + "step": 3332, + "token_acc": 0.28798181706624126 + }, + { + "epoch": 1.9539724420990912, + "grad_norm": 0.5331648008460982, + "learning_rate": 0.00013913148461632598, + "loss": 3.1444947719573975, + "step": 3333, + "token_acc": 0.2831374362261653 + }, + { + "epoch": 1.9545587804163, + "grad_norm": 0.4269616937981869, + "learning_rate": 0.00013913041887686137, + "loss": 3.156773090362549, + "step": 3334, + "token_acc": 0.28320404489919576 + }, + { + "epoch": 1.9551451187335092, + "grad_norm": 0.5078127698860585, + "learning_rate": 0.0001391293524880096, + "loss": 3.176126003265381, + "step": 3335, + "token_acc": 0.27880362899892536 + }, + { + "epoch": 1.9557314570507183, + "grad_norm": 0.5181925024168382, + "learning_rate": 0.00013912828544978076, + "loss": 3.1840755939483643, + "step": 3336, + "token_acc": 0.27890622000998067 + }, + { + "epoch": 1.9563177953679274, + "grad_norm": 0.48023841551979124, + "learning_rate": 0.0001391272177621848, + "loss": 3.1813955307006836, + "step": 3337, + "token_acc": 0.2765049721330503 + }, + { + "epoch": 1.9569041336851363, + "grad_norm": 0.4516200328695524, + "learning_rate": 0.00013912614942523176, + "loss": 3.1648964881896973, + "step": 3338, + "token_acc": 0.28140642297567153 + }, + { + "epoch": 1.9574904720023454, + "grad_norm": 0.42577968928190246, + "learning_rate": 0.00013912508043893173, + "loss": 3.140504837036133, + "step": 3339, + "token_acc": 0.2831769042032293 + }, + { + "epoch": 1.9580768103195543, + "grad_norm": 0.5111732639417289, + "learning_rate": 0.0001391240108032947, + "loss": 3.179898262023926, + "step": 3340, + "token_acc": 0.27670737108509963 + }, + { + "epoch": 1.9586631486367634, + "grad_norm": 0.4552748307317893, + "learning_rate": 0.00013912294051833074, + "loss": 3.18254017829895, + "step": 3341, + "token_acc": 0.27814316506194897 + }, + { + "epoch": 1.9592494869539725, + "grad_norm": 0.48351955739373526, + "learning_rate": 0.0001391218695840499, + "loss": 3.1951117515563965, + "step": 3342, + "token_acc": 0.2778835193383347 + }, + { + "epoch": 1.9598358252711816, + "grad_norm": 0.4760817325750893, + "learning_rate": 0.00013912079800046221, + "loss": 3.1729722023010254, + "step": 3343, + "token_acc": 0.2783520362185126 + }, + { + "epoch": 1.9604221635883905, + "grad_norm": 0.4950446273086604, + "learning_rate": 0.0001391197257675778, + "loss": 3.1841742992401123, + "step": 3344, + "token_acc": 0.2775857214366162 + }, + { + "epoch": 1.9610085019055994, + "grad_norm": 0.5373742829542658, + "learning_rate": 0.00013911865288540669, + "loss": 3.183089256286621, + "step": 3345, + "token_acc": 0.2785410725928719 + }, + { + "epoch": 1.9615948402228085, + "grad_norm": 0.41963336031516096, + "learning_rate": 0.000139117579353959, + "loss": 3.1570446491241455, + "step": 3346, + "token_acc": 0.2805888824854613 + }, + { + "epoch": 1.9621811785400176, + "grad_norm": 0.41231944365877926, + "learning_rate": 0.00013911650517324476, + "loss": 3.163341522216797, + "step": 3347, + "token_acc": 0.2817850939960098 + }, + { + "epoch": 1.9627675168572267, + "grad_norm": 0.4446434675061905, + "learning_rate": 0.0001391154303432741, + "loss": 3.1212615966796875, + "step": 3348, + "token_acc": 0.2857479668335585 + }, + { + "epoch": 1.9633538551744356, + "grad_norm": 0.3920935737036198, + "learning_rate": 0.00013911435486405708, + "loss": 3.140012502670288, + "step": 3349, + "token_acc": 0.2854368114370536 + }, + { + "epoch": 1.9639401934916447, + "grad_norm": 0.36324685591141376, + "learning_rate": 0.00013911327873560386, + "loss": 3.2009711265563965, + "step": 3350, + "token_acc": 0.2753081048238647 + }, + { + "epoch": 1.9645265318088536, + "grad_norm": 0.4320949402726731, + "learning_rate": 0.00013911220195792452, + "loss": 3.1375491619110107, + "step": 3351, + "token_acc": 0.2835975571166382 + }, + { + "epoch": 1.9651128701260627, + "grad_norm": 0.4120621234884834, + "learning_rate": 0.00013911112453102916, + "loss": 3.1618008613586426, + "step": 3352, + "token_acc": 0.28184359696652916 + }, + { + "epoch": 1.9656992084432718, + "grad_norm": 0.39869757060849625, + "learning_rate": 0.00013911004645492792, + "loss": 3.1614646911621094, + "step": 3353, + "token_acc": 0.28210271603896925 + }, + { + "epoch": 1.966285546760481, + "grad_norm": 0.388628049288299, + "learning_rate": 0.00013910896772963092, + "loss": 3.171799421310425, + "step": 3354, + "token_acc": 0.2792662397119742 + }, + { + "epoch": 1.9668718850776898, + "grad_norm": 0.4950069484258709, + "learning_rate": 0.00013910788835514828, + "loss": 3.1041934490203857, + "step": 3355, + "token_acc": 0.28896797509535843 + }, + { + "epoch": 1.9674582233948987, + "grad_norm": 0.420312074379008, + "learning_rate": 0.00013910680833149016, + "loss": 3.145193099975586, + "step": 3356, + "token_acc": 0.2844221293515322 + }, + { + "epoch": 1.9680445617121078, + "grad_norm": 0.4379301611286279, + "learning_rate": 0.0001391057276586667, + "loss": 3.1596078872680664, + "step": 3357, + "token_acc": 0.28141256983986956 + }, + { + "epoch": 1.968630900029317, + "grad_norm": 0.4576193199560471, + "learning_rate": 0.00013910464633668808, + "loss": 3.137131690979004, + "step": 3358, + "token_acc": 0.28517970472915366 + }, + { + "epoch": 1.969217238346526, + "grad_norm": 0.3831662506452973, + "learning_rate": 0.00013910356436556439, + "loss": 3.10365629196167, + "step": 3359, + "token_acc": 0.28884990088094054 + }, + { + "epoch": 1.969803576663735, + "grad_norm": 0.42800697376584795, + "learning_rate": 0.00013910248174530584, + "loss": 3.1386098861694336, + "step": 3360, + "token_acc": 0.28403796107275875 + }, + { + "epoch": 1.9703899149809438, + "grad_norm": 0.36777368578711706, + "learning_rate": 0.0001391013984759226, + "loss": 3.1901535987854004, + "step": 3361, + "token_acc": 0.27514457168624845 + }, + { + "epoch": 1.970976253298153, + "grad_norm": 0.43693785479480013, + "learning_rate": 0.00013910031455742483, + "loss": 3.185361385345459, + "step": 3362, + "token_acc": 0.27827186456017994 + }, + { + "epoch": 1.971562591615362, + "grad_norm": 0.4080545794625674, + "learning_rate": 0.0001390992299898227, + "loss": 3.1773855686187744, + "step": 3363, + "token_acc": 0.27940934602495837 + }, + { + "epoch": 1.9721489299325712, + "grad_norm": 0.33149753041246305, + "learning_rate": 0.00013909814477312645, + "loss": 3.158292770385742, + "step": 3364, + "token_acc": 0.28046724535837236 + }, + { + "epoch": 1.97273526824978, + "grad_norm": 0.41432338329669927, + "learning_rate": 0.0001390970589073462, + "loss": 3.122222423553467, + "step": 3365, + "token_acc": 0.2868338477366255 + }, + { + "epoch": 1.9733216065669892, + "grad_norm": 0.38028346339070257, + "learning_rate": 0.00013909597239249223, + "loss": 3.1400723457336426, + "step": 3366, + "token_acc": 0.28621956339210747 + }, + { + "epoch": 1.973907944884198, + "grad_norm": 0.48225520172457403, + "learning_rate": 0.0001390948852285747, + "loss": 3.174577474594116, + "step": 3367, + "token_acc": 0.27805069077773104 + }, + { + "epoch": 1.9744942832014072, + "grad_norm": 0.4892394984713239, + "learning_rate": 0.0001390937974156038, + "loss": 3.2006263732910156, + "step": 3368, + "token_acc": 0.27479479035481663 + }, + { + "epoch": 1.9750806215186163, + "grad_norm": 0.37513538712981825, + "learning_rate": 0.0001390927089535898, + "loss": 3.1002066135406494, + "step": 3369, + "token_acc": 0.29237799711937645 + }, + { + "epoch": 1.9756669598358254, + "grad_norm": 0.3788080452458428, + "learning_rate": 0.00013909161984254292, + "loss": 3.163529634475708, + "step": 3370, + "token_acc": 0.28033904159636513 + }, + { + "epoch": 1.9762532981530343, + "grad_norm": 0.36749549261683495, + "learning_rate": 0.00013909053008247333, + "loss": 3.174147605895996, + "step": 3371, + "token_acc": 0.27931145253499706 + }, + { + "epoch": 1.9768396364702432, + "grad_norm": 0.4563196539727423, + "learning_rate": 0.00013908943967339135, + "loss": 3.1264514923095703, + "step": 3372, + "token_acc": 0.28626685162489396 + }, + { + "epoch": 1.9774259747874523, + "grad_norm": 0.5180713425226017, + "learning_rate": 0.0001390883486153072, + "loss": 3.1820428371429443, + "step": 3373, + "token_acc": 0.27802897432756785 + }, + { + "epoch": 1.9780123131046614, + "grad_norm": 0.553205273299101, + "learning_rate": 0.00013908725690823105, + "loss": 3.1747257709503174, + "step": 3374, + "token_acc": 0.2788229517025804 + }, + { + "epoch": 1.9785986514218705, + "grad_norm": 0.5342559386292113, + "learning_rate": 0.00013908616455217328, + "loss": 3.1705732345581055, + "step": 3375, + "token_acc": 0.28006252945319665 + }, + { + "epoch": 1.9791849897390794, + "grad_norm": 0.48529099459159397, + "learning_rate": 0.00013908507154714405, + "loss": 3.1789064407348633, + "step": 3376, + "token_acc": 0.27894978155582845 + }, + { + "epoch": 1.9797713280562885, + "grad_norm": 0.44637011902567103, + "learning_rate": 0.00013908397789315366, + "loss": 3.1910946369171143, + "step": 3377, + "token_acc": 0.2766798418972332 + }, + { + "epoch": 1.9803576663734974, + "grad_norm": 0.4588139083747449, + "learning_rate": 0.00013908288359021243, + "loss": 3.1674647331237793, + "step": 3378, + "token_acc": 0.2797496298989689 + }, + { + "epoch": 1.9809440046907065, + "grad_norm": 0.45585553465898915, + "learning_rate": 0.00013908178863833055, + "loss": 3.167445182800293, + "step": 3379, + "token_acc": 0.280860807517464 + }, + { + "epoch": 1.9815303430079156, + "grad_norm": 0.49982200519603226, + "learning_rate": 0.00013908069303751838, + "loss": 3.1494970321655273, + "step": 3380, + "token_acc": 0.28157468734790614 + }, + { + "epoch": 1.9821166813251248, + "grad_norm": 0.4795147066178234, + "learning_rate": 0.0001390795967877862, + "loss": 3.139040946960449, + "step": 3381, + "token_acc": 0.28312468209232555 + }, + { + "epoch": 1.9827030196423336, + "grad_norm": 0.36394608559749825, + "learning_rate": 0.00013907849988914426, + "loss": 3.158405303955078, + "step": 3382, + "token_acc": 0.28004248163554296 + }, + { + "epoch": 1.9832893579595425, + "grad_norm": 0.41611010968303497, + "learning_rate": 0.00013907740234160292, + "loss": 3.1257734298706055, + "step": 3383, + "token_acc": 0.28653803504102904 + }, + { + "epoch": 1.9838756962767516, + "grad_norm": 0.3813537595018297, + "learning_rate": 0.00013907630414517247, + "loss": 3.1708922386169434, + "step": 3384, + "token_acc": 0.28199272889576266 + }, + { + "epoch": 1.9844620345939608, + "grad_norm": 0.46635818782639954, + "learning_rate": 0.00013907520529986322, + "loss": 3.1240193843841553, + "step": 3385, + "token_acc": 0.2855685444280805 + }, + { + "epoch": 1.9850483729111699, + "grad_norm": 0.40974716316719545, + "learning_rate": 0.0001390741058056855, + "loss": 3.1687774658203125, + "step": 3386, + "token_acc": 0.27905286088088577 + }, + { + "epoch": 1.9856347112283788, + "grad_norm": 0.46702609892230296, + "learning_rate": 0.00013907300566264963, + "loss": 3.151608943939209, + "step": 3387, + "token_acc": 0.28313424998343095 + }, + { + "epoch": 1.9862210495455876, + "grad_norm": 0.4676852203644609, + "learning_rate": 0.00013907190487076596, + "loss": 3.134857416152954, + "step": 3388, + "token_acc": 0.28581432494315184 + }, + { + "epoch": 1.9868073878627968, + "grad_norm": 0.4116647375498723, + "learning_rate": 0.0001390708034300448, + "loss": 3.158132553100586, + "step": 3389, + "token_acc": 0.28150687559961624 + }, + { + "epoch": 1.9873937261800059, + "grad_norm": 0.42256908864682385, + "learning_rate": 0.00013906970134049652, + "loss": 3.142395496368408, + "step": 3390, + "token_acc": 0.28337116764514025 + }, + { + "epoch": 1.987980064497215, + "grad_norm": 0.342299822929288, + "learning_rate": 0.00013906859860213146, + "loss": 3.151010513305664, + "step": 3391, + "token_acc": 0.281481932108529 + }, + { + "epoch": 1.9885664028144239, + "grad_norm": 0.41686605485067674, + "learning_rate": 0.00013906749521496, + "loss": 3.141700029373169, + "step": 3392, + "token_acc": 0.2832681625910635 + }, + { + "epoch": 1.989152741131633, + "grad_norm": 0.438087000357279, + "learning_rate": 0.0001390663911789925, + "loss": 3.2011260986328125, + "step": 3393, + "token_acc": 0.27543164320532093 + }, + { + "epoch": 1.9897390794488419, + "grad_norm": 0.43210630934866456, + "learning_rate": 0.00013906528649423934, + "loss": 3.184021472930908, + "step": 3394, + "token_acc": 0.2769989649289002 + }, + { + "epoch": 1.990325417766051, + "grad_norm": 0.39396970731104547, + "learning_rate": 0.00013906418116071083, + "loss": 3.1602015495300293, + "step": 3395, + "token_acc": 0.2811733937175484 + }, + { + "epoch": 1.99091175608326, + "grad_norm": 0.3835246207990156, + "learning_rate": 0.00013906307517841743, + "loss": 3.174670934677124, + "step": 3396, + "token_acc": 0.2802589132347396 + }, + { + "epoch": 1.9914980944004692, + "grad_norm": 0.4502263409154559, + "learning_rate": 0.0001390619685473695, + "loss": 3.1310057640075684, + "step": 3397, + "token_acc": 0.28627079275469497 + }, + { + "epoch": 1.992084432717678, + "grad_norm": 0.3356345733414897, + "learning_rate": 0.00013906086126757745, + "loss": 3.1600825786590576, + "step": 3398, + "token_acc": 0.28327654875361774 + }, + { + "epoch": 1.992670771034887, + "grad_norm": 0.39931703854496614, + "learning_rate": 0.00013905975333905165, + "loss": 3.194636344909668, + "step": 3399, + "token_acc": 0.2775292786658566 + }, + { + "epoch": 1.993257109352096, + "grad_norm": 0.5363229668098536, + "learning_rate": 0.00013905864476180252, + "loss": 3.194115400314331, + "step": 3400, + "token_acc": 0.27615912574816476 + }, + { + "epoch": 1.9938434476693052, + "grad_norm": 0.36189066126935593, + "learning_rate": 0.00013905753553584052, + "loss": 3.172762632369995, + "step": 3401, + "token_acc": 0.27922836166826814 + }, + { + "epoch": 1.9944297859865143, + "grad_norm": 0.43477399592264865, + "learning_rate": 0.000139056425661176, + "loss": 3.1928272247314453, + "step": 3402, + "token_acc": 0.2779728777784421 + }, + { + "epoch": 1.9950161243037232, + "grad_norm": 0.5116720962708814, + "learning_rate": 0.0001390553151378194, + "loss": 3.179208755493164, + "step": 3403, + "token_acc": 0.2773243860949061 + }, + { + "epoch": 1.9956024626209323, + "grad_norm": 0.4601297982476801, + "learning_rate": 0.0001390542039657812, + "loss": 3.1578078269958496, + "step": 3404, + "token_acc": 0.2812150082301985 + }, + { + "epoch": 1.9961888009381412, + "grad_norm": 0.36016640229048014, + "learning_rate": 0.00013905309214507178, + "loss": 3.1756491661071777, + "step": 3405, + "token_acc": 0.27825384828696176 + }, + { + "epoch": 1.9967751392553503, + "grad_norm": 0.45878020867165564, + "learning_rate": 0.00013905197967570163, + "loss": 3.157804012298584, + "step": 3406, + "token_acc": 0.27966751614850033 + }, + { + "epoch": 1.9973614775725594, + "grad_norm": 0.5320286410410868, + "learning_rate": 0.00013905086655768115, + "loss": 3.162482738494873, + "step": 3407, + "token_acc": 0.28064756126094625 + }, + { + "epoch": 1.9979478158897686, + "grad_norm": 0.31122430183490246, + "learning_rate": 0.00013904975279102087, + "loss": 3.1572160720825195, + "step": 3408, + "token_acc": 0.28231274527433703 + }, + { + "epoch": 1.9985341542069774, + "grad_norm": 0.39401512308512576, + "learning_rate": 0.0001390486383757312, + "loss": 3.128884792327881, + "step": 3409, + "token_acc": 0.2859725573908362 + }, + { + "epoch": 1.9991204925241863, + "grad_norm": 0.3304642291278498, + "learning_rate": 0.00013904752331182259, + "loss": 3.14455509185791, + "step": 3410, + "token_acc": 0.28313608990647277 + }, + { + "epoch": 1.9997068308413954, + "grad_norm": 0.3900789751129363, + "learning_rate": 0.00013904640759930555, + "loss": 3.147974729537964, + "step": 3411, + "token_acc": 0.2815449028076655 + }, + { + "epoch": 2.0, + "grad_norm": 0.4310696272541357, + "learning_rate": 0.00013904529123819054, + "loss": 3.139267921447754, + "step": 3412, + "token_acc": 0.28635098877074433 + }, + { + "epoch": 2.0, + "eval_loss": 3.1359596252441406, + "eval_runtime": 22.002, + "eval_samples_per_second": 11.635, + "eval_steps_per_second": 1.454, + "eval_token_acc": 0.28405878778806104, + "step": 3412 + }, + { + "epoch": 2.000586338317209, + "grad_norm": 0.4669908189040292, + "learning_rate": 0.0001390441742284881, + "loss": 3.072059154510498, + "step": 3413, + "token_acc": 0.2912753173855335 + }, + { + "epoch": 2.0011726766344182, + "grad_norm": 0.43554543297573217, + "learning_rate": 0.00013904305657020863, + "loss": 3.103198528289795, + "step": 3414, + "token_acc": 0.2872645329574976 + }, + { + "epoch": 2.001759014951627, + "grad_norm": 0.4301203737295776, + "learning_rate": 0.00013904193826336271, + "loss": 3.1443958282470703, + "step": 3415, + "token_acc": 0.2818147374952005 + }, + { + "epoch": 2.002345353268836, + "grad_norm": 0.4727902843210217, + "learning_rate": 0.00013904081930796083, + "loss": 3.1033222675323486, + "step": 3416, + "token_acc": 0.28631646444498754 + }, + { + "epoch": 2.002931691586045, + "grad_norm": 0.3954903682659769, + "learning_rate": 0.00013903969970401346, + "loss": 3.0854570865631104, + "step": 3417, + "token_acc": 0.2891944231301252 + }, + { + "epoch": 2.0035180299032542, + "grad_norm": 0.416162010598298, + "learning_rate": 0.00013903857945153116, + "loss": 3.046881914138794, + "step": 3418, + "token_acc": 0.2950915223326926 + }, + { + "epoch": 2.0041043682204633, + "grad_norm": 0.41094544425404733, + "learning_rate": 0.0001390374585505244, + "loss": 3.1224188804626465, + "step": 3419, + "token_acc": 0.2849217686809312 + }, + { + "epoch": 2.0046907065376725, + "grad_norm": 0.39276595654606583, + "learning_rate": 0.00013903633700100378, + "loss": 3.0865747928619385, + "step": 3420, + "token_acc": 0.2888065015357855 + }, + { + "epoch": 2.005277044854881, + "grad_norm": 0.35315777119064307, + "learning_rate": 0.0001390352148029798, + "loss": 3.0870871543884277, + "step": 3421, + "token_acc": 0.2890270350722388 + }, + { + "epoch": 2.0058633831720902, + "grad_norm": 0.42520944556099705, + "learning_rate": 0.000139034091956463, + "loss": 3.0999088287353516, + "step": 3422, + "token_acc": 0.289940221493437 + }, + { + "epoch": 2.0064497214892993, + "grad_norm": 0.39026284731512934, + "learning_rate": 0.00013903296846146392, + "loss": 3.0217690467834473, + "step": 3423, + "token_acc": 0.29762488840273144 + }, + { + "epoch": 2.0070360598065085, + "grad_norm": 0.39492583402254616, + "learning_rate": 0.00013903184431799314, + "loss": 3.0698060989379883, + "step": 3424, + "token_acc": 0.29152610657498923 + }, + { + "epoch": 2.0076223981237176, + "grad_norm": 0.41938728405294856, + "learning_rate": 0.00013903071952606118, + "loss": 3.063624382019043, + "step": 3425, + "token_acc": 0.2926996464381478 + }, + { + "epoch": 2.0082087364409262, + "grad_norm": 0.47611770461397407, + "learning_rate": 0.00013902959408567867, + "loss": 3.0697710514068604, + "step": 3426, + "token_acc": 0.29184361121835023 + }, + { + "epoch": 2.0087950747581353, + "grad_norm": 0.3988898565195493, + "learning_rate": 0.0001390284679968561, + "loss": 3.083996295928955, + "step": 3427, + "token_acc": 0.2904132445816811 + }, + { + "epoch": 2.0093814130753445, + "grad_norm": 0.4296929927232593, + "learning_rate": 0.00013902734125960413, + "loss": 3.071540355682373, + "step": 3428, + "token_acc": 0.2923533006679604 + }, + { + "epoch": 2.0099677513925536, + "grad_norm": 0.3872715929520596, + "learning_rate": 0.0001390262138739333, + "loss": 3.067164421081543, + "step": 3429, + "token_acc": 0.2914328040561688 + }, + { + "epoch": 2.0105540897097627, + "grad_norm": 0.41043113711035617, + "learning_rate": 0.00013902508583985416, + "loss": 3.0388574600219727, + "step": 3430, + "token_acc": 0.2961614298849497 + }, + { + "epoch": 2.0111404280269713, + "grad_norm": 0.38033273177235727, + "learning_rate": 0.0001390239571573774, + "loss": 3.0351784229278564, + "step": 3431, + "token_acc": 0.29528218901432934 + }, + { + "epoch": 2.0117267663441805, + "grad_norm": 0.395434071298658, + "learning_rate": 0.00013902282782651354, + "loss": 3.0387587547302246, + "step": 3432, + "token_acc": 0.29488862093038015 + }, + { + "epoch": 2.0123131046613896, + "grad_norm": 0.4574238969049557, + "learning_rate": 0.00013902169784727324, + "loss": 3.0689291954040527, + "step": 3433, + "token_acc": 0.2921390647253255 + }, + { + "epoch": 2.0128994429785987, + "grad_norm": 0.402623654896216, + "learning_rate": 0.00013902056721966708, + "loss": 3.045943260192871, + "step": 3434, + "token_acc": 0.29525997366652035 + }, + { + "epoch": 2.013485781295808, + "grad_norm": 0.4496952249744904, + "learning_rate": 0.00013901943594370571, + "loss": 3.0669713020324707, + "step": 3435, + "token_acc": 0.2930701516267887 + }, + { + "epoch": 2.014072119613017, + "grad_norm": 0.3932798656168556, + "learning_rate": 0.00013901830401939975, + "loss": 3.065974235534668, + "step": 3436, + "token_acc": 0.29347344955762594 + }, + { + "epoch": 2.0146584579302256, + "grad_norm": 0.47407363665798985, + "learning_rate": 0.00013901717144675983, + "loss": 3.04148530960083, + "step": 3437, + "token_acc": 0.2943343873350028 + }, + { + "epoch": 2.0152447962474347, + "grad_norm": 0.3859093513079026, + "learning_rate": 0.00013901603822579655, + "loss": 3.048157215118408, + "step": 3438, + "token_acc": 0.2938826006439368 + }, + { + "epoch": 2.015831134564644, + "grad_norm": 0.40237225387247594, + "learning_rate": 0.00013901490435652063, + "loss": 3.0712270736694336, + "step": 3439, + "token_acc": 0.29054114847842605 + }, + { + "epoch": 2.016417472881853, + "grad_norm": 0.4909742982888658, + "learning_rate": 0.00013901376983894265, + "loss": 3.079235315322876, + "step": 3440, + "token_acc": 0.29215351233370823 + }, + { + "epoch": 2.017003811199062, + "grad_norm": 0.31975014475213787, + "learning_rate": 0.00013901263467307334, + "loss": 3.0642590522766113, + "step": 3441, + "token_acc": 0.29168100367909794 + }, + { + "epoch": 2.0175901495162707, + "grad_norm": 0.4222532375163001, + "learning_rate": 0.0001390114988589233, + "loss": 3.084491729736328, + "step": 3442, + "token_acc": 0.291710751796941 + }, + { + "epoch": 2.01817648783348, + "grad_norm": 0.38970997897605225, + "learning_rate": 0.0001390103623965032, + "loss": 3.132301092147827, + "step": 3443, + "token_acc": 0.282129687689501 + }, + { + "epoch": 2.018762826150689, + "grad_norm": 0.3738976978706402, + "learning_rate": 0.00013900922528582377, + "loss": 3.0894436836242676, + "step": 3444, + "token_acc": 0.2888680699315493 + }, + { + "epoch": 2.019349164467898, + "grad_norm": 0.34166636403749784, + "learning_rate": 0.00013900808752689568, + "loss": 3.086480140686035, + "step": 3445, + "token_acc": 0.2889001623893929 + }, + { + "epoch": 2.019935502785107, + "grad_norm": 0.3511117298739608, + "learning_rate": 0.00013900694911972956, + "loss": 3.0834155082702637, + "step": 3446, + "token_acc": 0.29072972234527655 + }, + { + "epoch": 2.0205218411023163, + "grad_norm": 0.3345105659173039, + "learning_rate": 0.00013900581006433615, + "loss": 3.0512261390686035, + "step": 3447, + "token_acc": 0.2942930530542352 + }, + { + "epoch": 2.021108179419525, + "grad_norm": 0.3406395198209844, + "learning_rate": 0.00013900467036072613, + "loss": 3.0245869159698486, + "step": 3448, + "token_acc": 0.29897653551536474 + }, + { + "epoch": 2.021694517736734, + "grad_norm": 0.3231953406581536, + "learning_rate": 0.00013900353000891022, + "loss": 3.0322794914245605, + "step": 3449, + "token_acc": 0.2982415086005164 + }, + { + "epoch": 2.022280856053943, + "grad_norm": 0.34774244541798166, + "learning_rate": 0.00013900238900889914, + "loss": 3.0630829334259033, + "step": 3450, + "token_acc": 0.2910205882800493 + }, + { + "epoch": 2.0228671943711523, + "grad_norm": 0.3362818856368379, + "learning_rate": 0.0001390012473607036, + "loss": 3.067298412322998, + "step": 3451, + "token_acc": 0.29096726950190027 + }, + { + "epoch": 2.0234535326883614, + "grad_norm": 0.3691514087950321, + "learning_rate": 0.00013900010506433434, + "loss": 3.1181302070617676, + "step": 3452, + "token_acc": 0.2870263879617491 + }, + { + "epoch": 2.02403987100557, + "grad_norm": 0.3403475005569846, + "learning_rate": 0.00013899896211980203, + "loss": 2.9972331523895264, + "step": 3453, + "token_acc": 0.3017676834460835 + }, + { + "epoch": 2.024626209322779, + "grad_norm": 0.326686025552459, + "learning_rate": 0.00013899781852711745, + "loss": 3.1233699321746826, + "step": 3454, + "token_acc": 0.2844483761280902 + }, + { + "epoch": 2.0252125476399883, + "grad_norm": 0.4150678908932345, + "learning_rate": 0.00013899667428629136, + "loss": 3.0711934566497803, + "step": 3455, + "token_acc": 0.29046136281073226 + }, + { + "epoch": 2.0257988859571974, + "grad_norm": 0.3629808762699235, + "learning_rate": 0.00013899552939733448, + "loss": 3.0263452529907227, + "step": 3456, + "token_acc": 0.2984379645754577 + }, + { + "epoch": 2.0263852242744065, + "grad_norm": 0.3756951973299575, + "learning_rate": 0.0001389943838602576, + "loss": 3.035609006881714, + "step": 3457, + "token_acc": 0.296783867062837 + }, + { + "epoch": 2.026971562591615, + "grad_norm": 0.4289039762091412, + "learning_rate": 0.00013899323767507143, + "loss": 3.0780153274536133, + "step": 3458, + "token_acc": 0.28908465734555083 + }, + { + "epoch": 2.0275579009088243, + "grad_norm": 0.4474921310454204, + "learning_rate": 0.00013899209084178676, + "loss": 2.995811939239502, + "step": 3459, + "token_acc": 0.3021539336882394 + }, + { + "epoch": 2.0281442392260334, + "grad_norm": 0.47111657686215347, + "learning_rate": 0.00013899094336041436, + "loss": 3.089670419692993, + "step": 3460, + "token_acc": 0.28995273082258644 + }, + { + "epoch": 2.0287305775432425, + "grad_norm": 0.5961298681369821, + "learning_rate": 0.00013898979523096502, + "loss": 3.0654194355010986, + "step": 3461, + "token_acc": 0.2913794339680621 + }, + { + "epoch": 2.0293169158604516, + "grad_norm": 0.573750621079621, + "learning_rate": 0.00013898864645344955, + "loss": 3.057342290878296, + "step": 3462, + "token_acc": 0.2934591042765979 + }, + { + "epoch": 2.0299032541776607, + "grad_norm": 0.39945754262711736, + "learning_rate": 0.00013898749702787866, + "loss": 3.0644164085388184, + "step": 3463, + "token_acc": 0.29409398388787145 + }, + { + "epoch": 2.0304895924948694, + "grad_norm": 0.412384414730368, + "learning_rate": 0.00013898634695426324, + "loss": 3.058180332183838, + "step": 3464, + "token_acc": 0.2938338411285717 + }, + { + "epoch": 2.0310759308120785, + "grad_norm": 0.5066905819476569, + "learning_rate": 0.000138985196232614, + "loss": 3.0717389583587646, + "step": 3465, + "token_acc": 0.292684232789501 + }, + { + "epoch": 2.0316622691292876, + "grad_norm": 0.48871545088828505, + "learning_rate": 0.00013898404486294185, + "loss": 3.067204236984253, + "step": 3466, + "token_acc": 0.2935458879461016 + }, + { + "epoch": 2.0322486074464967, + "grad_norm": 0.4449837081070912, + "learning_rate": 0.00013898289284525753, + "loss": 3.0645785331726074, + "step": 3467, + "token_acc": 0.29097301854517554 + }, + { + "epoch": 2.032834945763706, + "grad_norm": 0.3815733217074833, + "learning_rate": 0.0001389817401795719, + "loss": 3.0932931900024414, + "step": 3468, + "token_acc": 0.287144719552194 + }, + { + "epoch": 2.0334212840809145, + "grad_norm": 0.44770759116377795, + "learning_rate": 0.00013898058686589575, + "loss": 3.0803585052490234, + "step": 3469, + "token_acc": 0.2899080776045779 + }, + { + "epoch": 2.0340076223981236, + "grad_norm": 0.3652633585681172, + "learning_rate": 0.00013897943290423997, + "loss": 3.0651512145996094, + "step": 3470, + "token_acc": 0.29123762825751387 + }, + { + "epoch": 2.0345939607153327, + "grad_norm": 0.3784496445286311, + "learning_rate": 0.00013897827829461535, + "loss": 3.044992208480835, + "step": 3471, + "token_acc": 0.29525641827775345 + }, + { + "epoch": 2.035180299032542, + "grad_norm": 0.33637936388710926, + "learning_rate": 0.00013897712303703275, + "loss": 3.0728578567504883, + "step": 3472, + "token_acc": 0.29096464517756226 + }, + { + "epoch": 2.035766637349751, + "grad_norm": 0.3820809499335438, + "learning_rate": 0.00013897596713150306, + "loss": 3.087179660797119, + "step": 3473, + "token_acc": 0.28980674026969366 + }, + { + "epoch": 2.03635297566696, + "grad_norm": 0.38592561104361417, + "learning_rate": 0.00013897481057803708, + "loss": 3.0463616847991943, + "step": 3474, + "token_acc": 0.2947199411825126 + }, + { + "epoch": 2.0369393139841687, + "grad_norm": 0.3374260129650237, + "learning_rate": 0.0001389736533766457, + "loss": 3.030139446258545, + "step": 3475, + "token_acc": 0.2970991123400071 + }, + { + "epoch": 2.037525652301378, + "grad_norm": 0.40412707712917734, + "learning_rate": 0.0001389724955273398, + "loss": 3.0653152465820312, + "step": 3476, + "token_acc": 0.2906515742020299 + }, + { + "epoch": 2.038111990618587, + "grad_norm": 0.44707635396474155, + "learning_rate": 0.00013897133703013023, + "loss": 3.0286879539489746, + "step": 3477, + "token_acc": 0.29569523435967837 + }, + { + "epoch": 2.038698328935796, + "grad_norm": 0.45007508827377335, + "learning_rate": 0.0001389701778850279, + "loss": 3.066561460494995, + "step": 3478, + "token_acc": 0.294022135670904 + }, + { + "epoch": 2.039284667253005, + "grad_norm": 0.3862571054450777, + "learning_rate": 0.0001389690180920437, + "loss": 3.0781593322753906, + "step": 3479, + "token_acc": 0.29256291047882005 + }, + { + "epoch": 2.039871005570214, + "grad_norm": 0.37927662650382504, + "learning_rate": 0.00013896785765118847, + "loss": 3.0932347774505615, + "step": 3480, + "token_acc": 0.2885424793793854 + }, + { + "epoch": 2.040457343887423, + "grad_norm": 0.42765202665111174, + "learning_rate": 0.0001389666965624732, + "loss": 3.059185028076172, + "step": 3481, + "token_acc": 0.29528892189342615 + }, + { + "epoch": 2.041043682204632, + "grad_norm": 0.3711948807703126, + "learning_rate": 0.00013896553482590872, + "loss": 3.0578880310058594, + "step": 3482, + "token_acc": 0.29277653026948824 + }, + { + "epoch": 2.041630020521841, + "grad_norm": 0.36493182997328655, + "learning_rate": 0.00013896437244150596, + "loss": 3.035597801208496, + "step": 3483, + "token_acc": 0.29741378162184046 + }, + { + "epoch": 2.0422163588390503, + "grad_norm": 0.38724176365213764, + "learning_rate": 0.0001389632094092759, + "loss": 3.091658115386963, + "step": 3484, + "token_acc": 0.2894869533898831 + }, + { + "epoch": 2.042802697156259, + "grad_norm": 0.37577560591828546, + "learning_rate": 0.0001389620457292294, + "loss": 3.089517116546631, + "step": 3485, + "token_acc": 0.2890385331066877 + }, + { + "epoch": 2.043389035473468, + "grad_norm": 0.40496128277998505, + "learning_rate": 0.00013896088140137735, + "loss": 3.055583953857422, + "step": 3486, + "token_acc": 0.29190633803571236 + }, + { + "epoch": 2.043975373790677, + "grad_norm": 0.38654520782815943, + "learning_rate": 0.0001389597164257308, + "loss": 3.0676355361938477, + "step": 3487, + "token_acc": 0.2928620365377452 + }, + { + "epoch": 2.0445617121078863, + "grad_norm": 0.4042467598816843, + "learning_rate": 0.00013895855080230064, + "loss": 3.0710344314575195, + "step": 3488, + "token_acc": 0.2913159185174591 + }, + { + "epoch": 2.0451480504250954, + "grad_norm": 0.40896869383979084, + "learning_rate": 0.00013895738453109782, + "loss": 3.030439853668213, + "step": 3489, + "token_acc": 0.29848190963871 + }, + { + "epoch": 2.0457343887423045, + "grad_norm": 0.38790729012197117, + "learning_rate": 0.00013895621761213329, + "loss": 3.0286054611206055, + "step": 3490, + "token_acc": 0.298629261849774 + }, + { + "epoch": 2.046320727059513, + "grad_norm": 0.4821222350377258, + "learning_rate": 0.000138955050045418, + "loss": 3.0593550205230713, + "step": 3491, + "token_acc": 0.2938112223168654 + }, + { + "epoch": 2.0469070653767223, + "grad_norm": 0.3991683610115054, + "learning_rate": 0.00013895388183096294, + "loss": 3.077314853668213, + "step": 3492, + "token_acc": 0.2910727055396934 + }, + { + "epoch": 2.0474934036939314, + "grad_norm": 0.38816021283949714, + "learning_rate": 0.0001389527129687791, + "loss": 3.0493855476379395, + "step": 3493, + "token_acc": 0.2961974649766511 + }, + { + "epoch": 2.0480797420111405, + "grad_norm": 0.42515795112143534, + "learning_rate": 0.00013895154345887738, + "loss": 3.0880775451660156, + "step": 3494, + "token_acc": 0.28967441239727537 + }, + { + "epoch": 2.0486660803283496, + "grad_norm": 0.42041345230269767, + "learning_rate": 0.00013895037330126887, + "loss": 3.0557007789611816, + "step": 3495, + "token_acc": 0.2934895826602533 + }, + { + "epoch": 2.0492524186455583, + "grad_norm": 0.4691868311427592, + "learning_rate": 0.0001389492024959645, + "loss": 3.0429744720458984, + "step": 3496, + "token_acc": 0.29481594598939076 + }, + { + "epoch": 2.0498387569627674, + "grad_norm": 0.3376588575930607, + "learning_rate": 0.00013894803104297528, + "loss": 3.0879642963409424, + "step": 3497, + "token_acc": 0.28765841274437315 + }, + { + "epoch": 2.0504250952799765, + "grad_norm": 0.4276939921884808, + "learning_rate": 0.0001389468589423122, + "loss": 3.081231117248535, + "step": 3498, + "token_acc": 0.29084729635138684 + }, + { + "epoch": 2.0510114335971856, + "grad_norm": 0.34471830057852365, + "learning_rate": 0.00013894568619398634, + "loss": 3.0540566444396973, + "step": 3499, + "token_acc": 0.2948839623729773 + }, + { + "epoch": 2.0515977719143947, + "grad_norm": 0.44890928496200116, + "learning_rate": 0.00013894451279800862, + "loss": 3.0594706535339355, + "step": 3500, + "token_acc": 0.2926022335164329 + }, + { + "epoch": 2.052184110231604, + "grad_norm": 0.4702387472093667, + "learning_rate": 0.0001389433387543901, + "loss": 3.055852174758911, + "step": 3501, + "token_acc": 0.29325221179875166 + }, + { + "epoch": 2.0527704485488125, + "grad_norm": 0.42990029925615225, + "learning_rate": 0.00013894216406314184, + "loss": 3.025768756866455, + "step": 3502, + "token_acc": 0.29865615843184806 + }, + { + "epoch": 2.0533567868660216, + "grad_norm": 0.411579069803052, + "learning_rate": 0.00013894098872427484, + "loss": 3.0794122219085693, + "step": 3503, + "token_acc": 0.2900157058680918 + }, + { + "epoch": 2.0539431251832307, + "grad_norm": 0.467918842582373, + "learning_rate": 0.00013893981273780016, + "loss": 3.0619492530822754, + "step": 3504, + "token_acc": 0.29292926595435015 + }, + { + "epoch": 2.05452946350044, + "grad_norm": 0.37465442090327244, + "learning_rate": 0.00013893863610372882, + "loss": 3.0856080055236816, + "step": 3505, + "token_acc": 0.2868833446702267 + }, + { + "epoch": 2.055115801817649, + "grad_norm": 0.42102381953460777, + "learning_rate": 0.00013893745882207192, + "loss": 3.043754816055298, + "step": 3506, + "token_acc": 0.2942270251350819 + }, + { + "epoch": 2.0557021401348576, + "grad_norm": 0.44549311869150143, + "learning_rate": 0.00013893628089284047, + "loss": 3.0618090629577637, + "step": 3507, + "token_acc": 0.2946323880661832 + }, + { + "epoch": 2.0562884784520667, + "grad_norm": 0.35740773778443635, + "learning_rate": 0.00013893510231604553, + "loss": 3.076803684234619, + "step": 3508, + "token_acc": 0.29123609625390756 + }, + { + "epoch": 2.056874816769276, + "grad_norm": 0.3784400651413219, + "learning_rate": 0.0001389339230916982, + "loss": 3.0753490924835205, + "step": 3509, + "token_acc": 0.2883057486822945 + }, + { + "epoch": 2.057461155086485, + "grad_norm": 0.3849374552498772, + "learning_rate": 0.0001389327432198096, + "loss": 3.0726709365844727, + "step": 3510, + "token_acc": 0.290370829344351 + }, + { + "epoch": 2.058047493403694, + "grad_norm": 0.4404796770604643, + "learning_rate": 0.00013893156270039072, + "loss": 3.0375819206237793, + "step": 3511, + "token_acc": 0.2958033796066766 + }, + { + "epoch": 2.0586338317209028, + "grad_norm": 0.4730945604544182, + "learning_rate": 0.00013893038153345273, + "loss": 3.049166679382324, + "step": 3512, + "token_acc": 0.29502331408790894 + }, + { + "epoch": 2.059220170038112, + "grad_norm": 0.4133791358890913, + "learning_rate": 0.00013892919971900664, + "loss": 3.0344483852386475, + "step": 3513, + "token_acc": 0.2953296739563779 + }, + { + "epoch": 2.059806508355321, + "grad_norm": 0.34088059367146384, + "learning_rate": 0.00013892801725706364, + "loss": 3.0450730323791504, + "step": 3514, + "token_acc": 0.2947151358972599 + }, + { + "epoch": 2.06039284667253, + "grad_norm": 0.38428536695216325, + "learning_rate": 0.0001389268341476348, + "loss": 3.056018352508545, + "step": 3515, + "token_acc": 0.29324597148110665 + }, + { + "epoch": 2.060979184989739, + "grad_norm": 0.4255846095671945, + "learning_rate": 0.0001389256503907312, + "loss": 3.088204860687256, + "step": 3516, + "token_acc": 0.28800695127823955 + }, + { + "epoch": 2.0615655233069483, + "grad_norm": 0.37318286696068487, + "learning_rate": 0.000138924465986364, + "loss": 3.052006721496582, + "step": 3517, + "token_acc": 0.2948661738991391 + }, + { + "epoch": 2.062151861624157, + "grad_norm": 0.34174364000427443, + "learning_rate": 0.00013892328093454437, + "loss": 3.0661933422088623, + "step": 3518, + "token_acc": 0.2916311754684838 + }, + { + "epoch": 2.062738199941366, + "grad_norm": 0.40610053314771744, + "learning_rate": 0.00013892209523528335, + "loss": 3.0514962673187256, + "step": 3519, + "token_acc": 0.29424710948531 + }, + { + "epoch": 2.063324538258575, + "grad_norm": 0.4574651245519017, + "learning_rate": 0.00013892090888859213, + "loss": 3.06376051902771, + "step": 3520, + "token_acc": 0.2930837772372374 + }, + { + "epoch": 2.0639108765757843, + "grad_norm": 0.4051548129017675, + "learning_rate": 0.00013891972189448182, + "loss": 3.058706045150757, + "step": 3521, + "token_acc": 0.29427995245536015 + }, + { + "epoch": 2.0644972148929934, + "grad_norm": 0.38283659334037207, + "learning_rate": 0.00013891853425296362, + "loss": 3.0689632892608643, + "step": 3522, + "token_acc": 0.2925337913132399 + }, + { + "epoch": 2.065083553210202, + "grad_norm": 0.33950106588876433, + "learning_rate": 0.00013891734596404865, + "loss": 3.07800030708313, + "step": 3523, + "token_acc": 0.29127493783386715 + }, + { + "epoch": 2.065669891527411, + "grad_norm": 0.42440471424385023, + "learning_rate": 0.0001389161570277481, + "loss": 3.028881549835205, + "step": 3524, + "token_acc": 0.29673678407855625 + }, + { + "epoch": 2.0662562298446203, + "grad_norm": 0.3842773131311537, + "learning_rate": 0.0001389149674440731, + "loss": 3.116633892059326, + "step": 3525, + "token_acc": 0.28492784461357995 + }, + { + "epoch": 2.0668425681618294, + "grad_norm": 0.3430151755366685, + "learning_rate": 0.00013891377721303485, + "loss": 3.076627492904663, + "step": 3526, + "token_acc": 0.2900480995451456 + }, + { + "epoch": 2.0674289064790385, + "grad_norm": 0.4204628725896369, + "learning_rate": 0.00013891258633464453, + "loss": 3.044167995452881, + "step": 3527, + "token_acc": 0.29622921560408216 + }, + { + "epoch": 2.068015244796247, + "grad_norm": 0.40267316373095247, + "learning_rate": 0.00013891139480891332, + "loss": 3.044188976287842, + "step": 3528, + "token_acc": 0.2951207391352991 + }, + { + "epoch": 2.0686015831134563, + "grad_norm": 0.33559651782703215, + "learning_rate": 0.0001389102026358524, + "loss": 3.0739214420318604, + "step": 3529, + "token_acc": 0.2914874652100133 + }, + { + "epoch": 2.0691879214306654, + "grad_norm": 0.45584418670165666, + "learning_rate": 0.000138909009815473, + "loss": 3.0521011352539062, + "step": 3530, + "token_acc": 0.29478772717634333 + }, + { + "epoch": 2.0697742597478745, + "grad_norm": 0.46365594539469024, + "learning_rate": 0.00013890781634778632, + "loss": 3.0737452507019043, + "step": 3531, + "token_acc": 0.2908484948112482 + }, + { + "epoch": 2.0703605980650837, + "grad_norm": 0.37207110847329805, + "learning_rate": 0.00013890662223280353, + "loss": 3.102999210357666, + "step": 3532, + "token_acc": 0.2869288937898953 + }, + { + "epoch": 2.0709469363822928, + "grad_norm": 0.4525946802934254, + "learning_rate": 0.00013890542747053587, + "loss": 3.0277554988861084, + "step": 3533, + "token_acc": 0.2964807436918991 + }, + { + "epoch": 2.0715332746995014, + "grad_norm": 0.4092387580535566, + "learning_rate": 0.0001389042320609946, + "loss": 3.0771327018737793, + "step": 3534, + "token_acc": 0.29068526704757497 + }, + { + "epoch": 2.0721196130167105, + "grad_norm": 0.4273660003873795, + "learning_rate": 0.0001389030360041909, + "loss": 3.0531277656555176, + "step": 3535, + "token_acc": 0.29315080261465654 + }, + { + "epoch": 2.0727059513339197, + "grad_norm": 0.4092966684668221, + "learning_rate": 0.00013890183930013607, + "loss": 3.0588302612304688, + "step": 3536, + "token_acc": 0.2933185386442119 + }, + { + "epoch": 2.0732922896511288, + "grad_norm": 0.3346312341655145, + "learning_rate": 0.00013890064194884127, + "loss": 3.0411734580993652, + "step": 3537, + "token_acc": 0.29643031464822606 + }, + { + "epoch": 2.073878627968338, + "grad_norm": 0.41731637847588315, + "learning_rate": 0.00013889944395031778, + "loss": 3.0604910850524902, + "step": 3538, + "token_acc": 0.29336089695763906 + }, + { + "epoch": 2.0744649662855466, + "grad_norm": 0.4045197174314917, + "learning_rate": 0.00013889824530457685, + "loss": 3.050107002258301, + "step": 3539, + "token_acc": 0.29259017311371377 + }, + { + "epoch": 2.0750513046027557, + "grad_norm": 0.452924669036322, + "learning_rate": 0.00013889704601162975, + "loss": 3.071323871612549, + "step": 3540, + "token_acc": 0.29121733355903073 + }, + { + "epoch": 2.0756376429199648, + "grad_norm": 0.40431479652835034, + "learning_rate": 0.00013889584607148776, + "loss": 3.0575058460235596, + "step": 3541, + "token_acc": 0.2928217953493322 + }, + { + "epoch": 2.076223981237174, + "grad_norm": 0.4364179452223577, + "learning_rate": 0.00013889464548416214, + "loss": 3.0586159229278564, + "step": 3542, + "token_acc": 0.2939687374584521 + }, + { + "epoch": 2.076810319554383, + "grad_norm": 0.3698869493118724, + "learning_rate": 0.00013889344424966414, + "loss": 3.067042350769043, + "step": 3543, + "token_acc": 0.29363012837152547 + }, + { + "epoch": 2.077396657871592, + "grad_norm": 0.36602209007964237, + "learning_rate": 0.00013889224236800508, + "loss": 3.101879119873047, + "step": 3544, + "token_acc": 0.28768401161940854 + }, + { + "epoch": 2.077982996188801, + "grad_norm": 0.37952766460468174, + "learning_rate": 0.00013889103983919621, + "loss": 3.030776023864746, + "step": 3545, + "token_acc": 0.29583906054285053 + }, + { + "epoch": 2.07856933450601, + "grad_norm": 0.36148176497114115, + "learning_rate": 0.00013888983666324889, + "loss": 3.088855266571045, + "step": 3546, + "token_acc": 0.2876598640157119 + }, + { + "epoch": 2.079155672823219, + "grad_norm": 0.4510779278604473, + "learning_rate": 0.00013888863284017438, + "loss": 3.083803653717041, + "step": 3547, + "token_acc": 0.289877606550837 + }, + { + "epoch": 2.079742011140428, + "grad_norm": 0.3980043338887096, + "learning_rate": 0.00013888742836998396, + "loss": 3.0812692642211914, + "step": 3548, + "token_acc": 0.2904634740702101 + }, + { + "epoch": 2.0803283494576372, + "grad_norm": 0.3860855959875072, + "learning_rate": 0.00013888622325268903, + "loss": 3.092256546020508, + "step": 3549, + "token_acc": 0.2872811571042955 + }, + { + "epoch": 2.080914687774846, + "grad_norm": 0.3875731991675352, + "learning_rate": 0.0001388850174883008, + "loss": 3.0609793663024902, + "step": 3550, + "token_acc": 0.29324398172260124 + }, + { + "epoch": 2.081501026092055, + "grad_norm": 0.39500752505142206, + "learning_rate": 0.0001388838110768307, + "loss": 3.043837070465088, + "step": 3551, + "token_acc": 0.29330793306630043 + }, + { + "epoch": 2.082087364409264, + "grad_norm": 0.41043470812820104, + "learning_rate": 0.00013888260401828998, + "loss": 3.0730392932891846, + "step": 3552, + "token_acc": 0.2917727452014937 + }, + { + "epoch": 2.0826737027264732, + "grad_norm": 0.3507368869803403, + "learning_rate": 0.00013888139631269004, + "loss": 3.0573110580444336, + "step": 3553, + "token_acc": 0.29385295510762593 + }, + { + "epoch": 2.0832600410436823, + "grad_norm": 0.4006798736851808, + "learning_rate": 0.0001388801879600422, + "loss": 3.04017972946167, + "step": 3554, + "token_acc": 0.29484467702849504 + }, + { + "epoch": 2.0838463793608915, + "grad_norm": 0.452925579882886, + "learning_rate": 0.0001388789789603578, + "loss": 3.0513229370117188, + "step": 3555, + "token_acc": 0.29312111376168315 + }, + { + "epoch": 2.0844327176781, + "grad_norm": 0.3943787766630467, + "learning_rate": 0.00013887776931364822, + "loss": 3.056554079055786, + "step": 3556, + "token_acc": 0.29287224832547554 + }, + { + "epoch": 2.0850190559953092, + "grad_norm": 0.3915269367099127, + "learning_rate": 0.0001388765590199248, + "loss": 3.0723562240600586, + "step": 3557, + "token_acc": 0.2902428742997579 + }, + { + "epoch": 2.0856053943125183, + "grad_norm": 0.37404432548508443, + "learning_rate": 0.00013887534807919893, + "loss": 3.0584282875061035, + "step": 3558, + "token_acc": 0.2935394568209129 + }, + { + "epoch": 2.0861917326297275, + "grad_norm": 0.30856376439782496, + "learning_rate": 0.00013887413649148197, + "loss": 3.0505573749542236, + "step": 3559, + "token_acc": 0.2943578396359768 + }, + { + "epoch": 2.0867780709469366, + "grad_norm": 0.36798856932035745, + "learning_rate": 0.00013887292425678532, + "loss": 3.0731148719787598, + "step": 3560, + "token_acc": 0.29178892447566895 + }, + { + "epoch": 2.0873644092641452, + "grad_norm": 0.35350458537987217, + "learning_rate": 0.00013887171137512034, + "loss": 3.055309295654297, + "step": 3561, + "token_acc": 0.293104949690331 + }, + { + "epoch": 2.0879507475813543, + "grad_norm": 0.35907991658595895, + "learning_rate": 0.00013887049784649843, + "loss": 3.0864431858062744, + "step": 3562, + "token_acc": 0.28949481990965836 + }, + { + "epoch": 2.0885370858985635, + "grad_norm": 0.3387628504731505, + "learning_rate": 0.000138869283670931, + "loss": 3.070488691329956, + "step": 3563, + "token_acc": 0.28914147570812276 + }, + { + "epoch": 2.0891234242157726, + "grad_norm": 0.41085522690362625, + "learning_rate": 0.00013886806884842945, + "loss": 3.0603437423706055, + "step": 3564, + "token_acc": 0.2938226088801655 + }, + { + "epoch": 2.0897097625329817, + "grad_norm": 0.2784734052620729, + "learning_rate": 0.0001388668533790052, + "loss": 3.1336379051208496, + "step": 3565, + "token_acc": 0.28183984697004927 + }, + { + "epoch": 2.0902961008501904, + "grad_norm": 0.3657858610358495, + "learning_rate": 0.0001388656372626697, + "loss": 2.970426559448242, + "step": 3566, + "token_acc": 0.3057965256164051 + }, + { + "epoch": 2.0908824391673995, + "grad_norm": 0.33229922196910266, + "learning_rate": 0.00013886442049943428, + "loss": 3.077293872833252, + "step": 3567, + "token_acc": 0.2906531923163472 + }, + { + "epoch": 2.0914687774846086, + "grad_norm": 0.39200059285857153, + "learning_rate": 0.00013886320308931045, + "loss": 3.0790364742279053, + "step": 3568, + "token_acc": 0.2907560388999268 + }, + { + "epoch": 2.0920551158018177, + "grad_norm": 0.37369921209068635, + "learning_rate": 0.00013886198503230962, + "loss": 3.0396976470947266, + "step": 3569, + "token_acc": 0.29564506130763907 + }, + { + "epoch": 2.092641454119027, + "grad_norm": 0.3805989472191098, + "learning_rate": 0.00013886076632844323, + "loss": 3.043625831604004, + "step": 3570, + "token_acc": 0.2953660149441935 + }, + { + "epoch": 2.093227792436236, + "grad_norm": 0.3238231588133131, + "learning_rate": 0.00013885954697772274, + "loss": 3.0459446907043457, + "step": 3571, + "token_acc": 0.2949544744920946 + }, + { + "epoch": 2.0938141307534446, + "grad_norm": 0.4877732099431555, + "learning_rate": 0.0001388583269801596, + "loss": 3.0441641807556152, + "step": 3572, + "token_acc": 0.29641167202564084 + }, + { + "epoch": 2.0944004690706537, + "grad_norm": 0.5199488882369063, + "learning_rate": 0.00013885710633576524, + "loss": 3.035614013671875, + "step": 3573, + "token_acc": 0.29613103306188393 + }, + { + "epoch": 2.094986807387863, + "grad_norm": 0.46036855836612933, + "learning_rate": 0.00013885588504455117, + "loss": 3.057079315185547, + "step": 3574, + "token_acc": 0.29274486286830265 + }, + { + "epoch": 2.095573145705072, + "grad_norm": 0.33080412061328, + "learning_rate": 0.00013885466310652883, + "loss": 3.0364432334899902, + "step": 3575, + "token_acc": 0.2961586461319034 + }, + { + "epoch": 2.096159484022281, + "grad_norm": 0.45210945309584005, + "learning_rate": 0.00013885344052170972, + "loss": 3.0727055072784424, + "step": 3576, + "token_acc": 0.2908903854303351 + }, + { + "epoch": 2.0967458223394897, + "grad_norm": 0.3771424159078343, + "learning_rate": 0.00013885221729010533, + "loss": 3.049459457397461, + "step": 3577, + "token_acc": 0.2953231878678615 + }, + { + "epoch": 2.097332160656699, + "grad_norm": 0.3946934157881624, + "learning_rate": 0.0001388509934117271, + "loss": 3.0266995429992676, + "step": 3578, + "token_acc": 0.2987828547495735 + }, + { + "epoch": 2.097918498973908, + "grad_norm": 0.33273365398867966, + "learning_rate": 0.0001388497688865866, + "loss": 3.123490333557129, + "step": 3579, + "token_acc": 0.2817662395590508 + }, + { + "epoch": 2.098504837291117, + "grad_norm": 0.4174170306956088, + "learning_rate": 0.0001388485437146953, + "loss": 3.0044331550598145, + "step": 3580, + "token_acc": 0.3007205021199537 + }, + { + "epoch": 2.099091175608326, + "grad_norm": 0.3448999335467518, + "learning_rate": 0.00013884731789606472, + "loss": 3.0394911766052246, + "step": 3581, + "token_acc": 0.2958471955070383 + }, + { + "epoch": 2.099677513925535, + "grad_norm": 0.35329633195899907, + "learning_rate": 0.00013884609143070633, + "loss": 3.0720462799072266, + "step": 3582, + "token_acc": 0.2900113201444666 + }, + { + "epoch": 2.100263852242744, + "grad_norm": 0.37334592023000984, + "learning_rate": 0.0001388448643186317, + "loss": 3.078434467315674, + "step": 3583, + "token_acc": 0.2910762291575339 + }, + { + "epoch": 2.100850190559953, + "grad_norm": 0.3627473373863117, + "learning_rate": 0.0001388436365598523, + "loss": 3.04420804977417, + "step": 3584, + "token_acc": 0.2952784095695392 + }, + { + "epoch": 2.101436528877162, + "grad_norm": 0.3164468872263793, + "learning_rate": 0.00013884240815437976, + "loss": 3.0036802291870117, + "step": 3585, + "token_acc": 0.3006704446313341 + }, + { + "epoch": 2.1020228671943713, + "grad_norm": 0.3262969368433032, + "learning_rate": 0.00013884117910222552, + "loss": 3.0708227157592773, + "step": 3586, + "token_acc": 0.29111566108961867 + }, + { + "epoch": 2.1026092055115804, + "grad_norm": 0.3423508577097571, + "learning_rate": 0.0001388399494034012, + "loss": 3.086907386779785, + "step": 3587, + "token_acc": 0.2894343090339478 + }, + { + "epoch": 2.103195543828789, + "grad_norm": 0.40907763832314026, + "learning_rate": 0.00013883871905791828, + "loss": 3.0891640186309814, + "step": 3588, + "token_acc": 0.28894577958187906 + }, + { + "epoch": 2.103781882145998, + "grad_norm": 0.32224690229854824, + "learning_rate": 0.00013883748806578839, + "loss": 3.0678625106811523, + "step": 3589, + "token_acc": 0.29178688159455496 + }, + { + "epoch": 2.1043682204632073, + "grad_norm": 0.35566718818039156, + "learning_rate": 0.00013883625642702304, + "loss": 3.0766353607177734, + "step": 3590, + "token_acc": 0.2908577307632113 + }, + { + "epoch": 2.1049545587804164, + "grad_norm": 0.3967392343762941, + "learning_rate": 0.0001388350241416338, + "loss": 3.0980453491210938, + "step": 3591, + "token_acc": 0.28833701831964625 + }, + { + "epoch": 2.1055408970976255, + "grad_norm": 0.3504420694306796, + "learning_rate": 0.0001388337912096323, + "loss": 3.1171092987060547, + "step": 3592, + "token_acc": 0.28501722384694816 + }, + { + "epoch": 2.106127235414834, + "grad_norm": 0.37889231818891056, + "learning_rate": 0.00013883255763103006, + "loss": 3.0386791229248047, + "step": 3593, + "token_acc": 0.2972305138742019 + }, + { + "epoch": 2.1067135737320433, + "grad_norm": 0.43377059479628705, + "learning_rate": 0.00013883132340583872, + "loss": 3.067584991455078, + "step": 3594, + "token_acc": 0.2896499366925756 + }, + { + "epoch": 2.1072999120492524, + "grad_norm": 0.42918857678919764, + "learning_rate": 0.00013883008853406986, + "loss": 3.099790096282959, + "step": 3595, + "token_acc": 0.28827267630419096 + }, + { + "epoch": 2.1078862503664615, + "grad_norm": 0.32746334629217966, + "learning_rate": 0.00013882885301573503, + "loss": 3.083308458328247, + "step": 3596, + "token_acc": 0.29072784104154187 + }, + { + "epoch": 2.1084725886836706, + "grad_norm": 0.4151508781766218, + "learning_rate": 0.00013882761685084588, + "loss": 3.0627522468566895, + "step": 3597, + "token_acc": 0.292556027769712 + }, + { + "epoch": 2.1090589270008797, + "grad_norm": 0.4501716845944949, + "learning_rate": 0.00013882638003941404, + "loss": 3.057436466217041, + "step": 3598, + "token_acc": 0.2941305450472346 + }, + { + "epoch": 2.1096452653180884, + "grad_norm": 0.3919677735952834, + "learning_rate": 0.00013882514258145107, + "loss": 3.1192445755004883, + "step": 3599, + "token_acc": 0.28294378713564294 + }, + { + "epoch": 2.1102316036352975, + "grad_norm": 0.37019183032269365, + "learning_rate": 0.00013882390447696866, + "loss": 3.0782432556152344, + "step": 3600, + "token_acc": 0.29042322837241563 + }, + { + "epoch": 2.1108179419525066, + "grad_norm": 0.4771789753368112, + "learning_rate": 0.0001388226657259784, + "loss": 3.047024726867676, + "step": 3601, + "token_acc": 0.294799740764744 + }, + { + "epoch": 2.1114042802697157, + "grad_norm": 0.37605110120237084, + "learning_rate": 0.00013882142632849192, + "loss": 3.0196657180786133, + "step": 3602, + "token_acc": 0.2976590542579202 + }, + { + "epoch": 2.111990618586925, + "grad_norm": 0.41957170706842967, + "learning_rate": 0.00013882018628452088, + "loss": 3.052840232849121, + "step": 3603, + "token_acc": 0.2946973446557537 + }, + { + "epoch": 2.1125769569041335, + "grad_norm": 0.33889499844363175, + "learning_rate": 0.00013881894559407694, + "loss": 3.0038328170776367, + "step": 3604, + "token_acc": 0.30226127079942483 + }, + { + "epoch": 2.1131632952213426, + "grad_norm": 0.39923432764794514, + "learning_rate": 0.00013881770425717174, + "loss": 3.0949344635009766, + "step": 3605, + "token_acc": 0.29052818669236946 + }, + { + "epoch": 2.1137496335385517, + "grad_norm": 0.36667302462685836, + "learning_rate": 0.00013881646227381693, + "loss": 3.0374503135681152, + "step": 3606, + "token_acc": 0.29694623471035303 + }, + { + "epoch": 2.114335971855761, + "grad_norm": 0.37789056269355675, + "learning_rate": 0.00013881521964402422, + "loss": 3.0434722900390625, + "step": 3607, + "token_acc": 0.2940331985108957 + }, + { + "epoch": 2.11492231017297, + "grad_norm": 0.38962330669679957, + "learning_rate": 0.0001388139763678052, + "loss": 3.0545945167541504, + "step": 3608, + "token_acc": 0.2938835956178853 + }, + { + "epoch": 2.115508648490179, + "grad_norm": 0.3773396580271617, + "learning_rate": 0.00013881273244517164, + "loss": 3.053905487060547, + "step": 3609, + "token_acc": 0.2932949371318769 + }, + { + "epoch": 2.1160949868073877, + "grad_norm": 0.34250857421263803, + "learning_rate": 0.00013881148787613516, + "loss": 3.0920209884643555, + "step": 3610, + "token_acc": 0.2891248425362829 + }, + { + "epoch": 2.116681325124597, + "grad_norm": 0.3474107552382277, + "learning_rate": 0.00013881024266070748, + "loss": 3.084031581878662, + "step": 3611, + "token_acc": 0.28883255354446663 + }, + { + "epoch": 2.117267663441806, + "grad_norm": 0.39087542899953936, + "learning_rate": 0.00013880899679890031, + "loss": 3.050617218017578, + "step": 3612, + "token_acc": 0.29192745571469875 + }, + { + "epoch": 2.117854001759015, + "grad_norm": 0.3555552888370427, + "learning_rate": 0.00013880775029072534, + "loss": 3.0233449935913086, + "step": 3613, + "token_acc": 0.29731515296103944 + }, + { + "epoch": 2.118440340076224, + "grad_norm": 0.3914464117314933, + "learning_rate": 0.00013880650313619425, + "loss": 3.0774118900299072, + "step": 3614, + "token_acc": 0.29087339597697964 + }, + { + "epoch": 2.119026678393433, + "grad_norm": 0.32859215092179267, + "learning_rate": 0.0001388052553353188, + "loss": 3.030160903930664, + "step": 3615, + "token_acc": 0.29745642403913275 + }, + { + "epoch": 2.119613016710642, + "grad_norm": 0.3400075420358624, + "learning_rate": 0.00013880400688811068, + "loss": 3.0234766006469727, + "step": 3616, + "token_acc": 0.29710982036347383 + }, + { + "epoch": 2.120199355027851, + "grad_norm": 0.39377774699355317, + "learning_rate": 0.00013880275779458163, + "loss": 3.097604274749756, + "step": 3617, + "token_acc": 0.2869141039236479 + }, + { + "epoch": 2.12078569334506, + "grad_norm": 0.37940735004322373, + "learning_rate": 0.0001388015080547434, + "loss": 3.059603452682495, + "step": 3618, + "token_acc": 0.29475008275405495 + }, + { + "epoch": 2.1213720316622693, + "grad_norm": 0.3778553124002791, + "learning_rate": 0.0001388002576686077, + "loss": 3.0681915283203125, + "step": 3619, + "token_acc": 0.2903755456685635 + }, + { + "epoch": 2.121958369979478, + "grad_norm": 0.38972075395246414, + "learning_rate": 0.00013879900663618628, + "loss": 3.0619359016418457, + "step": 3620, + "token_acc": 0.29286590394823103 + }, + { + "epoch": 2.122544708296687, + "grad_norm": 0.4404951896061724, + "learning_rate": 0.00013879775495749094, + "loss": 3.0906691551208496, + "step": 3621, + "token_acc": 0.2884586809314568 + }, + { + "epoch": 2.123131046613896, + "grad_norm": 0.4491328320037228, + "learning_rate": 0.00013879650263253336, + "loss": 3.0378637313842773, + "step": 3622, + "token_acc": 0.2951317392948691 + }, + { + "epoch": 2.1237173849311053, + "grad_norm": 0.4992025779950859, + "learning_rate": 0.00013879524966132535, + "loss": 3.06771183013916, + "step": 3623, + "token_acc": 0.2925594456247561 + }, + { + "epoch": 2.1243037232483144, + "grad_norm": 0.4082898380301269, + "learning_rate": 0.00013879399604387865, + "loss": 3.0601770877838135, + "step": 3624, + "token_acc": 0.2923739113098148 + }, + { + "epoch": 2.1248900615655235, + "grad_norm": 0.3675700665200735, + "learning_rate": 0.0001387927417802051, + "loss": 3.037097930908203, + "step": 3625, + "token_acc": 0.29715589483792104 + }, + { + "epoch": 2.125476399882732, + "grad_norm": 0.4303759949350231, + "learning_rate": 0.00013879148687031642, + "loss": 3.0430657863616943, + "step": 3626, + "token_acc": 0.29609048753618733 + }, + { + "epoch": 2.1260627381999413, + "grad_norm": 0.3946235400757452, + "learning_rate": 0.00013879023131422444, + "loss": 2.9939348697662354, + "step": 3627, + "token_acc": 0.30232217244255866 + }, + { + "epoch": 2.1266490765171504, + "grad_norm": 0.42086315973020916, + "learning_rate": 0.0001387889751119409, + "loss": 3.0546250343322754, + "step": 3628, + "token_acc": 0.29454897402044067 + }, + { + "epoch": 2.1272354148343595, + "grad_norm": 0.40593444058890665, + "learning_rate": 0.00013878771826347766, + "loss": 3.0519537925720215, + "step": 3629, + "token_acc": 0.29307148665361094 + }, + { + "epoch": 2.1278217531515686, + "grad_norm": 0.43234312908781897, + "learning_rate": 0.00013878646076884648, + "loss": 3.070136547088623, + "step": 3630, + "token_acc": 0.29147284405623647 + }, + { + "epoch": 2.1284080914687773, + "grad_norm": 0.4504799345237179, + "learning_rate": 0.00013878520262805918, + "loss": 3.082275390625, + "step": 3631, + "token_acc": 0.28904291979949875 + }, + { + "epoch": 2.1289944297859864, + "grad_norm": 0.4011962312662107, + "learning_rate": 0.0001387839438411276, + "loss": 3.054649591445923, + "step": 3632, + "token_acc": 0.29241864266511775 + }, + { + "epoch": 2.1295807681031955, + "grad_norm": 0.41912516080667495, + "learning_rate": 0.0001387826844080636, + "loss": 3.0400443077087402, + "step": 3633, + "token_acc": 0.2942002936560174 + }, + { + "epoch": 2.1301671064204046, + "grad_norm": 0.34348948494630654, + "learning_rate": 0.00013878142432887893, + "loss": 3.041868209838867, + "step": 3634, + "token_acc": 0.2953584830086854 + }, + { + "epoch": 2.1307534447376137, + "grad_norm": 0.4035827395360975, + "learning_rate": 0.00013878016360358545, + "loss": 3.068477153778076, + "step": 3635, + "token_acc": 0.29235573274084814 + }, + { + "epoch": 2.1313397830548224, + "grad_norm": 0.4262363695095578, + "learning_rate": 0.00013877890223219503, + "loss": 3.067061424255371, + "step": 3636, + "token_acc": 0.29295252343588357 + }, + { + "epoch": 2.1319261213720315, + "grad_norm": 0.40681922164514644, + "learning_rate": 0.0001387776402147195, + "loss": 3.0702812671661377, + "step": 3637, + "token_acc": 0.29150909513447276 + }, + { + "epoch": 2.1325124596892406, + "grad_norm": 0.33436231298634655, + "learning_rate": 0.00013877637755117073, + "loss": 3.0855891704559326, + "step": 3638, + "token_acc": 0.29016843959974226 + }, + { + "epoch": 2.1330987980064497, + "grad_norm": 0.38711956761857064, + "learning_rate": 0.00013877511424156057, + "loss": 3.0739312171936035, + "step": 3639, + "token_acc": 0.2899624537424705 + }, + { + "epoch": 2.133685136323659, + "grad_norm": 0.4318123819826134, + "learning_rate": 0.00013877385028590087, + "loss": 3.10980224609375, + "step": 3640, + "token_acc": 0.2851005864783999 + }, + { + "epoch": 2.134271474640868, + "grad_norm": 0.3819295474651742, + "learning_rate": 0.00013877258568420353, + "loss": 3.073606014251709, + "step": 3641, + "token_acc": 0.29272549139673015 + }, + { + "epoch": 2.1348578129580766, + "grad_norm": 0.3909055834300277, + "learning_rate": 0.00013877132043648043, + "loss": 3.070145606994629, + "step": 3642, + "token_acc": 0.29150916769321267 + }, + { + "epoch": 2.1354441512752858, + "grad_norm": 0.4353780446853067, + "learning_rate": 0.00013877005454274342, + "loss": 3.088529109954834, + "step": 3643, + "token_acc": 0.28969861550104614 + }, + { + "epoch": 2.136030489592495, + "grad_norm": 0.40087679951274807, + "learning_rate": 0.00013876878800300445, + "loss": 3.0913333892822266, + "step": 3644, + "token_acc": 0.28927806279925095 + }, + { + "epoch": 2.136616827909704, + "grad_norm": 0.33669371253807756, + "learning_rate": 0.00013876752081727536, + "loss": 3.0919599533081055, + "step": 3645, + "token_acc": 0.2889062207364724 + }, + { + "epoch": 2.137203166226913, + "grad_norm": 0.35344715790764947, + "learning_rate": 0.0001387662529855681, + "loss": 3.006959915161133, + "step": 3646, + "token_acc": 0.2983681689472595 + }, + { + "epoch": 2.1377895045441218, + "grad_norm": 0.37274571488313335, + "learning_rate": 0.00013876498450789452, + "loss": 3.0951483249664307, + "step": 3647, + "token_acc": 0.2883967273370232 + }, + { + "epoch": 2.138375842861331, + "grad_norm": 0.3487507334823329, + "learning_rate": 0.00013876371538426658, + "loss": 3.0230093002319336, + "step": 3648, + "token_acc": 0.29741157946052194 + }, + { + "epoch": 2.13896218117854, + "grad_norm": 0.3633333053378516, + "learning_rate": 0.00013876244561469622, + "loss": 3.100297689437866, + "step": 3649, + "token_acc": 0.287270521698186 + }, + { + "epoch": 2.139548519495749, + "grad_norm": 0.38050448528690217, + "learning_rate": 0.00013876117519919532, + "loss": 3.0393474102020264, + "step": 3650, + "token_acc": 0.2964029405267673 + }, + { + "epoch": 2.140134857812958, + "grad_norm": 0.33489754719401343, + "learning_rate": 0.00013875990413777584, + "loss": 3.0547125339508057, + "step": 3651, + "token_acc": 0.29423085012409034 + }, + { + "epoch": 2.1407211961301673, + "grad_norm": 0.3864930676077191, + "learning_rate": 0.00013875863243044973, + "loss": 3.066652297973633, + "step": 3652, + "token_acc": 0.29185848902732253 + }, + { + "epoch": 2.141307534447376, + "grad_norm": 0.3344110178839375, + "learning_rate": 0.0001387573600772289, + "loss": 3.1205050945281982, + "step": 3653, + "token_acc": 0.2840130959797463 + }, + { + "epoch": 2.141893872764585, + "grad_norm": 0.3657241972698941, + "learning_rate": 0.00013875608707812533, + "loss": 3.0484561920166016, + "step": 3654, + "token_acc": 0.294231117460062 + }, + { + "epoch": 2.142480211081794, + "grad_norm": 0.38402585340674356, + "learning_rate": 0.000138754813433151, + "loss": 3.075462818145752, + "step": 3655, + "token_acc": 0.2906715912341361 + }, + { + "epoch": 2.1430665493990033, + "grad_norm": 0.3275827527592282, + "learning_rate": 0.00013875353914231782, + "loss": 3.018284797668457, + "step": 3656, + "token_acc": 0.29906496223885964 + }, + { + "epoch": 2.1436528877162124, + "grad_norm": 0.395919189019556, + "learning_rate": 0.00013875226420563777, + "loss": 3.027825355529785, + "step": 3657, + "token_acc": 0.2975526942069318 + }, + { + "epoch": 2.144239226033421, + "grad_norm": 0.3511033846613789, + "learning_rate": 0.00013875098862312289, + "loss": 3.080601453781128, + "step": 3658, + "token_acc": 0.28909791947003416 + }, + { + "epoch": 2.14482556435063, + "grad_norm": 0.37160672064283234, + "learning_rate": 0.00013874971239478506, + "loss": 3.0602476596832275, + "step": 3659, + "token_acc": 0.29364957251806606 + }, + { + "epoch": 2.1454119026678393, + "grad_norm": 0.32056967862860675, + "learning_rate": 0.00013874843552063635, + "loss": 3.0788612365722656, + "step": 3660, + "token_acc": 0.28874581072454514 + }, + { + "epoch": 2.1459982409850484, + "grad_norm": 0.3783432078255706, + "learning_rate": 0.00013874715800068872, + "loss": 3.0443053245544434, + "step": 3661, + "token_acc": 0.29407903194483576 + }, + { + "epoch": 2.1465845793022575, + "grad_norm": 0.32572597395113556, + "learning_rate": 0.0001387458798349542, + "loss": 3.067356586456299, + "step": 3662, + "token_acc": 0.29309579411633363 + }, + { + "epoch": 2.1471709176194667, + "grad_norm": 0.3840179338080964, + "learning_rate": 0.00013874460102344477, + "loss": 3.085653066635132, + "step": 3663, + "token_acc": 0.2903970112341814 + }, + { + "epoch": 2.1477572559366753, + "grad_norm": 0.32948715681376284, + "learning_rate": 0.00013874332156617244, + "loss": 3.034700870513916, + "step": 3664, + "token_acc": 0.2972501068061382 + }, + { + "epoch": 2.1483435942538844, + "grad_norm": 0.3002745315299916, + "learning_rate": 0.00013874204146314923, + "loss": 3.0876808166503906, + "step": 3665, + "token_acc": 0.2901335086225356 + }, + { + "epoch": 2.1489299325710935, + "grad_norm": 0.39374262948818484, + "learning_rate": 0.00013874076071438717, + "loss": 3.0260844230651855, + "step": 3666, + "token_acc": 0.29816531209981106 + }, + { + "epoch": 2.1495162708883027, + "grad_norm": 0.37591434430094156, + "learning_rate": 0.0001387394793198983, + "loss": 3.052907943725586, + "step": 3667, + "token_acc": 0.2931241990449627 + }, + { + "epoch": 2.1501026092055118, + "grad_norm": 0.33043674151675834, + "learning_rate": 0.00013873819727969465, + "loss": 3.0446677207946777, + "step": 3668, + "token_acc": 0.2953654346596772 + }, + { + "epoch": 2.1506889475227204, + "grad_norm": 0.37498769926723785, + "learning_rate": 0.00013873691459378827, + "loss": 3.0363340377807617, + "step": 3669, + "token_acc": 0.2967423272458442 + }, + { + "epoch": 2.1512752858399296, + "grad_norm": 0.3508183158161332, + "learning_rate": 0.0001387356312621912, + "loss": 3.1133742332458496, + "step": 3670, + "token_acc": 0.28428066007282493 + }, + { + "epoch": 2.1518616241571387, + "grad_norm": 0.3543527107334875, + "learning_rate": 0.00013873434728491548, + "loss": 3.0330026149749756, + "step": 3671, + "token_acc": 0.29579688322244163 + }, + { + "epoch": 2.1524479624743478, + "grad_norm": 0.33057145508591396, + "learning_rate": 0.0001387330626619732, + "loss": 3.0613560676574707, + "step": 3672, + "token_acc": 0.2916567618965294 + }, + { + "epoch": 2.153034300791557, + "grad_norm": 0.34355229517349395, + "learning_rate": 0.00013873177739337644, + "loss": 3.050032138824463, + "step": 3673, + "token_acc": 0.294324142084322 + }, + { + "epoch": 2.1536206391087656, + "grad_norm": 0.3492257763401718, + "learning_rate": 0.0001387304914791372, + "loss": 3.108842134475708, + "step": 3674, + "token_acc": 0.28639746291683893 + }, + { + "epoch": 2.1542069774259747, + "grad_norm": 0.33297352386959567, + "learning_rate": 0.00013872920491926762, + "loss": 3.0299482345581055, + "step": 3675, + "token_acc": 0.2987976436751925 + }, + { + "epoch": 2.154793315743184, + "grad_norm": 0.3469590089013452, + "learning_rate": 0.0001387279177137798, + "loss": 3.055520534515381, + "step": 3676, + "token_acc": 0.29350654177817426 + }, + { + "epoch": 2.155379654060393, + "grad_norm": 0.3532602767974127, + "learning_rate": 0.00013872662986268578, + "loss": 3.06115460395813, + "step": 3677, + "token_acc": 0.2918595383746523 + }, + { + "epoch": 2.155965992377602, + "grad_norm": 0.3190345846968025, + "learning_rate": 0.0001387253413659977, + "loss": 3.0101656913757324, + "step": 3678, + "token_acc": 0.29874629311997813 + }, + { + "epoch": 2.1565523306948107, + "grad_norm": 0.37120519446301753, + "learning_rate": 0.00013872405222372766, + "loss": 3.0963406562805176, + "step": 3679, + "token_acc": 0.289283651833917 + }, + { + "epoch": 2.15713866901202, + "grad_norm": 0.3933495833608431, + "learning_rate": 0.0001387227624358877, + "loss": 3.0724759101867676, + "step": 3680, + "token_acc": 0.29197194906119844 + }, + { + "epoch": 2.157725007329229, + "grad_norm": 0.43429109210816436, + "learning_rate": 0.00013872147200249003, + "loss": 3.0453054904937744, + "step": 3681, + "token_acc": 0.295247251064555 + }, + { + "epoch": 2.158311345646438, + "grad_norm": 0.33834917267168285, + "learning_rate": 0.00013872018092354673, + "loss": 3.0735626220703125, + "step": 3682, + "token_acc": 0.29215881972332025 + }, + { + "epoch": 2.158897683963647, + "grad_norm": 0.3977111416940941, + "learning_rate": 0.00013871888919906992, + "loss": 3.1411757469177246, + "step": 3683, + "token_acc": 0.2817891549959399 + }, + { + "epoch": 2.1594840222808562, + "grad_norm": 0.46326847758176, + "learning_rate": 0.00013871759682907177, + "loss": 3.069664239883423, + "step": 3684, + "token_acc": 0.29289964153938336 + }, + { + "epoch": 2.160070360598065, + "grad_norm": 0.3535593276211105, + "learning_rate": 0.00013871630381356439, + "loss": 3.066467046737671, + "step": 3685, + "token_acc": 0.29108191423116314 + }, + { + "epoch": 2.160656698915274, + "grad_norm": 0.43580055453221866, + "learning_rate": 0.00013871501015255992, + "loss": 3.0497055053710938, + "step": 3686, + "token_acc": 0.2943328299480011 + }, + { + "epoch": 2.161243037232483, + "grad_norm": 0.4267797503908854, + "learning_rate": 0.00013871371584607052, + "loss": 3.049644947052002, + "step": 3687, + "token_acc": 0.2936733192970532 + }, + { + "epoch": 2.1618293755496922, + "grad_norm": 0.4167482931096908, + "learning_rate": 0.0001387124208941084, + "loss": 3.0985755920410156, + "step": 3688, + "token_acc": 0.28847724375853884 + }, + { + "epoch": 2.1624157138669013, + "grad_norm": 0.421703405790752, + "learning_rate": 0.00013871112529668562, + "loss": 3.0653538703918457, + "step": 3689, + "token_acc": 0.2905989746025681 + }, + { + "epoch": 2.16300205218411, + "grad_norm": 0.45062132450792347, + "learning_rate": 0.00013870982905381444, + "loss": 3.0211079120635986, + "step": 3690, + "token_acc": 0.2978757407921695 + }, + { + "epoch": 2.163588390501319, + "grad_norm": 0.3510442601763075, + "learning_rate": 0.00013870853216550697, + "loss": 3.0555906295776367, + "step": 3691, + "token_acc": 0.2955308475924564 + }, + { + "epoch": 2.1641747288185282, + "grad_norm": 0.36212792654951687, + "learning_rate": 0.00013870723463177547, + "loss": 3.085108995437622, + "step": 3692, + "token_acc": 0.28803588008665865 + }, + { + "epoch": 2.1647610671357373, + "grad_norm": 0.3353144039638235, + "learning_rate": 0.00013870593645263203, + "loss": 3.0589964389801025, + "step": 3693, + "token_acc": 0.2928838264015415 + }, + { + "epoch": 2.1653474054529465, + "grad_norm": 0.3944769159378328, + "learning_rate": 0.00013870463762808894, + "loss": 3.081333875656128, + "step": 3694, + "token_acc": 0.2892041853667369 + }, + { + "epoch": 2.1659337437701556, + "grad_norm": 0.38905277377201836, + "learning_rate": 0.00013870333815815835, + "loss": 3.085146427154541, + "step": 3695, + "token_acc": 0.29007323442682437 + }, + { + "epoch": 2.1665200820873642, + "grad_norm": 0.3540205292815972, + "learning_rate": 0.0001387020380428525, + "loss": 3.0569205284118652, + "step": 3696, + "token_acc": 0.2916895868682095 + }, + { + "epoch": 2.1671064204045734, + "grad_norm": 0.3807583869151187, + "learning_rate": 0.00013870073728218353, + "loss": 3.0659291744232178, + "step": 3697, + "token_acc": 0.29133554590652094 + }, + { + "epoch": 2.1676927587217825, + "grad_norm": 0.37312172865874976, + "learning_rate": 0.00013869943587616374, + "loss": 3.0327696800231934, + "step": 3698, + "token_acc": 0.29609208543366416 + }, + { + "epoch": 2.1682790970389916, + "grad_norm": 0.3548253832132521, + "learning_rate": 0.00013869813382480533, + "loss": 3.032527208328247, + "step": 3699, + "token_acc": 0.2986480769679966 + }, + { + "epoch": 2.1688654353562007, + "grad_norm": 0.4321092988735974, + "learning_rate": 0.0001386968311281205, + "loss": 3.0885863304138184, + "step": 3700, + "token_acc": 0.28938733016170054 + }, + { + "epoch": 2.1694517736734094, + "grad_norm": 0.344140991691171, + "learning_rate": 0.00013869552778612154, + "loss": 3.0269875526428223, + "step": 3701, + "token_acc": 0.2985170113688803 + }, + { + "epoch": 2.1700381119906185, + "grad_norm": 0.3870202579124156, + "learning_rate": 0.00013869422379882065, + "loss": 3.0187439918518066, + "step": 3702, + "token_acc": 0.29769086374038933 + }, + { + "epoch": 2.1706244503078276, + "grad_norm": 0.4179396836032735, + "learning_rate": 0.0001386929191662301, + "loss": 3.0423059463500977, + "step": 3703, + "token_acc": 0.29574282595093293 + }, + { + "epoch": 2.1712107886250367, + "grad_norm": 0.3295170274460673, + "learning_rate": 0.00013869161388836213, + "loss": 3.089437961578369, + "step": 3704, + "token_acc": 0.2878570626033268 + }, + { + "epoch": 2.171797126942246, + "grad_norm": 0.35577094804729825, + "learning_rate": 0.00013869030796522902, + "loss": 3.0802865028381348, + "step": 3705, + "token_acc": 0.29013122653135004 + }, + { + "epoch": 2.172383465259455, + "grad_norm": 0.3543273750504757, + "learning_rate": 0.000138689001396843, + "loss": 3.0879061222076416, + "step": 3706, + "token_acc": 0.2902451620743081 + }, + { + "epoch": 2.1729698035766636, + "grad_norm": 0.3756799407650146, + "learning_rate": 0.0001386876941832164, + "loss": 3.043858051300049, + "step": 3707, + "token_acc": 0.29578152848461425 + }, + { + "epoch": 2.1735561418938727, + "grad_norm": 0.3765999992318311, + "learning_rate": 0.00013868638632436148, + "loss": 3.061802864074707, + "step": 3708, + "token_acc": 0.2922717184099537 + }, + { + "epoch": 2.174142480211082, + "grad_norm": 0.3715973833120276, + "learning_rate": 0.00013868507782029049, + "loss": 3.0692548751831055, + "step": 3709, + "token_acc": 0.29245102748038776 + }, + { + "epoch": 2.174728818528291, + "grad_norm": 0.3814896201935661, + "learning_rate": 0.00013868376867101577, + "loss": 3.042123317718506, + "step": 3710, + "token_acc": 0.2942606733158249 + }, + { + "epoch": 2.1753151568455, + "grad_norm": 0.3967303330174871, + "learning_rate": 0.0001386824588765496, + "loss": 3.0813355445861816, + "step": 3711, + "token_acc": 0.29048891032580915 + }, + { + "epoch": 2.1759014951627087, + "grad_norm": 0.35923419579956684, + "learning_rate": 0.00013868114843690424, + "loss": 3.1104512214660645, + "step": 3712, + "token_acc": 0.286604014470948 + }, + { + "epoch": 2.176487833479918, + "grad_norm": 0.30757260834184413, + "learning_rate": 0.00013867983735209207, + "loss": 3.049495220184326, + "step": 3713, + "token_acc": 0.29556019413745854 + }, + { + "epoch": 2.177074171797127, + "grad_norm": 0.33801634603338193, + "learning_rate": 0.00013867852562212538, + "loss": 3.0556013584136963, + "step": 3714, + "token_acc": 0.2951947197433894 + }, + { + "epoch": 2.177660510114336, + "grad_norm": 0.36698135234241874, + "learning_rate": 0.00013867721324701648, + "loss": 3.056035041809082, + "step": 3715, + "token_acc": 0.2917627576678079 + }, + { + "epoch": 2.178246848431545, + "grad_norm": 0.4645937714476885, + "learning_rate": 0.0001386759002267777, + "loss": 3.0827581882476807, + "step": 3716, + "token_acc": 0.29038527802903047 + }, + { + "epoch": 2.1788331867487543, + "grad_norm": 0.4618781778891149, + "learning_rate": 0.00013867458656142138, + "loss": 3.0646395683288574, + "step": 3717, + "token_acc": 0.29132272786752156 + }, + { + "epoch": 2.179419525065963, + "grad_norm": 0.35099768923673663, + "learning_rate": 0.00013867327225095986, + "loss": 3.0314223766326904, + "step": 3718, + "token_acc": 0.29740863679962065 + }, + { + "epoch": 2.180005863383172, + "grad_norm": 0.32948210480712653, + "learning_rate": 0.0001386719572954055, + "loss": 3.0496826171875, + "step": 3719, + "token_acc": 0.2968249466973354 + }, + { + "epoch": 2.180592201700381, + "grad_norm": 0.37304514387205356, + "learning_rate": 0.00013867064169477062, + "loss": 3.0601322650909424, + "step": 3720, + "token_acc": 0.2912145168493548 + }, + { + "epoch": 2.1811785400175903, + "grad_norm": 0.3463072682259301, + "learning_rate": 0.0001386693254490676, + "loss": 3.0533735752105713, + "step": 3721, + "token_acc": 0.29294265823310406 + }, + { + "epoch": 2.1817648783347994, + "grad_norm": 0.31209225171639965, + "learning_rate": 0.00013866800855830881, + "loss": 3.0868101119995117, + "step": 3722, + "token_acc": 0.2904257002690299 + }, + { + "epoch": 2.182351216652008, + "grad_norm": 0.3312393845306281, + "learning_rate": 0.0001386666910225066, + "loss": 3.1240670680999756, + "step": 3723, + "token_acc": 0.2852433199737349 + }, + { + "epoch": 2.182937554969217, + "grad_norm": 0.4087886688339445, + "learning_rate": 0.00013866537284167336, + "loss": 3.076992988586426, + "step": 3724, + "token_acc": 0.2910729985434409 + }, + { + "epoch": 2.1835238932864263, + "grad_norm": 0.4687490233916709, + "learning_rate": 0.00013866405401582145, + "loss": 3.0642905235290527, + "step": 3725, + "token_acc": 0.2942508627933894 + }, + { + "epoch": 2.1841102316036354, + "grad_norm": 0.3552835404382854, + "learning_rate": 0.0001386627345449633, + "loss": 3.087520122528076, + "step": 3726, + "token_acc": 0.28893885173554235 + }, + { + "epoch": 2.1846965699208445, + "grad_norm": 0.3991863088209864, + "learning_rate": 0.0001386614144291113, + "loss": 3.087493419647217, + "step": 3727, + "token_acc": 0.28767080117338406 + }, + { + "epoch": 2.185282908238053, + "grad_norm": 0.3898983406995071, + "learning_rate": 0.0001386600936682778, + "loss": 3.0735762119293213, + "step": 3728, + "token_acc": 0.29093964826114865 + }, + { + "epoch": 2.1858692465552623, + "grad_norm": 0.40206861132769756, + "learning_rate": 0.00013865877226247527, + "loss": 3.095548629760742, + "step": 3729, + "token_acc": 0.2870144700738571 + }, + { + "epoch": 2.1864555848724714, + "grad_norm": 0.34366341482979906, + "learning_rate": 0.0001386574502117161, + "loss": 3.0830752849578857, + "step": 3730, + "token_acc": 0.2895688093784407 + }, + { + "epoch": 2.1870419231896805, + "grad_norm": 0.41961794331987, + "learning_rate": 0.00013865612751601266, + "loss": 3.0800833702087402, + "step": 3731, + "token_acc": 0.293000142140194 + }, + { + "epoch": 2.1876282615068896, + "grad_norm": 0.37030852390643393, + "learning_rate": 0.00013865480417537743, + "loss": 3.079542636871338, + "step": 3732, + "token_acc": 0.28971192783998323 + }, + { + "epoch": 2.1882145998240983, + "grad_norm": 0.4036872442160519, + "learning_rate": 0.00013865348018982283, + "loss": 3.060689926147461, + "step": 3733, + "token_acc": 0.29304401346467046 + }, + { + "epoch": 2.1888009381413074, + "grad_norm": 0.3725636356502434, + "learning_rate": 0.00013865215555936133, + "loss": 3.0625882148742676, + "step": 3734, + "token_acc": 0.2935405476843642 + }, + { + "epoch": 2.1893872764585165, + "grad_norm": 0.4327971640345311, + "learning_rate": 0.0001386508302840053, + "loss": 3.125330924987793, + "step": 3735, + "token_acc": 0.2839445367564527 + }, + { + "epoch": 2.1899736147757256, + "grad_norm": 0.3450048080568765, + "learning_rate": 0.00013864950436376724, + "loss": 3.0916874408721924, + "step": 3736, + "token_acc": 0.2909560544428036 + }, + { + "epoch": 2.1905599530929347, + "grad_norm": 0.4171269872296327, + "learning_rate": 0.0001386481777986596, + "loss": 3.073681116104126, + "step": 3737, + "token_acc": 0.29043564699882735 + }, + { + "epoch": 2.191146291410144, + "grad_norm": 0.41354734320136044, + "learning_rate": 0.00013864685058869483, + "loss": 3.051309108734131, + "step": 3738, + "token_acc": 0.29414511456133463 + }, + { + "epoch": 2.1917326297273525, + "grad_norm": 0.4149795160748362, + "learning_rate": 0.00013864552273388538, + "loss": 3.11872935295105, + "step": 3739, + "token_acc": 0.2837969252053957 + }, + { + "epoch": 2.1923189680445616, + "grad_norm": 0.40006614351374076, + "learning_rate": 0.00013864419423424376, + "loss": 3.099200487136841, + "step": 3740, + "token_acc": 0.28900737295264484 + }, + { + "epoch": 2.1929053063617707, + "grad_norm": 0.3304117022601082, + "learning_rate": 0.00013864286508978243, + "loss": 3.0832107067108154, + "step": 3741, + "token_acc": 0.29107287645927415 + }, + { + "epoch": 2.19349164467898, + "grad_norm": 0.3967139123174787, + "learning_rate": 0.0001386415353005139, + "loss": 3.075504779815674, + "step": 3742, + "token_acc": 0.2906743215250833 + }, + { + "epoch": 2.194077982996189, + "grad_norm": 0.3955180790135535, + "learning_rate": 0.00013864020486645061, + "loss": 3.087817907333374, + "step": 3743, + "token_acc": 0.2895387234920799 + }, + { + "epoch": 2.1946643213133976, + "grad_norm": 0.3786986022680428, + "learning_rate": 0.00013863887378760513, + "loss": 3.067044496536255, + "step": 3744, + "token_acc": 0.29155823321806706 + }, + { + "epoch": 2.1952506596306067, + "grad_norm": 0.42917368254151966, + "learning_rate": 0.0001386375420639899, + "loss": 3.091047763824463, + "step": 3745, + "token_acc": 0.2883813733294231 + }, + { + "epoch": 2.195836997947816, + "grad_norm": 0.3661432374750033, + "learning_rate": 0.00013863620969561746, + "loss": 3.1084158420562744, + "step": 3746, + "token_acc": 0.28657014222858895 + }, + { + "epoch": 2.196423336265025, + "grad_norm": 0.41026014686501694, + "learning_rate": 0.00013863487668250028, + "loss": 3.097309112548828, + "step": 3747, + "token_acc": 0.2882603008502289 + }, + { + "epoch": 2.197009674582234, + "grad_norm": 0.35200472358163115, + "learning_rate": 0.00013863354302465097, + "loss": 3.1072463989257812, + "step": 3748, + "token_acc": 0.28545067013854253 + }, + { + "epoch": 2.197596012899443, + "grad_norm": 0.3842943663948009, + "learning_rate": 0.000138632208722082, + "loss": 3.086787223815918, + "step": 3749, + "token_acc": 0.2899879663056558 + }, + { + "epoch": 2.198182351216652, + "grad_norm": 0.40559178945964, + "learning_rate": 0.00013863087377480587, + "loss": 3.1021575927734375, + "step": 3750, + "token_acc": 0.2876891923810838 + }, + { + "epoch": 2.198768689533861, + "grad_norm": 0.32600158488324765, + "learning_rate": 0.00013862953818283521, + "loss": 3.035088062286377, + "step": 3751, + "token_acc": 0.2969977958904364 + }, + { + "epoch": 2.19935502785107, + "grad_norm": 0.41950201319552866, + "learning_rate": 0.0001386282019461825, + "loss": 3.009361505508423, + "step": 3752, + "token_acc": 0.29905385561140896 + }, + { + "epoch": 2.199941366168279, + "grad_norm": 0.41036158603621264, + "learning_rate": 0.0001386268650648603, + "loss": 3.0970406532287598, + "step": 3753, + "token_acc": 0.28790319472812054 + }, + { + "epoch": 2.2005277044854883, + "grad_norm": 0.42631355562131457, + "learning_rate": 0.0001386255275388812, + "loss": 3.0976290702819824, + "step": 3754, + "token_acc": 0.28765802934411766 + }, + { + "epoch": 2.201114042802697, + "grad_norm": 0.33669948795740023, + "learning_rate": 0.00013862418936825773, + "loss": 3.024677276611328, + "step": 3755, + "token_acc": 0.29783821174522596 + }, + { + "epoch": 2.201700381119906, + "grad_norm": 0.4239687563564588, + "learning_rate": 0.0001386228505530025, + "loss": 3.0452051162719727, + "step": 3756, + "token_acc": 0.29310620430955475 + }, + { + "epoch": 2.202286719437115, + "grad_norm": 0.35681349673147544, + "learning_rate": 0.000138621511093128, + "loss": 3.065423011779785, + "step": 3757, + "token_acc": 0.292034497413203 + }, + { + "epoch": 2.2028730577543243, + "grad_norm": 0.4468831781078109, + "learning_rate": 0.00013862017098864693, + "loss": 3.0564935207366943, + "step": 3758, + "token_acc": 0.2935169793398243 + }, + { + "epoch": 2.2034593960715334, + "grad_norm": 0.3804211722185984, + "learning_rate": 0.0001386188302395718, + "loss": 3.0392606258392334, + "step": 3759, + "token_acc": 0.296826453617399 + }, + { + "epoch": 2.2040457343887425, + "grad_norm": 0.3595889514401457, + "learning_rate": 0.00013861748884591522, + "loss": 3.0608508586883545, + "step": 3760, + "token_acc": 0.2949711169896885 + }, + { + "epoch": 2.204632072705951, + "grad_norm": 0.34024012274823767, + "learning_rate": 0.0001386161468076898, + "loss": 3.032320737838745, + "step": 3761, + "token_acc": 0.2976641397230184 + }, + { + "epoch": 2.2052184110231603, + "grad_norm": 0.3707747622958935, + "learning_rate": 0.00013861480412490814, + "loss": 3.0459506511688232, + "step": 3762, + "token_acc": 0.29445863600929 + }, + { + "epoch": 2.2058047493403694, + "grad_norm": 0.4134574340058096, + "learning_rate": 0.00013861346079758284, + "loss": 3.0669643878936768, + "step": 3763, + "token_acc": 0.29578423548190225 + }, + { + "epoch": 2.2063910876575785, + "grad_norm": 0.3827924929971751, + "learning_rate": 0.00013861211682572656, + "loss": 3.0904345512390137, + "step": 3764, + "token_acc": 0.2895340594678232 + }, + { + "epoch": 2.2069774259747876, + "grad_norm": 0.3975125811324629, + "learning_rate": 0.0001386107722093519, + "loss": 3.062269687652588, + "step": 3765, + "token_acc": 0.29238909892798004 + }, + { + "epoch": 2.2075637642919963, + "grad_norm": 0.37070096987124396, + "learning_rate": 0.00013860942694847146, + "loss": 3.096532106399536, + "step": 3766, + "token_acc": 0.2885560858233541 + }, + { + "epoch": 2.2081501026092054, + "grad_norm": 0.3733632937318072, + "learning_rate": 0.00013860808104309793, + "loss": 3.0766539573669434, + "step": 3767, + "token_acc": 0.291986503811225 + }, + { + "epoch": 2.2087364409264145, + "grad_norm": 0.3861482177380816, + "learning_rate": 0.00013860673449324392, + "loss": 3.0388424396514893, + "step": 3768, + "token_acc": 0.29634862735533946 + }, + { + "epoch": 2.2093227792436236, + "grad_norm": 0.3328788152812705, + "learning_rate": 0.0001386053872989221, + "loss": 3.049903631210327, + "step": 3769, + "token_acc": 0.29628138347213584 + }, + { + "epoch": 2.2099091175608327, + "grad_norm": 0.3665032437250928, + "learning_rate": 0.0001386040394601451, + "loss": 3.0990562438964844, + "step": 3770, + "token_acc": 0.2877302443901239 + }, + { + "epoch": 2.210495455878042, + "grad_norm": 0.3935427250092516, + "learning_rate": 0.00013860269097692562, + "loss": 3.042750120162964, + "step": 3771, + "token_acc": 0.2950490217941688 + }, + { + "epoch": 2.2110817941952505, + "grad_norm": 0.3790368450262934, + "learning_rate": 0.00013860134184927626, + "loss": 3.087726593017578, + "step": 3772, + "token_acc": 0.2890953251907334 + }, + { + "epoch": 2.2116681325124596, + "grad_norm": 0.3329180179693466, + "learning_rate": 0.00013859999207720976, + "loss": 3.066281795501709, + "step": 3773, + "token_acc": 0.292274211671706 + }, + { + "epoch": 2.2122544708296688, + "grad_norm": 0.36837434853005435, + "learning_rate": 0.00013859864166073876, + "loss": 3.0711097717285156, + "step": 3774, + "token_acc": 0.29330375526361235 + }, + { + "epoch": 2.212840809146878, + "grad_norm": 0.34550041619504096, + "learning_rate": 0.00013859729059987596, + "loss": 3.068091869354248, + "step": 3775, + "token_acc": 0.2911572429221351 + }, + { + "epoch": 2.213427147464087, + "grad_norm": 0.34686041244132104, + "learning_rate": 0.0001385959388946341, + "loss": 3.0785999298095703, + "step": 3776, + "token_acc": 0.290350476396988 + }, + { + "epoch": 2.2140134857812956, + "grad_norm": 0.4427015523386205, + "learning_rate": 0.00013859458654502576, + "loss": 3.0862464904785156, + "step": 3777, + "token_acc": 0.2881395686791404 + }, + { + "epoch": 2.2145998240985048, + "grad_norm": 0.3380676468191562, + "learning_rate": 0.00013859323355106372, + "loss": 3.0615451335906982, + "step": 3778, + "token_acc": 0.29317260934898676 + }, + { + "epoch": 2.215186162415714, + "grad_norm": 0.32874635886861964, + "learning_rate": 0.0001385918799127607, + "loss": 3.0828118324279785, + "step": 3779, + "token_acc": 0.291167430230768 + }, + { + "epoch": 2.215772500732923, + "grad_norm": 0.35866249491572844, + "learning_rate": 0.00013859052563012941, + "loss": 3.0996599197387695, + "step": 3780, + "token_acc": 0.28767575704820325 + }, + { + "epoch": 2.216358839050132, + "grad_norm": 0.3356062617591622, + "learning_rate": 0.00013858917070318254, + "loss": 3.096961498260498, + "step": 3781, + "token_acc": 0.287208802327052 + }, + { + "epoch": 2.2169451773673408, + "grad_norm": 0.2866381658289626, + "learning_rate": 0.00013858781513193287, + "loss": 3.034224033355713, + "step": 3782, + "token_acc": 0.2949192544990444 + }, + { + "epoch": 2.21753151568455, + "grad_norm": 0.3422182396837577, + "learning_rate": 0.00013858645891639306, + "loss": 3.080829620361328, + "step": 3783, + "token_acc": 0.2900252162036785 + }, + { + "epoch": 2.218117854001759, + "grad_norm": 0.33660444229786707, + "learning_rate": 0.00013858510205657588, + "loss": 3.023237705230713, + "step": 3784, + "token_acc": 0.29838801844542784 + }, + { + "epoch": 2.218704192318968, + "grad_norm": 0.39360677680222356, + "learning_rate": 0.00013858374455249413, + "loss": 3.0547568798065186, + "step": 3785, + "token_acc": 0.29511132685257335 + }, + { + "epoch": 2.219290530636177, + "grad_norm": 0.36827221552212785, + "learning_rate": 0.00013858238640416048, + "loss": 3.0956835746765137, + "step": 3786, + "token_acc": 0.2875418816002425 + }, + { + "epoch": 2.219876868953386, + "grad_norm": 0.3436965245656828, + "learning_rate": 0.00013858102761158775, + "loss": 3.0983381271362305, + "step": 3787, + "token_acc": 0.2858509422076356 + }, + { + "epoch": 2.220463207270595, + "grad_norm": 0.3689248687425219, + "learning_rate": 0.0001385796681747887, + "loss": 3.0596258640289307, + "step": 3788, + "token_acc": 0.29293988195418114 + }, + { + "epoch": 2.221049545587804, + "grad_norm": 0.3554970149908357, + "learning_rate": 0.00013857830809377604, + "loss": 3.0135722160339355, + "step": 3789, + "token_acc": 0.29909582193842166 + }, + { + "epoch": 2.221635883905013, + "grad_norm": 0.3805101502623777, + "learning_rate": 0.00013857694736856257, + "loss": 3.064178466796875, + "step": 3790, + "token_acc": 0.29190498273818727 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.3787794242671034, + "learning_rate": 0.0001385755859991611, + "loss": 3.0673046112060547, + "step": 3791, + "token_acc": 0.29237323109282853 + }, + { + "epoch": 2.2228085605394314, + "grad_norm": 0.4147574363674826, + "learning_rate": 0.00013857422398558443, + "loss": 3.07576322555542, + "step": 3792, + "token_acc": 0.2910539248632925 + }, + { + "epoch": 2.22339489885664, + "grad_norm": 0.3503271229218247, + "learning_rate": 0.00013857286132784534, + "loss": 3.0468931198120117, + "step": 3793, + "token_acc": 0.2936412116479316 + }, + { + "epoch": 2.223981237173849, + "grad_norm": 0.3423016308594863, + "learning_rate": 0.0001385714980259566, + "loss": 3.043551445007324, + "step": 3794, + "token_acc": 0.29536154241155327 + }, + { + "epoch": 2.2245675754910583, + "grad_norm": 0.37258745494018086, + "learning_rate": 0.00013857013407993105, + "loss": 3.0831363201141357, + "step": 3795, + "token_acc": 0.2908149930779283 + }, + { + "epoch": 2.2251539138082674, + "grad_norm": 0.3218087826637603, + "learning_rate": 0.00013856876948978146, + "loss": 3.094085216522217, + "step": 3796, + "token_acc": 0.28828291987823046 + }, + { + "epoch": 2.2257402521254765, + "grad_norm": 0.42376246305839144, + "learning_rate": 0.00013856740425552072, + "loss": 3.100907802581787, + "step": 3797, + "token_acc": 0.2863208623837981 + }, + { + "epoch": 2.226326590442685, + "grad_norm": 0.3653197680591713, + "learning_rate": 0.00013856603837716157, + "loss": 3.112792730331421, + "step": 3798, + "token_acc": 0.28421200187256507 + }, + { + "epoch": 2.2269129287598943, + "grad_norm": 0.3814786710666813, + "learning_rate": 0.00013856467185471692, + "loss": 3.0908679962158203, + "step": 3799, + "token_acc": 0.28907967837651527 + }, + { + "epoch": 2.2274992670771034, + "grad_norm": 0.3453140554524636, + "learning_rate": 0.00013856330468819955, + "loss": 3.0593624114990234, + "step": 3800, + "token_acc": 0.29216386284833695 + }, + { + "epoch": 2.2280856053943126, + "grad_norm": 0.31385205146097345, + "learning_rate": 0.00013856193687762232, + "loss": 3.0305235385894775, + "step": 3801, + "token_acc": 0.2981718995738585 + }, + { + "epoch": 2.2286719437115217, + "grad_norm": 0.3659994272375126, + "learning_rate": 0.0001385605684229981, + "loss": 3.0897340774536133, + "step": 3802, + "token_acc": 0.28929190256612264 + }, + { + "epoch": 2.2292582820287308, + "grad_norm": 0.30963311315338893, + "learning_rate": 0.0001385591993243397, + "loss": 3.054443597793579, + "step": 3803, + "token_acc": 0.29312993051564706 + }, + { + "epoch": 2.2298446203459394, + "grad_norm": 0.3463074572278685, + "learning_rate": 0.00013855782958166005, + "loss": 3.0653886795043945, + "step": 3804, + "token_acc": 0.2917200269782454 + }, + { + "epoch": 2.2304309586631486, + "grad_norm": 0.34667633162718664, + "learning_rate": 0.00013855645919497192, + "loss": 3.0768229961395264, + "step": 3805, + "token_acc": 0.2915734249270794 + }, + { + "epoch": 2.2310172969803577, + "grad_norm": 0.33314599253969146, + "learning_rate": 0.00013855508816428827, + "loss": 3.0449976921081543, + "step": 3806, + "token_acc": 0.2950463091015041 + }, + { + "epoch": 2.231603635297567, + "grad_norm": 0.34785222809156313, + "learning_rate": 0.00013855371648962192, + "loss": 3.125851631164551, + "step": 3807, + "token_acc": 0.284079295875442 + }, + { + "epoch": 2.232189973614776, + "grad_norm": 0.37300313620739417, + "learning_rate": 0.0001385523441709858, + "loss": 3.060353994369507, + "step": 3808, + "token_acc": 0.29168526977367176 + }, + { + "epoch": 2.2327763119319846, + "grad_norm": 0.3257428815474043, + "learning_rate": 0.00013855097120839277, + "loss": 3.0714316368103027, + "step": 3809, + "token_acc": 0.29021098976499515 + }, + { + "epoch": 2.2333626502491937, + "grad_norm": 0.3591617873836489, + "learning_rate": 0.00013854959760185577, + "loss": 3.0790562629699707, + "step": 3810, + "token_acc": 0.2906487716918742 + }, + { + "epoch": 2.233948988566403, + "grad_norm": 0.36750578779158793, + "learning_rate": 0.00013854822335138765, + "loss": 3.053553581237793, + "step": 3811, + "token_acc": 0.293160701344496 + }, + { + "epoch": 2.234535326883612, + "grad_norm": 0.354244645765748, + "learning_rate": 0.00013854684845700135, + "loss": 3.044809341430664, + "step": 3812, + "token_acc": 0.29472120966653165 + }, + { + "epoch": 2.235121665200821, + "grad_norm": 0.3537797132358028, + "learning_rate": 0.00013854547291870976, + "loss": 3.1144967079162598, + "step": 3813, + "token_acc": 0.28588642402521164 + }, + { + "epoch": 2.23570800351803, + "grad_norm": 0.34587562793021065, + "learning_rate": 0.00013854409673652585, + "loss": 3.0583996772766113, + "step": 3814, + "token_acc": 0.2921480357488666 + }, + { + "epoch": 2.236294341835239, + "grad_norm": 0.27271027611091136, + "learning_rate": 0.00013854271991046248, + "loss": 3.108921766281128, + "step": 3815, + "token_acc": 0.28559805766750174 + }, + { + "epoch": 2.236880680152448, + "grad_norm": 0.36674670409614357, + "learning_rate": 0.00013854134244053264, + "loss": 3.035670757293701, + "step": 3816, + "token_acc": 0.2959816894867647 + }, + { + "epoch": 2.237467018469657, + "grad_norm": 0.35812017812214175, + "learning_rate": 0.00013853996432674923, + "loss": 3.0336666107177734, + "step": 3817, + "token_acc": 0.29797116441456 + }, + { + "epoch": 2.238053356786866, + "grad_norm": 0.306802967751694, + "learning_rate": 0.0001385385855691252, + "loss": 3.0306594371795654, + "step": 3818, + "token_acc": 0.2985627244938917 + }, + { + "epoch": 2.2386396951040752, + "grad_norm": 0.3605161758675863, + "learning_rate": 0.00013853720616767358, + "loss": 3.050117015838623, + "step": 3819, + "token_acc": 0.2942519705658994 + }, + { + "epoch": 2.239226033421284, + "grad_norm": 0.3523337502953437, + "learning_rate": 0.0001385358261224072, + "loss": 3.0416762828826904, + "step": 3820, + "token_acc": 0.29354767315696 + }, + { + "epoch": 2.239812371738493, + "grad_norm": 0.35480552714582975, + "learning_rate": 0.0001385344454333391, + "loss": 3.0634946823120117, + "step": 3821, + "token_acc": 0.2935524139277311 + }, + { + "epoch": 2.240398710055702, + "grad_norm": 0.3873114416482302, + "learning_rate": 0.00013853306410048228, + "loss": 3.0801591873168945, + "step": 3822, + "token_acc": 0.2903039762956692 + }, + { + "epoch": 2.2409850483729112, + "grad_norm": 0.4681869828660558, + "learning_rate": 0.00013853168212384962, + "loss": 3.0847954750061035, + "step": 3823, + "token_acc": 0.2892951614161893 + }, + { + "epoch": 2.2415713866901203, + "grad_norm": 0.45669102775933923, + "learning_rate": 0.00013853029950345417, + "loss": 3.0518932342529297, + "step": 3824, + "token_acc": 0.2955840327827603 + }, + { + "epoch": 2.2421577250073295, + "grad_norm": 0.47172488476489194, + "learning_rate": 0.0001385289162393089, + "loss": 3.0572433471679688, + "step": 3825, + "token_acc": 0.2922535027116924 + }, + { + "epoch": 2.242744063324538, + "grad_norm": 0.3479425232860213, + "learning_rate": 0.00013852753233142682, + "loss": 3.0926454067230225, + "step": 3826, + "token_acc": 0.2883968647597815 + }, + { + "epoch": 2.2433304016417472, + "grad_norm": 0.38904355744361435, + "learning_rate": 0.00013852614777982091, + "loss": 3.0816333293914795, + "step": 3827, + "token_acc": 0.28780635743070976 + }, + { + "epoch": 2.2439167399589564, + "grad_norm": 0.3534492471823705, + "learning_rate": 0.00013852476258450417, + "loss": 3.0962390899658203, + "step": 3828, + "token_acc": 0.28809711886251194 + }, + { + "epoch": 2.2445030782761655, + "grad_norm": 0.3680981354693585, + "learning_rate": 0.00013852337674548964, + "loss": 3.023831844329834, + "step": 3829, + "token_acc": 0.296342772439174 + }, + { + "epoch": 2.2450894165933746, + "grad_norm": 0.316223346961509, + "learning_rate": 0.00013852199026279032, + "loss": 3.069500684738159, + "step": 3830, + "token_acc": 0.2908919037384017 + }, + { + "epoch": 2.2456757549105832, + "grad_norm": 0.36365936128137066, + "learning_rate": 0.00013852060313641925, + "loss": 3.0748047828674316, + "step": 3831, + "token_acc": 0.2905084953877795 + }, + { + "epoch": 2.2462620932277924, + "grad_norm": 0.3868203613551509, + "learning_rate": 0.00013851921536638942, + "loss": 3.091153144836426, + "step": 3832, + "token_acc": 0.28856550970349376 + }, + { + "epoch": 2.2468484315450015, + "grad_norm": 0.30695453576793913, + "learning_rate": 0.00013851782695271388, + "loss": 3.058513879776001, + "step": 3833, + "token_acc": 0.2913674202236187 + }, + { + "epoch": 2.2474347698622106, + "grad_norm": 0.320822947984259, + "learning_rate": 0.00013851643789540569, + "loss": 3.0583903789520264, + "step": 3834, + "token_acc": 0.2920655363837905 + }, + { + "epoch": 2.2480211081794197, + "grad_norm": 0.3334916738846431, + "learning_rate": 0.00013851504819447792, + "loss": 3.0836739540100098, + "step": 3835, + "token_acc": 0.29022244823974863 + }, + { + "epoch": 2.2486074464966284, + "grad_norm": 0.3624911257458761, + "learning_rate": 0.0001385136578499436, + "loss": 3.0421528816223145, + "step": 3836, + "token_acc": 0.29452986498941275 + }, + { + "epoch": 2.2491937848138375, + "grad_norm": 0.3842212216548702, + "learning_rate": 0.00013851226686181577, + "loss": 3.086411952972412, + "step": 3837, + "token_acc": 0.28994420623030426 + }, + { + "epoch": 2.2497801231310466, + "grad_norm": 0.4396663585584787, + "learning_rate": 0.00013851087523010752, + "loss": 3.0686631202697754, + "step": 3838, + "token_acc": 0.29272678780324857 + }, + { + "epoch": 2.2503664614482557, + "grad_norm": 0.355390869958925, + "learning_rate": 0.0001385094829548319, + "loss": 3.051602363586426, + "step": 3839, + "token_acc": 0.29505380423899413 + }, + { + "epoch": 2.250952799765465, + "grad_norm": 0.3815435667966809, + "learning_rate": 0.000138508090036002, + "loss": 3.065894365310669, + "step": 3840, + "token_acc": 0.29146291946183334 + }, + { + "epoch": 2.2515391380826735, + "grad_norm": 0.3516208026766999, + "learning_rate": 0.00013850669647363093, + "loss": 3.0497169494628906, + "step": 3841, + "token_acc": 0.2940542736515152 + }, + { + "epoch": 2.2521254763998826, + "grad_norm": 0.31093523820407654, + "learning_rate": 0.00013850530226773176, + "loss": 3.0835113525390625, + "step": 3842, + "token_acc": 0.2908220698964202 + }, + { + "epoch": 2.2527118147170917, + "grad_norm": 0.4287870879387594, + "learning_rate": 0.0001385039074183176, + "loss": 3.081928253173828, + "step": 3843, + "token_acc": 0.2903566596912226 + }, + { + "epoch": 2.253298153034301, + "grad_norm": 0.4632590715839384, + "learning_rate": 0.00013850251192540152, + "loss": 3.0524673461914062, + "step": 3844, + "token_acc": 0.29384110166209704 + }, + { + "epoch": 2.25388449135151, + "grad_norm": 0.3257951862837143, + "learning_rate": 0.00013850111578899666, + "loss": 3.063847064971924, + "step": 3845, + "token_acc": 0.2917265757605643 + }, + { + "epoch": 2.254470829668719, + "grad_norm": 0.33257806071671053, + "learning_rate": 0.00013849971900911612, + "loss": 3.0801315307617188, + "step": 3846, + "token_acc": 0.28987972922420996 + }, + { + "epoch": 2.2550571679859277, + "grad_norm": 0.42810361042950146, + "learning_rate": 0.000138498321585773, + "loss": 3.082514762878418, + "step": 3847, + "token_acc": 0.2906722680729772 + }, + { + "epoch": 2.255643506303137, + "grad_norm": 0.3730060321896698, + "learning_rate": 0.0001384969235189805, + "loss": 3.0572128295898438, + "step": 3848, + "token_acc": 0.29362732349731757 + }, + { + "epoch": 2.256229844620346, + "grad_norm": 0.31898839106236154, + "learning_rate": 0.00013849552480875167, + "loss": 3.0367794036865234, + "step": 3849, + "token_acc": 0.2955675304435111 + }, + { + "epoch": 2.256816182937555, + "grad_norm": 0.40894332547215756, + "learning_rate": 0.0001384941254550997, + "loss": 3.0966711044311523, + "step": 3850, + "token_acc": 0.2863093326149351 + }, + { + "epoch": 2.257402521254764, + "grad_norm": 0.41981508593598515, + "learning_rate": 0.0001384927254580377, + "loss": 3.0757436752319336, + "step": 3851, + "token_acc": 0.2913163342910259 + }, + { + "epoch": 2.257988859571973, + "grad_norm": 0.38290068043357256, + "learning_rate": 0.00013849132481757887, + "loss": 3.0381224155426025, + "step": 3852, + "token_acc": 0.2966919877779664 + }, + { + "epoch": 2.258575197889182, + "grad_norm": 0.35797918625685216, + "learning_rate": 0.0001384899235337363, + "loss": 3.084138870239258, + "step": 3853, + "token_acc": 0.28986087759626694 + }, + { + "epoch": 2.259161536206391, + "grad_norm": 0.4266088980771241, + "learning_rate": 0.0001384885216065232, + "loss": 3.0886075496673584, + "step": 3854, + "token_acc": 0.2905526597888183 + }, + { + "epoch": 2.2597478745236, + "grad_norm": 0.37619670303682273, + "learning_rate": 0.00013848711903595274, + "loss": 3.059359550476074, + "step": 3855, + "token_acc": 0.2928408096947423 + }, + { + "epoch": 2.2603342128408093, + "grad_norm": 0.3277123481096555, + "learning_rate": 0.00013848571582203808, + "loss": 3.043466567993164, + "step": 3856, + "token_acc": 0.2938416138551594 + }, + { + "epoch": 2.2609205511580184, + "grad_norm": 0.4149954854926732, + "learning_rate": 0.0001384843119647924, + "loss": 3.0852713584899902, + "step": 3857, + "token_acc": 0.2891124940460165 + }, + { + "epoch": 2.261506889475227, + "grad_norm": 0.35594866015117504, + "learning_rate": 0.0001384829074642289, + "loss": 3.083378314971924, + "step": 3858, + "token_acc": 0.2878920332445703 + }, + { + "epoch": 2.262093227792436, + "grad_norm": 0.3596749401916214, + "learning_rate": 0.00013848150232036077, + "loss": 3.0727322101593018, + "step": 3859, + "token_acc": 0.2911177220569486 + }, + { + "epoch": 2.2626795661096453, + "grad_norm": 0.31384890629757456, + "learning_rate": 0.00013848009653320118, + "loss": 3.056255340576172, + "step": 3860, + "token_acc": 0.2937604992254056 + }, + { + "epoch": 2.2632659044268544, + "grad_norm": 0.3693556109221701, + "learning_rate": 0.00013847869010276338, + "loss": 3.0808157920837402, + "step": 3861, + "token_acc": 0.2905301360985094 + }, + { + "epoch": 2.2638522427440635, + "grad_norm": 0.28549176013999433, + "learning_rate": 0.00013847728302906058, + "loss": 3.045353889465332, + "step": 3862, + "token_acc": 0.2952277911751287 + }, + { + "epoch": 2.264438581061272, + "grad_norm": 0.34361119960744657, + "learning_rate": 0.00013847587531210596, + "loss": 3.008805990219116, + "step": 3863, + "token_acc": 0.30027796456972666 + }, + { + "epoch": 2.2650249193784813, + "grad_norm": 0.3438879524843519, + "learning_rate": 0.00013847446695191277, + "loss": 3.0809240341186523, + "step": 3864, + "token_acc": 0.2918190527653998 + }, + { + "epoch": 2.2656112576956904, + "grad_norm": 0.4481729237126814, + "learning_rate": 0.00013847305794849422, + "loss": 3.112427234649658, + "step": 3865, + "token_acc": 0.2845875798051524 + }, + { + "epoch": 2.2661975960128995, + "grad_norm": 0.4158568425389823, + "learning_rate": 0.00013847164830186356, + "loss": 3.0549936294555664, + "step": 3866, + "token_acc": 0.29271134958212686 + }, + { + "epoch": 2.2667839343301086, + "grad_norm": 0.38058550616335823, + "learning_rate": 0.00013847023801203404, + "loss": 3.058981418609619, + "step": 3867, + "token_acc": 0.29141808410219733 + }, + { + "epoch": 2.2673702726473177, + "grad_norm": 0.3847258988173558, + "learning_rate": 0.00013846882707901892, + "loss": 3.02866530418396, + "step": 3868, + "token_acc": 0.29838261944418254 + }, + { + "epoch": 2.2679566109645264, + "grad_norm": 0.3297279481140381, + "learning_rate": 0.0001384674155028314, + "loss": 3.075563430786133, + "step": 3869, + "token_acc": 0.29091768170118565 + }, + { + "epoch": 2.2685429492817355, + "grad_norm": 0.38615549597048326, + "learning_rate": 0.00013846600328348477, + "loss": 3.053814649581909, + "step": 3870, + "token_acc": 0.29585497798604427 + }, + { + "epoch": 2.2691292875989446, + "grad_norm": 0.3442695087994908, + "learning_rate": 0.00013846459042099232, + "loss": 3.1013071537017822, + "step": 3871, + "token_acc": 0.2869471977404085 + }, + { + "epoch": 2.2697156259161537, + "grad_norm": 0.3491753752334351, + "learning_rate": 0.0001384631769153673, + "loss": 3.0239322185516357, + "step": 3872, + "token_acc": 0.29861107402799214 + }, + { + "epoch": 2.270301964233363, + "grad_norm": 0.31757208227376005, + "learning_rate": 0.00013846176276662296, + "loss": 3.0704989433288574, + "step": 3873, + "token_acc": 0.2924044996861807 + }, + { + "epoch": 2.2708883025505715, + "grad_norm": 0.33313170151748045, + "learning_rate": 0.00013846034797477264, + "loss": 3.104428291320801, + "step": 3874, + "token_acc": 0.2878942808498635 + }, + { + "epoch": 2.2714746408677806, + "grad_norm": 0.3178161077129083, + "learning_rate": 0.0001384589325398296, + "loss": 3.0700361728668213, + "step": 3875, + "token_acc": 0.2909593489251511 + }, + { + "epoch": 2.2720609791849897, + "grad_norm": 0.2934594505903017, + "learning_rate": 0.00013845751646180714, + "loss": 3.0353221893310547, + "step": 3876, + "token_acc": 0.29744088931800133 + }, + { + "epoch": 2.272647317502199, + "grad_norm": 0.30526435293479054, + "learning_rate": 0.00013845609974071855, + "loss": 3.0820460319519043, + "step": 3877, + "token_acc": 0.28996609024412423 + }, + { + "epoch": 2.273233655819408, + "grad_norm": 0.31425111242090714, + "learning_rate": 0.00013845468237657716, + "loss": 3.1019535064697266, + "step": 3878, + "token_acc": 0.28769741474700733 + }, + { + "epoch": 2.273819994136617, + "grad_norm": 0.3695404059605474, + "learning_rate": 0.00013845326436939627, + "loss": 3.0397562980651855, + "step": 3879, + "token_acc": 0.2965638262058948 + }, + { + "epoch": 2.2744063324538257, + "grad_norm": 0.44180940491657555, + "learning_rate": 0.0001384518457191892, + "loss": 3.0991034507751465, + "step": 3880, + "token_acc": 0.28856595254454437 + }, + { + "epoch": 2.274992670771035, + "grad_norm": 0.4338804599948091, + "learning_rate": 0.00013845042642596927, + "loss": 3.0665316581726074, + "step": 3881, + "token_acc": 0.293010533815073 + }, + { + "epoch": 2.275579009088244, + "grad_norm": 0.3490998043387444, + "learning_rate": 0.0001384490064897498, + "loss": 3.045743703842163, + "step": 3882, + "token_acc": 0.29341090544365217 + }, + { + "epoch": 2.276165347405453, + "grad_norm": 0.4132368175385659, + "learning_rate": 0.00013844758591054421, + "loss": 3.07841157913208, + "step": 3883, + "token_acc": 0.2898292054126155 + }, + { + "epoch": 2.2767516857226617, + "grad_norm": 0.3615695796033046, + "learning_rate": 0.00013844616468836575, + "loss": 3.060102939605713, + "step": 3884, + "token_acc": 0.2920097947675321 + }, + { + "epoch": 2.277338024039871, + "grad_norm": 0.4054110198104882, + "learning_rate": 0.0001384447428232278, + "loss": 3.0752532482147217, + "step": 3885, + "token_acc": 0.2910475907166496 + }, + { + "epoch": 2.27792436235708, + "grad_norm": 0.33057903400938954, + "learning_rate": 0.00013844332031514374, + "loss": 3.1007637977600098, + "step": 3886, + "token_acc": 0.28695548521250813 + }, + { + "epoch": 2.278510700674289, + "grad_norm": 0.36419562016706897, + "learning_rate": 0.00013844189716412692, + "loss": 3.071690559387207, + "step": 3887, + "token_acc": 0.29020181790171007 + }, + { + "epoch": 2.279097038991498, + "grad_norm": 0.3637926280923247, + "learning_rate": 0.00013844047337019066, + "loss": 3.104456663131714, + "step": 3888, + "token_acc": 0.2894872674894673 + }, + { + "epoch": 2.2796833773087073, + "grad_norm": 0.3604741317456975, + "learning_rate": 0.00013843904893334842, + "loss": 3.102541208267212, + "step": 3889, + "token_acc": 0.2872509046429846 + }, + { + "epoch": 2.280269715625916, + "grad_norm": 0.3754668565514983, + "learning_rate": 0.00013843762385361353, + "loss": 3.069880962371826, + "step": 3890, + "token_acc": 0.2927078214191207 + }, + { + "epoch": 2.280856053943125, + "grad_norm": 0.303617069769762, + "learning_rate": 0.00013843619813099937, + "loss": 3.011373281478882, + "step": 3891, + "token_acc": 0.29978096577400526 + }, + { + "epoch": 2.281442392260334, + "grad_norm": 0.40685262449068804, + "learning_rate": 0.00013843477176551935, + "loss": 3.123256206512451, + "step": 3892, + "token_acc": 0.2862076523765991 + }, + { + "epoch": 2.2820287305775433, + "grad_norm": 0.3755941747882267, + "learning_rate": 0.0001384333447571869, + "loss": 3.0905954837799072, + "step": 3893, + "token_acc": 0.28829916841627495 + }, + { + "epoch": 2.2826150688947524, + "grad_norm": 0.3174386471742889, + "learning_rate": 0.00013843191710601535, + "loss": 3.072826385498047, + "step": 3894, + "token_acc": 0.2911789489711005 + }, + { + "epoch": 2.283201407211961, + "grad_norm": 0.3469984076873514, + "learning_rate": 0.00013843048881201814, + "loss": 3.0636138916015625, + "step": 3895, + "token_acc": 0.29226631896340965 + }, + { + "epoch": 2.28378774552917, + "grad_norm": 0.34896365888440845, + "learning_rate": 0.00013842905987520874, + "loss": 3.0686473846435547, + "step": 3896, + "token_acc": 0.29159882774340606 + }, + { + "epoch": 2.2843740838463793, + "grad_norm": 0.3258553343889736, + "learning_rate": 0.0001384276302956005, + "loss": 3.091007947921753, + "step": 3897, + "token_acc": 0.2896524105242369 + }, + { + "epoch": 2.2849604221635884, + "grad_norm": 0.30338385508668364, + "learning_rate": 0.00013842620007320692, + "loss": 3.0607547760009766, + "step": 3898, + "token_acc": 0.29499240410669963 + }, + { + "epoch": 2.2855467604807975, + "grad_norm": 0.3059890838355351, + "learning_rate": 0.00013842476920804137, + "loss": 3.099848508834839, + "step": 3899, + "token_acc": 0.28921514187038666 + }, + { + "epoch": 2.2861330987980066, + "grad_norm": 0.3102744394809184, + "learning_rate": 0.0001384233377001173, + "loss": 3.082677125930786, + "step": 3900, + "token_acc": 0.289320009697583 + }, + { + "epoch": 2.2867194371152153, + "grad_norm": 0.29937684176137513, + "learning_rate": 0.0001384219055494482, + "loss": 3.088576078414917, + "step": 3901, + "token_acc": 0.28855555080122375 + }, + { + "epoch": 2.2873057754324244, + "grad_norm": 0.3386098938733586, + "learning_rate": 0.00013842047275604752, + "loss": 3.0769803524017334, + "step": 3902, + "token_acc": 0.2905053338666287 + }, + { + "epoch": 2.2878921137496335, + "grad_norm": 0.3349865834286317, + "learning_rate": 0.00013841903931992866, + "loss": 3.060807228088379, + "step": 3903, + "token_acc": 0.29270408163265305 + }, + { + "epoch": 2.2884784520668426, + "grad_norm": 0.31807035820314544, + "learning_rate": 0.00013841760524110512, + "loss": 3.031798839569092, + "step": 3904, + "token_acc": 0.2985571181041039 + }, + { + "epoch": 2.2890647903840518, + "grad_norm": 0.39711321733173727, + "learning_rate": 0.00013841617051959038, + "loss": 3.1210246086120605, + "step": 3905, + "token_acc": 0.28511430981812086 + }, + { + "epoch": 2.2896511287012604, + "grad_norm": 0.3421190504437334, + "learning_rate": 0.0001384147351553979, + "loss": 3.0354483127593994, + "step": 3906, + "token_acc": 0.295446856931972 + }, + { + "epoch": 2.2902374670184695, + "grad_norm": 0.3453646496565421, + "learning_rate": 0.0001384132991485412, + "loss": 3.059192180633545, + "step": 3907, + "token_acc": 0.2917562575985586 + }, + { + "epoch": 2.2908238053356786, + "grad_norm": 0.39893429695253313, + "learning_rate": 0.00013841186249903371, + "loss": 3.053541898727417, + "step": 3908, + "token_acc": 0.29299291704644403 + }, + { + "epoch": 2.2914101436528878, + "grad_norm": 0.2959880178171732, + "learning_rate": 0.00013841042520688898, + "loss": 3.0286571979522705, + "step": 3909, + "token_acc": 0.2968736752861382 + }, + { + "epoch": 2.291996481970097, + "grad_norm": 0.4179579431544344, + "learning_rate": 0.0001384089872721205, + "loss": 3.0871942043304443, + "step": 3910, + "token_acc": 0.2885968491943638 + }, + { + "epoch": 2.292582820287306, + "grad_norm": 0.3633772333212466, + "learning_rate": 0.00013840754869474172, + "loss": 3.0757594108581543, + "step": 3911, + "token_acc": 0.29027082514859265 + }, + { + "epoch": 2.2931691586045146, + "grad_norm": 0.3988546266852141, + "learning_rate": 0.00013840610947476626, + "loss": 3.055666446685791, + "step": 3912, + "token_acc": 0.29214874077468944 + }, + { + "epoch": 2.2937554969217238, + "grad_norm": 0.3655916915048003, + "learning_rate": 0.00013840466961220755, + "loss": 3.061676502227783, + "step": 3913, + "token_acc": 0.2922934236818837 + }, + { + "epoch": 2.294341835238933, + "grad_norm": 0.403967399510546, + "learning_rate": 0.00013840322910707914, + "loss": 3.06962251663208, + "step": 3914, + "token_acc": 0.2916321716179497 + }, + { + "epoch": 2.294928173556142, + "grad_norm": 0.3998517520050068, + "learning_rate": 0.00013840178795939458, + "loss": 3.0726213455200195, + "step": 3915, + "token_acc": 0.29180546651140277 + }, + { + "epoch": 2.295514511873351, + "grad_norm": 0.35832908035239197, + "learning_rate": 0.0001384003461691674, + "loss": 3.040363311767578, + "step": 3916, + "token_acc": 0.29413685398832634 + }, + { + "epoch": 2.2961008501905598, + "grad_norm": 0.37753816086791764, + "learning_rate": 0.00013839890373641112, + "loss": 3.0885636806488037, + "step": 3917, + "token_acc": 0.28859214979101183 + }, + { + "epoch": 2.296687188507769, + "grad_norm": 0.44766841565925836, + "learning_rate": 0.0001383974606611393, + "loss": 3.1028342247009277, + "step": 3918, + "token_acc": 0.28623087550660276 + }, + { + "epoch": 2.297273526824978, + "grad_norm": 0.42745971793668625, + "learning_rate": 0.00013839601694336551, + "loss": 3.0878496170043945, + "step": 3919, + "token_acc": 0.288244458340521 + }, + { + "epoch": 2.297859865142187, + "grad_norm": 0.42854937699000895, + "learning_rate": 0.00013839457258310332, + "loss": 3.0849950313568115, + "step": 3920, + "token_acc": 0.2897338053422831 + }, + { + "epoch": 2.298446203459396, + "grad_norm": 0.3725991936571028, + "learning_rate": 0.00013839312758036628, + "loss": 3.086827039718628, + "step": 3921, + "token_acc": 0.29129848655594054 + }, + { + "epoch": 2.2990325417766053, + "grad_norm": 0.3285078645432387, + "learning_rate": 0.00013839168193516797, + "loss": 3.0785536766052246, + "step": 3922, + "token_acc": 0.2902410825161649 + }, + { + "epoch": 2.299618880093814, + "grad_norm": 0.3182123846032993, + "learning_rate": 0.00013839023564752196, + "loss": 3.070664167404175, + "step": 3923, + "token_acc": 0.2925844299782157 + }, + { + "epoch": 2.300205218411023, + "grad_norm": 0.3117010777522154, + "learning_rate": 0.00013838878871744184, + "loss": 3.079596996307373, + "step": 3924, + "token_acc": 0.2895881279674898 + }, + { + "epoch": 2.300791556728232, + "grad_norm": 0.3178648079353162, + "learning_rate": 0.0001383873411449412, + "loss": 3.0349442958831787, + "step": 3925, + "token_acc": 0.2950673775901584 + }, + { + "epoch": 2.3013778950454413, + "grad_norm": 0.3647867046323575, + "learning_rate": 0.00013838589293003366, + "loss": 3.08089017868042, + "step": 3926, + "token_acc": 0.2898600162031057 + }, + { + "epoch": 2.3019642333626504, + "grad_norm": 0.43829197298564454, + "learning_rate": 0.00013838444407273282, + "loss": 3.0685341358184814, + "step": 3927, + "token_acc": 0.28976690624627727 + }, + { + "epoch": 2.302550571679859, + "grad_norm": 0.4551887875512502, + "learning_rate": 0.00013838299457305224, + "loss": 3.033466339111328, + "step": 3928, + "token_acc": 0.29643042041125167 + }, + { + "epoch": 2.303136909997068, + "grad_norm": 0.33368281754810836, + "learning_rate": 0.0001383815444310056, + "loss": 3.0512032508850098, + "step": 3929, + "token_acc": 0.2940415111336609 + }, + { + "epoch": 2.3037232483142773, + "grad_norm": 0.4647301235279653, + "learning_rate": 0.00013838009364660646, + "loss": 3.0661444664001465, + "step": 3930, + "token_acc": 0.29198844437566096 + }, + { + "epoch": 2.3043095866314864, + "grad_norm": 0.34507979452575765, + "learning_rate": 0.00013837864221986852, + "loss": 3.0297296047210693, + "step": 3931, + "token_acc": 0.2966106792174789 + }, + { + "epoch": 2.3048959249486956, + "grad_norm": 0.368975645597791, + "learning_rate": 0.00013837719015080536, + "loss": 3.0287535190582275, + "step": 3932, + "token_acc": 0.2963790024254367 + }, + { + "epoch": 2.3054822632659047, + "grad_norm": 0.30871522169311133, + "learning_rate": 0.00013837573743943066, + "loss": 3.074644088745117, + "step": 3933, + "token_acc": 0.2908353147937558 + }, + { + "epoch": 2.3060686015831133, + "grad_norm": 0.3678700493138395, + "learning_rate": 0.00013837428408575804, + "loss": 3.0418813228607178, + "step": 3934, + "token_acc": 0.2938495740142057 + }, + { + "epoch": 2.3066549399003224, + "grad_norm": 0.30303871732615073, + "learning_rate": 0.00013837283008980112, + "loss": 3.0823864936828613, + "step": 3935, + "token_acc": 0.2888338936620595 + }, + { + "epoch": 2.3072412782175316, + "grad_norm": 0.38016233836064756, + "learning_rate": 0.00013837137545157362, + "loss": 3.0457088947296143, + "step": 3936, + "token_acc": 0.29506458164500604 + }, + { + "epoch": 2.3078276165347407, + "grad_norm": 0.411991614263233, + "learning_rate": 0.00013836992017108918, + "loss": 3.1036641597747803, + "step": 3937, + "token_acc": 0.28654254114060923 + }, + { + "epoch": 2.3084139548519493, + "grad_norm": 0.41491000002170797, + "learning_rate": 0.00013836846424836147, + "loss": 3.08748197555542, + "step": 3938, + "token_acc": 0.28838537894717003 + }, + { + "epoch": 2.3090002931691584, + "grad_norm": 0.29636185087528, + "learning_rate": 0.00013836700768340418, + "loss": 3.067830801010132, + "step": 3939, + "token_acc": 0.2922965308006859 + }, + { + "epoch": 2.3095866314863676, + "grad_norm": 0.3966499283124552, + "learning_rate": 0.00013836555047623094, + "loss": 3.094179153442383, + "step": 3940, + "token_acc": 0.28791486188003784 + }, + { + "epoch": 2.3101729698035767, + "grad_norm": 0.3775647204566085, + "learning_rate": 0.00013836409262685552, + "loss": 3.0462324619293213, + "step": 3941, + "token_acc": 0.2934821221097166 + }, + { + "epoch": 2.310759308120786, + "grad_norm": 0.30155060987588705, + "learning_rate": 0.00013836263413529153, + "loss": 3.0329737663269043, + "step": 3942, + "token_acc": 0.2971600507031549 + }, + { + "epoch": 2.311345646437995, + "grad_norm": 0.3334891402063084, + "learning_rate": 0.00013836117500155276, + "loss": 3.073120594024658, + "step": 3943, + "token_acc": 0.2929092131736944 + }, + { + "epoch": 2.3119319847552036, + "grad_norm": 0.31005464660951365, + "learning_rate": 0.00013835971522565283, + "loss": 3.038090229034424, + "step": 3944, + "token_acc": 0.29504287315280564 + }, + { + "epoch": 2.3125183230724127, + "grad_norm": 0.326270648546799, + "learning_rate": 0.0001383582548076055, + "loss": 3.0682320594787598, + "step": 3945, + "token_acc": 0.2906784853156251 + }, + { + "epoch": 2.313104661389622, + "grad_norm": 0.37340992572194265, + "learning_rate": 0.0001383567937474245, + "loss": 3.0554776191711426, + "step": 3946, + "token_acc": 0.29302766593064034 + }, + { + "epoch": 2.313690999706831, + "grad_norm": 0.30478096597378435, + "learning_rate": 0.00013835533204512352, + "loss": 3.1246190071105957, + "step": 3947, + "token_acc": 0.28467782616304804 + }, + { + "epoch": 2.31427733802404, + "grad_norm": 0.3516625512867287, + "learning_rate": 0.0001383538697007163, + "loss": 3.0827436447143555, + "step": 3948, + "token_acc": 0.2911390433468841 + }, + { + "epoch": 2.3148636763412487, + "grad_norm": 0.35923479240990774, + "learning_rate": 0.0001383524067142166, + "loss": 3.048055648803711, + "step": 3949, + "token_acc": 0.294329034163631 + }, + { + "epoch": 2.315450014658458, + "grad_norm": 0.3066055700570638, + "learning_rate": 0.0001383509430856381, + "loss": 3.1093990802764893, + "step": 3950, + "token_acc": 0.286667708369197 + }, + { + "epoch": 2.316036352975667, + "grad_norm": 0.3508106270957589, + "learning_rate": 0.00013834947881499464, + "loss": 3.0936403274536133, + "step": 3951, + "token_acc": 0.2883760622977634 + }, + { + "epoch": 2.316622691292876, + "grad_norm": 0.33554112783780493, + "learning_rate": 0.0001383480139022999, + "loss": 3.068114757537842, + "step": 3952, + "token_acc": 0.2914524538521919 + }, + { + "epoch": 2.317209029610085, + "grad_norm": 0.3260311605565729, + "learning_rate": 0.0001383465483475677, + "loss": 3.047159194946289, + "step": 3953, + "token_acc": 0.2940285214385092 + }, + { + "epoch": 2.3177953679272942, + "grad_norm": 0.3462536981459243, + "learning_rate": 0.00013834508215081177, + "loss": 3.0494046211242676, + "step": 3954, + "token_acc": 0.2942017254451936 + }, + { + "epoch": 2.318381706244503, + "grad_norm": 0.371036549300742, + "learning_rate": 0.00013834361531204586, + "loss": 3.0578203201293945, + "step": 3955, + "token_acc": 0.2923353354895015 + }, + { + "epoch": 2.318968044561712, + "grad_norm": 0.39886508513731955, + "learning_rate": 0.00013834214783128382, + "loss": 3.0986618995666504, + "step": 3956, + "token_acc": 0.2892249934691217 + }, + { + "epoch": 2.319554382878921, + "grad_norm": 0.3136540350075441, + "learning_rate": 0.00013834067970853935, + "loss": 3.0377135276794434, + "step": 3957, + "token_acc": 0.2965276396117156 + }, + { + "epoch": 2.3201407211961302, + "grad_norm": 0.35490422535823235, + "learning_rate": 0.0001383392109438263, + "loss": 3.1068856716156006, + "step": 3958, + "token_acc": 0.285132527133979 + }, + { + "epoch": 2.3207270595133394, + "grad_norm": 0.3777219580936742, + "learning_rate": 0.0001383377415371585, + "loss": 3.081146240234375, + "step": 3959, + "token_acc": 0.2904690630744562 + }, + { + "epoch": 2.321313397830548, + "grad_norm": 0.27863890762171567, + "learning_rate": 0.00013833627148854963, + "loss": 3.0828027725219727, + "step": 3960, + "token_acc": 0.2909300430742364 + }, + { + "epoch": 2.321899736147757, + "grad_norm": 0.4312174518679636, + "learning_rate": 0.00013833480079801361, + "loss": 3.087311267852783, + "step": 3961, + "token_acc": 0.2890677292503347 + }, + { + "epoch": 2.3224860744649662, + "grad_norm": 0.3575567588294255, + "learning_rate": 0.0001383333294655642, + "loss": 3.0697579383850098, + "step": 3962, + "token_acc": 0.29103192475824613 + }, + { + "epoch": 2.3230724127821754, + "grad_norm": 0.36566810034540165, + "learning_rate": 0.00013833185749121527, + "loss": 3.0293309688568115, + "step": 3963, + "token_acc": 0.2974110679844744 + }, + { + "epoch": 2.3236587510993845, + "grad_norm": 0.3734852795516362, + "learning_rate": 0.00013833038487498058, + "loss": 3.0859031677246094, + "step": 3964, + "token_acc": 0.28922819046027637 + }, + { + "epoch": 2.3242450894165936, + "grad_norm": 0.37806452367667803, + "learning_rate": 0.000138328911616874, + "loss": 3.032557249069214, + "step": 3965, + "token_acc": 0.29631383692477703 + }, + { + "epoch": 2.3248314277338022, + "grad_norm": 0.3650133085647914, + "learning_rate": 0.00013832743771690942, + "loss": 3.071094036102295, + "step": 3966, + "token_acc": 0.29140922084741 + }, + { + "epoch": 2.3254177660510114, + "grad_norm": 0.4120146141710093, + "learning_rate": 0.00013832596317510062, + "loss": 3.092928886413574, + "step": 3967, + "token_acc": 0.2880726216119415 + }, + { + "epoch": 2.3260041043682205, + "grad_norm": 0.38112498813787155, + "learning_rate": 0.00013832448799146145, + "loss": 3.0728306770324707, + "step": 3968, + "token_acc": 0.29063424270070537 + }, + { + "epoch": 2.3265904426854296, + "grad_norm": 0.3477489528267661, + "learning_rate": 0.0001383230121660058, + "loss": 3.093782424926758, + "step": 3969, + "token_acc": 0.2889436555570175 + }, + { + "epoch": 2.3271767810026387, + "grad_norm": 0.34481013484783046, + "learning_rate": 0.00013832153569874747, + "loss": 3.083939552307129, + "step": 3970, + "token_acc": 0.28950840923117444 + }, + { + "epoch": 2.3277631193198474, + "grad_norm": 0.3933006452051719, + "learning_rate": 0.0001383200585897004, + "loss": 3.081540107727051, + "step": 3971, + "token_acc": 0.2901016091660821 + }, + { + "epoch": 2.3283494576370565, + "grad_norm": 0.3686053933521343, + "learning_rate": 0.00013831858083887847, + "loss": 3.0922231674194336, + "step": 3972, + "token_acc": 0.28701522513767413 + }, + { + "epoch": 2.3289357959542656, + "grad_norm": 0.3605850353219423, + "learning_rate": 0.00013831710244629553, + "loss": 3.1031837463378906, + "step": 3973, + "token_acc": 0.28654869258099214 + }, + { + "epoch": 2.3295221342714747, + "grad_norm": 0.38370149366935163, + "learning_rate": 0.00013831562341196544, + "loss": 3.1072187423706055, + "step": 3974, + "token_acc": 0.2846763574531618 + }, + { + "epoch": 2.330108472588684, + "grad_norm": 0.3735128990342422, + "learning_rate": 0.00013831414373590215, + "loss": 3.06618332862854, + "step": 3975, + "token_acc": 0.2922717001196331 + }, + { + "epoch": 2.330694810905893, + "grad_norm": 0.2993427822473872, + "learning_rate": 0.0001383126634181195, + "loss": 3.0443787574768066, + "step": 3976, + "token_acc": 0.295177867991284 + }, + { + "epoch": 2.3312811492231016, + "grad_norm": 0.36019972953393525, + "learning_rate": 0.0001383111824586315, + "loss": 3.0392229557037354, + "step": 3977, + "token_acc": 0.2962553881430682 + }, + { + "epoch": 2.3318674875403107, + "grad_norm": 0.3577751018932838, + "learning_rate": 0.00013830970085745191, + "loss": 3.055182933807373, + "step": 3978, + "token_acc": 0.2928139342197195 + }, + { + "epoch": 2.33245382585752, + "grad_norm": 0.45208972173851025, + "learning_rate": 0.0001383082186145948, + "loss": 3.0525264739990234, + "step": 3979, + "token_acc": 0.2943688880864675 + }, + { + "epoch": 2.333040164174729, + "grad_norm": 0.34428818472429057, + "learning_rate": 0.00013830673573007396, + "loss": 3.026096820831299, + "step": 3980, + "token_acc": 0.2993248352022832 + }, + { + "epoch": 2.333626502491938, + "grad_norm": 0.3795493699860801, + "learning_rate": 0.0001383052522039034, + "loss": 3.1066219806671143, + "step": 3981, + "token_acc": 0.28654674768497274 + }, + { + "epoch": 2.3342128408091467, + "grad_norm": 0.38052898550665104, + "learning_rate": 0.00013830376803609706, + "loss": 3.0857529640197754, + "step": 3982, + "token_acc": 0.2893822467211557 + }, + { + "epoch": 2.334799179126356, + "grad_norm": 0.4202396438888431, + "learning_rate": 0.00013830228322666885, + "loss": 3.034611701965332, + "step": 3983, + "token_acc": 0.29587338693614834 + }, + { + "epoch": 2.335385517443565, + "grad_norm": 0.32271000943399963, + "learning_rate": 0.00013830079777563272, + "loss": 3.067209243774414, + "step": 3984, + "token_acc": 0.29072697027362326 + }, + { + "epoch": 2.335971855760774, + "grad_norm": 0.4078850918242615, + "learning_rate": 0.00013829931168300263, + "loss": 3.047999858856201, + "step": 3985, + "token_acc": 0.2958718607081161 + }, + { + "epoch": 2.336558194077983, + "grad_norm": 0.3768602634313092, + "learning_rate": 0.00013829782494879255, + "loss": 3.0607588291168213, + "step": 3986, + "token_acc": 0.2933943948106573 + }, + { + "epoch": 2.3371445323951923, + "grad_norm": 0.4410022734936046, + "learning_rate": 0.00013829633757301643, + "loss": 3.0240979194641113, + "step": 3987, + "token_acc": 0.2974994757810862 + }, + { + "epoch": 2.337730870712401, + "grad_norm": 0.3610629135036041, + "learning_rate": 0.00013829484955568824, + "loss": 3.07926607131958, + "step": 3988, + "token_acc": 0.28910768907618184 + }, + { + "epoch": 2.33831720902961, + "grad_norm": 0.3735683157088353, + "learning_rate": 0.000138293360896822, + "loss": 3.0647778511047363, + "step": 3989, + "token_acc": 0.292597104866706 + }, + { + "epoch": 2.338903547346819, + "grad_norm": 0.4249051342349725, + "learning_rate": 0.00013829187159643166, + "loss": 3.0688180923461914, + "step": 3990, + "token_acc": 0.2910318081236073 + }, + { + "epoch": 2.3394898856640283, + "grad_norm": 0.3321492461878886, + "learning_rate": 0.00013829038165453117, + "loss": 3.0527029037475586, + "step": 3991, + "token_acc": 0.29314810700610305 + }, + { + "epoch": 2.340076223981237, + "grad_norm": 0.3922092786319316, + "learning_rate": 0.0001382888910711346, + "loss": 3.073105812072754, + "step": 3992, + "token_acc": 0.2891765056618127 + }, + { + "epoch": 2.340662562298446, + "grad_norm": 0.4373794295474907, + "learning_rate": 0.00013828739984625592, + "loss": 3.0416932106018066, + "step": 3993, + "token_acc": 0.2940508878823105 + }, + { + "epoch": 2.341248900615655, + "grad_norm": 0.35129363799170327, + "learning_rate": 0.00013828590797990912, + "loss": 3.081122398376465, + "step": 3994, + "token_acc": 0.28935319566993356 + }, + { + "epoch": 2.3418352389328643, + "grad_norm": 0.37607118493416813, + "learning_rate": 0.00013828441547210823, + "loss": 3.054319143295288, + "step": 3995, + "token_acc": 0.2933143910738876 + }, + { + "epoch": 2.3424215772500734, + "grad_norm": 0.37915250336417033, + "learning_rate": 0.00013828292232286727, + "loss": 3.076972723007202, + "step": 3996, + "token_acc": 0.2919744661416152 + }, + { + "epoch": 2.3430079155672825, + "grad_norm": 0.3246571348230093, + "learning_rate": 0.00013828142853220026, + "loss": 3.0523529052734375, + "step": 3997, + "token_acc": 0.29483920223951565 + }, + { + "epoch": 2.343594253884491, + "grad_norm": 0.37870966536023726, + "learning_rate": 0.00013827993410012125, + "loss": 3.0730700492858887, + "step": 3998, + "token_acc": 0.29078057806549495 + }, + { + "epoch": 2.3441805922017003, + "grad_norm": 0.2924652010089367, + "learning_rate": 0.00013827843902664428, + "loss": 3.042371988296509, + "step": 3999, + "token_acc": 0.29554569559366295 + }, + { + "epoch": 2.3447669305189094, + "grad_norm": 0.32864397948620866, + "learning_rate": 0.00013827694331178337, + "loss": 3.015341281890869, + "step": 4000, + "token_acc": 0.29874758171265653 + }, + { + "epoch": 2.3453532688361185, + "grad_norm": 0.3260975546136628, + "learning_rate": 0.0001382754469555526, + "loss": 3.0426340103149414, + "step": 4001, + "token_acc": 0.29593337387190705 + }, + { + "epoch": 2.3459396071533276, + "grad_norm": 0.33821429921382246, + "learning_rate": 0.00013827394995796598, + "loss": 3.09721040725708, + "step": 4002, + "token_acc": 0.2862814662422622 + }, + { + "epoch": 2.3465259454705363, + "grad_norm": 0.3607584146514768, + "learning_rate": 0.00013827245231903763, + "loss": 3.073962688446045, + "step": 4003, + "token_acc": 0.29062085060153703 + }, + { + "epoch": 2.3471122837877454, + "grad_norm": 0.3614024588757974, + "learning_rate": 0.00013827095403878156, + "loss": 3.1109650135040283, + "step": 4004, + "token_acc": 0.28504770291323855 + }, + { + "epoch": 2.3476986221049545, + "grad_norm": 0.36607633484900964, + "learning_rate": 0.00013826945511721188, + "loss": 3.0866904258728027, + "step": 4005, + "token_acc": 0.2891883117700204 + }, + { + "epoch": 2.3482849604221636, + "grad_norm": 0.3168241154340406, + "learning_rate": 0.00013826795555434268, + "loss": 3.1120781898498535, + "step": 4006, + "token_acc": 0.2855197446055293 + }, + { + "epoch": 2.3488712987393727, + "grad_norm": 0.3536726381899361, + "learning_rate": 0.000138266455350188, + "loss": 3.0480971336364746, + "step": 4007, + "token_acc": 0.2941428017094604 + }, + { + "epoch": 2.349457637056582, + "grad_norm": 0.3068375626359624, + "learning_rate": 0.000138264954504762, + "loss": 3.0502212047576904, + "step": 4008, + "token_acc": 0.2954183908869099 + }, + { + "epoch": 2.3500439753737905, + "grad_norm": 0.34688065477987445, + "learning_rate": 0.0001382634530180787, + "loss": 3.1053340435028076, + "step": 4009, + "token_acc": 0.2878484294890683 + }, + { + "epoch": 2.3506303136909996, + "grad_norm": 0.33483633305720745, + "learning_rate": 0.00013826195089015227, + "loss": 3.086273670196533, + "step": 4010, + "token_acc": 0.28825379609544466 + }, + { + "epoch": 2.3512166520082087, + "grad_norm": 0.31842708647938556, + "learning_rate": 0.0001382604481209968, + "loss": 3.051870822906494, + "step": 4011, + "token_acc": 0.2937826847809247 + }, + { + "epoch": 2.351802990325418, + "grad_norm": 0.314056255568696, + "learning_rate": 0.00013825894471062637, + "loss": 3.067373752593994, + "step": 4012, + "token_acc": 0.2918770268517317 + }, + { + "epoch": 2.352389328642627, + "grad_norm": 0.3910271371527118, + "learning_rate": 0.00013825744065905517, + "loss": 3.0934102535247803, + "step": 4013, + "token_acc": 0.2864583466244753 + }, + { + "epoch": 2.3529756669598356, + "grad_norm": 0.41182589367096817, + "learning_rate": 0.00013825593596629727, + "loss": 3.0705127716064453, + "step": 4014, + "token_acc": 0.2906387154812856 + }, + { + "epoch": 2.3535620052770447, + "grad_norm": 0.4127548309459529, + "learning_rate": 0.00013825443063236685, + "loss": 3.065591812133789, + "step": 4015, + "token_acc": 0.29140705255604826 + }, + { + "epoch": 2.354148343594254, + "grad_norm": 0.42932804815668946, + "learning_rate": 0.00013825292465727802, + "loss": 3.0428242683410645, + "step": 4016, + "token_acc": 0.29601421134811623 + }, + { + "epoch": 2.354734681911463, + "grad_norm": 0.45083453956684727, + "learning_rate": 0.0001382514180410449, + "loss": 3.0786654949188232, + "step": 4017, + "token_acc": 0.2891833062821626 + }, + { + "epoch": 2.355321020228672, + "grad_norm": 0.3228720853274811, + "learning_rate": 0.00013824991078368175, + "loss": 3.0895023345947266, + "step": 4018, + "token_acc": 0.2898784972320375 + }, + { + "epoch": 2.355907358545881, + "grad_norm": 0.4158063542761776, + "learning_rate": 0.0001382484028852026, + "loss": 3.0418193340301514, + "step": 4019, + "token_acc": 0.29579330207917254 + }, + { + "epoch": 2.35649369686309, + "grad_norm": 0.3075804236829235, + "learning_rate": 0.0001382468943456217, + "loss": 3.071878433227539, + "step": 4020, + "token_acc": 0.2917890149086774 + }, + { + "epoch": 2.357080035180299, + "grad_norm": 0.4080120503605802, + "learning_rate": 0.00013824538516495316, + "loss": 3.0993309020996094, + "step": 4021, + "token_acc": 0.2860343424959516 + }, + { + "epoch": 2.357666373497508, + "grad_norm": 0.3270273033840868, + "learning_rate": 0.00013824387534321122, + "loss": 3.081183433532715, + "step": 4022, + "token_acc": 0.28872161265628987 + }, + { + "epoch": 2.358252711814717, + "grad_norm": 0.38403149582779955, + "learning_rate": 0.00013824236488041002, + "loss": 3.0490152835845947, + "step": 4023, + "token_acc": 0.29524286402970246 + }, + { + "epoch": 2.3588390501319263, + "grad_norm": 0.3256975565097746, + "learning_rate": 0.00013824085377656375, + "loss": 3.055072546005249, + "step": 4024, + "token_acc": 0.2930164939733559 + }, + { + "epoch": 2.359425388449135, + "grad_norm": 0.36096604892148604, + "learning_rate": 0.0001382393420316866, + "loss": 3.0699915885925293, + "step": 4025, + "token_acc": 0.2933423364028838 + }, + { + "epoch": 2.360011726766344, + "grad_norm": 0.3689472404169113, + "learning_rate": 0.0001382378296457928, + "loss": 3.078838348388672, + "step": 4026, + "token_acc": 0.29007424213374233 + }, + { + "epoch": 2.360598065083553, + "grad_norm": 0.3672161005937233, + "learning_rate": 0.00013823631661889657, + "loss": 3.0636298656463623, + "step": 4027, + "token_acc": 0.2930266696752073 + }, + { + "epoch": 2.3611844034007623, + "grad_norm": 0.3334483766193717, + "learning_rate": 0.00013823480295101207, + "loss": 3.0760507583618164, + "step": 4028, + "token_acc": 0.2907202409601143 + }, + { + "epoch": 2.3617707417179714, + "grad_norm": 0.30872890845480866, + "learning_rate": 0.00013823328864215353, + "loss": 3.0882153511047363, + "step": 4029, + "token_acc": 0.28696937184963256 + }, + { + "epoch": 2.3623570800351805, + "grad_norm": 0.31365788170921904, + "learning_rate": 0.0001382317736923352, + "loss": 3.045743703842163, + "step": 4030, + "token_acc": 0.29274624717252723 + }, + { + "epoch": 2.362943418352389, + "grad_norm": 0.33863753721491446, + "learning_rate": 0.00013823025810157132, + "loss": 3.090925693511963, + "step": 4031, + "token_acc": 0.28993738322209106 + }, + { + "epoch": 2.3635297566695983, + "grad_norm": 0.31331428377038156, + "learning_rate": 0.00013822874186987608, + "loss": 3.0555782318115234, + "step": 4032, + "token_acc": 0.29357890838075357 + }, + { + "epoch": 2.3641160949868074, + "grad_norm": 0.31758580577439677, + "learning_rate": 0.00013822722499726378, + "loss": 3.022827625274658, + "step": 4033, + "token_acc": 0.29760879182285965 + }, + { + "epoch": 2.3647024333040165, + "grad_norm": 0.33168212276889436, + "learning_rate": 0.00013822570748374863, + "loss": 3.083014488220215, + "step": 4034, + "token_acc": 0.28863155695971177 + }, + { + "epoch": 2.3652887716212256, + "grad_norm": 0.3310220935725908, + "learning_rate": 0.00013822418932934487, + "loss": 3.0169708728790283, + "step": 4035, + "token_acc": 0.30038244840628964 + }, + { + "epoch": 2.3658751099384343, + "grad_norm": 0.3462380091944063, + "learning_rate": 0.0001382226705340668, + "loss": 3.086747169494629, + "step": 4036, + "token_acc": 0.28855244281970144 + }, + { + "epoch": 2.3664614482556434, + "grad_norm": 0.3575901350959513, + "learning_rate": 0.00013822115109792866, + "loss": 3.051285743713379, + "step": 4037, + "token_acc": 0.2941521564425395 + }, + { + "epoch": 2.3670477865728525, + "grad_norm": 0.3637067129632318, + "learning_rate": 0.00013821963102094475, + "loss": 3.0806288719177246, + "step": 4038, + "token_acc": 0.29084709176371215 + }, + { + "epoch": 2.3676341248900616, + "grad_norm": 0.36692319209159735, + "learning_rate": 0.00013821811030312933, + "loss": 3.033203601837158, + "step": 4039, + "token_acc": 0.2974709283008838 + }, + { + "epoch": 2.3682204632072708, + "grad_norm": 0.3729396393889702, + "learning_rate": 0.0001382165889444967, + "loss": 3.054262638092041, + "step": 4040, + "token_acc": 0.29372414941159897 + }, + { + "epoch": 2.36880680152448, + "grad_norm": 0.355000632695877, + "learning_rate": 0.00013821506694506108, + "loss": 3.058180332183838, + "step": 4041, + "token_acc": 0.29371287923377387 + }, + { + "epoch": 2.3693931398416885, + "grad_norm": 0.3427230134181248, + "learning_rate": 0.0001382135443048369, + "loss": 3.0629005432128906, + "step": 4042, + "token_acc": 0.29037819367865186 + }, + { + "epoch": 2.3699794781588976, + "grad_norm": 0.3248163754550457, + "learning_rate": 0.00013821202102383838, + "loss": 3.011953592300415, + "step": 4043, + "token_acc": 0.2999598493293253 + }, + { + "epoch": 2.3705658164761068, + "grad_norm": 0.2916359564803973, + "learning_rate": 0.0001382104971020798, + "loss": 3.001657485961914, + "step": 4044, + "token_acc": 0.30137082674323773 + }, + { + "epoch": 2.371152154793316, + "grad_norm": 0.304637871324074, + "learning_rate": 0.00013820897253957553, + "loss": 3.0536303520202637, + "step": 4045, + "token_acc": 0.2933927648578811 + }, + { + "epoch": 2.3717384931105245, + "grad_norm": 0.37829603603059114, + "learning_rate": 0.0001382074473363399, + "loss": 3.059512138366699, + "step": 4046, + "token_acc": 0.2918110652232478 + }, + { + "epoch": 2.3723248314277336, + "grad_norm": 0.3743648619708405, + "learning_rate": 0.0001382059214923872, + "loss": 3.08774995803833, + "step": 4047, + "token_acc": 0.28890100566402616 + }, + { + "epoch": 2.3729111697449428, + "grad_norm": 0.3362355758585943, + "learning_rate": 0.00013820439500773177, + "loss": 3.04176926612854, + "step": 4048, + "token_acc": 0.29599399568439816 + }, + { + "epoch": 2.373497508062152, + "grad_norm": 0.367099024288784, + "learning_rate": 0.00013820286788238795, + "loss": 3.0716967582702637, + "step": 4049, + "token_acc": 0.29085849779601997 + }, + { + "epoch": 2.374083846379361, + "grad_norm": 0.3583516299294482, + "learning_rate": 0.00013820134011637009, + "loss": 3.1031441688537598, + "step": 4050, + "token_acc": 0.2872665363448994 + }, + { + "epoch": 2.37467018469657, + "grad_norm": 0.3391823702835769, + "learning_rate": 0.00013819981170969255, + "loss": 3.0645999908447266, + "step": 4051, + "token_acc": 0.29363430883664293 + }, + { + "epoch": 2.3752565230137788, + "grad_norm": 0.3568092605151071, + "learning_rate": 0.0001381982826623697, + "loss": 3.052489995956421, + "step": 4052, + "token_acc": 0.29440629634461624 + }, + { + "epoch": 2.375842861330988, + "grad_norm": 0.28824426709113227, + "learning_rate": 0.00013819675297441585, + "loss": 3.0608763694763184, + "step": 4053, + "token_acc": 0.2928354868886868 + }, + { + "epoch": 2.376429199648197, + "grad_norm": 0.3483493696699502, + "learning_rate": 0.0001381952226458454, + "loss": 3.084655284881592, + "step": 4054, + "token_acc": 0.28927451795657977 + }, + { + "epoch": 2.377015537965406, + "grad_norm": 0.3484689034429514, + "learning_rate": 0.00013819369167667275, + "loss": 3.0497865676879883, + "step": 4055, + "token_acc": 0.2952654560378618 + }, + { + "epoch": 2.377601876282615, + "grad_norm": 0.3216616728709735, + "learning_rate": 0.00013819216006691223, + "loss": 3.0622987747192383, + "step": 4056, + "token_acc": 0.29198270811230226 + }, + { + "epoch": 2.378188214599824, + "grad_norm": 0.3404862466038519, + "learning_rate": 0.0001381906278165783, + "loss": 3.0224781036376953, + "step": 4057, + "token_acc": 0.29733951952516796 + }, + { + "epoch": 2.378774552917033, + "grad_norm": 0.28324993627748535, + "learning_rate": 0.00013818909492568527, + "loss": 3.074069023132324, + "step": 4058, + "token_acc": 0.29172482127446486 + }, + { + "epoch": 2.379360891234242, + "grad_norm": 0.32150627228025674, + "learning_rate": 0.00013818756139424761, + "loss": 3.079206943511963, + "step": 4059, + "token_acc": 0.29092672485870036 + }, + { + "epoch": 2.379947229551451, + "grad_norm": 0.3279346158769462, + "learning_rate": 0.00013818602722227966, + "loss": 3.050701379776001, + "step": 4060, + "token_acc": 0.2939685081398452 + }, + { + "epoch": 2.3805335678686603, + "grad_norm": 0.2807842248919441, + "learning_rate": 0.00013818449240979593, + "loss": 3.0847315788269043, + "step": 4061, + "token_acc": 0.2897007860924503 + }, + { + "epoch": 2.3811199061858694, + "grad_norm": 0.3953699349411169, + "learning_rate": 0.0001381829569568107, + "loss": 3.061007499694824, + "step": 4062, + "token_acc": 0.29251529250733455 + }, + { + "epoch": 2.381706244503078, + "grad_norm": 0.3375840157269756, + "learning_rate": 0.0001381814208633385, + "loss": 3.033616542816162, + "step": 4063, + "token_acc": 0.29557286454956144 + }, + { + "epoch": 2.382292582820287, + "grad_norm": 0.3270374510362583, + "learning_rate": 0.00013817988412939374, + "loss": 3.0157980918884277, + "step": 4064, + "token_acc": 0.2977575207058893 + }, + { + "epoch": 2.3828789211374963, + "grad_norm": 0.3751430500405435, + "learning_rate": 0.0001381783467549908, + "loss": 3.0645761489868164, + "step": 4065, + "token_acc": 0.2918709697159773 + }, + { + "epoch": 2.3834652594547054, + "grad_norm": 0.3404453545662311, + "learning_rate": 0.0001381768087401442, + "loss": 3.0646448135375977, + "step": 4066, + "token_acc": 0.29160191725529766 + }, + { + "epoch": 2.3840515977719146, + "grad_norm": 0.3132388633272704, + "learning_rate": 0.00013817527008486835, + "loss": 3.1022961139678955, + "step": 4067, + "token_acc": 0.28859802253078076 + }, + { + "epoch": 2.3846379360891232, + "grad_norm": 0.3233519304952651, + "learning_rate": 0.0001381737307891777, + "loss": 3.0591681003570557, + "step": 4068, + "token_acc": 0.292330152345415 + }, + { + "epoch": 2.3852242744063323, + "grad_norm": 0.29387491556139494, + "learning_rate": 0.0001381721908530867, + "loss": 3.0418269634246826, + "step": 4069, + "token_acc": 0.2960987599855043 + }, + { + "epoch": 2.3858106127235414, + "grad_norm": 0.34138560002834906, + "learning_rate": 0.00013817065027660984, + "loss": 3.032656192779541, + "step": 4070, + "token_acc": 0.2977244548753906 + }, + { + "epoch": 2.3863969510407506, + "grad_norm": 0.2985401427057655, + "learning_rate": 0.0001381691090597616, + "loss": 3.063190221786499, + "step": 4071, + "token_acc": 0.29104182757627434 + }, + { + "epoch": 2.3869832893579597, + "grad_norm": 0.33331121800188135, + "learning_rate": 0.0001381675672025564, + "loss": 3.054957151412964, + "step": 4072, + "token_acc": 0.2910006967070347 + }, + { + "epoch": 2.387569627675169, + "grad_norm": 0.3555551781024421, + "learning_rate": 0.0001381660247050088, + "loss": 3.0243213176727295, + "step": 4073, + "token_acc": 0.2968951220076446 + }, + { + "epoch": 2.3881559659923774, + "grad_norm": 0.3500690881799634, + "learning_rate": 0.00013816448156713323, + "loss": 3.077343463897705, + "step": 4074, + "token_acc": 0.2915393109187217 + }, + { + "epoch": 2.3887423043095866, + "grad_norm": 0.30615546925582227, + "learning_rate": 0.0001381629377889442, + "loss": 3.082634449005127, + "step": 4075, + "token_acc": 0.29029830905030307 + }, + { + "epoch": 2.3893286426267957, + "grad_norm": 0.35025707918105214, + "learning_rate": 0.00013816139337045625, + "loss": 3.0680062770843506, + "step": 4076, + "token_acc": 0.2908369482010717 + }, + { + "epoch": 2.389914980944005, + "grad_norm": 0.3818079734501489, + "learning_rate": 0.00013815984831168384, + "loss": 3.0846099853515625, + "step": 4077, + "token_acc": 0.28951831371312364 + }, + { + "epoch": 2.390501319261214, + "grad_norm": 0.3383494767519999, + "learning_rate": 0.0001381583026126415, + "loss": 3.0626983642578125, + "step": 4078, + "token_acc": 0.29318502305778327 + }, + { + "epoch": 2.3910876575784226, + "grad_norm": 0.2829571379063603, + "learning_rate": 0.00013815675627334376, + "loss": 3.0563549995422363, + "step": 4079, + "token_acc": 0.29453555573753 + }, + { + "epoch": 2.3916739958956317, + "grad_norm": 0.35026984101785025, + "learning_rate": 0.00013815520929380513, + "loss": 3.0235755443573, + "step": 4080, + "token_acc": 0.2954108885798578 + }, + { + "epoch": 2.392260334212841, + "grad_norm": 0.3250948930710766, + "learning_rate": 0.00013815366167404017, + "loss": 3.0632219314575195, + "step": 4081, + "token_acc": 0.29336491030656703 + }, + { + "epoch": 2.39284667253005, + "grad_norm": 0.35948364736233557, + "learning_rate": 0.00013815211341406335, + "loss": 3.0566649436950684, + "step": 4082, + "token_acc": 0.29364804712781833 + }, + { + "epoch": 2.393433010847259, + "grad_norm": 0.4016717665118695, + "learning_rate": 0.0001381505645138893, + "loss": 3.0661568641662598, + "step": 4083, + "token_acc": 0.2920331507175917 + }, + { + "epoch": 2.394019349164468, + "grad_norm": 0.4045526919121566, + "learning_rate": 0.00013814901497353254, + "loss": 3.0538201332092285, + "step": 4084, + "token_acc": 0.29284649776453053 + }, + { + "epoch": 2.394605687481677, + "grad_norm": 0.3478007513464753, + "learning_rate": 0.00013814746479300758, + "loss": 3.081631660461426, + "step": 4085, + "token_acc": 0.28880027457628005 + }, + { + "epoch": 2.395192025798886, + "grad_norm": 0.3685976875080815, + "learning_rate": 0.00013814591397232903, + "loss": 3.0492920875549316, + "step": 4086, + "token_acc": 0.29437172360690667 + }, + { + "epoch": 2.395778364116095, + "grad_norm": 0.36336117919587485, + "learning_rate": 0.00013814436251151146, + "loss": 3.1037445068359375, + "step": 4087, + "token_acc": 0.2874033555988008 + }, + { + "epoch": 2.396364702433304, + "grad_norm": 0.3036246117557848, + "learning_rate": 0.00013814281041056941, + "loss": 3.021918296813965, + "step": 4088, + "token_acc": 0.29836310099844904 + }, + { + "epoch": 2.3969510407505132, + "grad_norm": 0.3832705588070443, + "learning_rate": 0.0001381412576695175, + "loss": 3.049736976623535, + "step": 4089, + "token_acc": 0.2942974455009713 + }, + { + "epoch": 2.397537379067722, + "grad_norm": 0.3366002295942935, + "learning_rate": 0.00013813970428837029, + "loss": 3.0911481380462646, + "step": 4090, + "token_acc": 0.2875600930203456 + }, + { + "epoch": 2.398123717384931, + "grad_norm": 0.27993328149928026, + "learning_rate": 0.00013813815026714237, + "loss": 3.099857807159424, + "step": 4091, + "token_acc": 0.2871189203303152 + }, + { + "epoch": 2.39871005570214, + "grad_norm": 0.37378110567489703, + "learning_rate": 0.00013813659560584835, + "loss": 3.066340923309326, + "step": 4092, + "token_acc": 0.2914561660292629 + }, + { + "epoch": 2.3992963940193492, + "grad_norm": 0.29518569623757934, + "learning_rate": 0.00013813504030450282, + "loss": 3.0006675720214844, + "step": 4093, + "token_acc": 0.2999981794491013 + }, + { + "epoch": 2.3998827323365584, + "grad_norm": 0.41463019821573593, + "learning_rate": 0.0001381334843631204, + "loss": 3.086228370666504, + "step": 4094, + "token_acc": 0.28825555216872817 + }, + { + "epoch": 2.4004690706537675, + "grad_norm": 0.3181574594553303, + "learning_rate": 0.00013813192778171573, + "loss": 3.080528736114502, + "step": 4095, + "token_acc": 0.29000351510082184 + }, + { + "epoch": 2.401055408970976, + "grad_norm": 0.3395088721927154, + "learning_rate": 0.00013813037056030337, + "loss": 3.1221415996551514, + "step": 4096, + "token_acc": 0.2835148163681646 + }, + { + "epoch": 2.4016417472881852, + "grad_norm": 0.34549356259812153, + "learning_rate": 0.00013812881269889804, + "loss": 3.0727949142456055, + "step": 4097, + "token_acc": 0.2906959099232162 + }, + { + "epoch": 2.4022280856053944, + "grad_norm": 0.3542869715372162, + "learning_rate": 0.0001381272541975143, + "loss": 3.076119899749756, + "step": 4098, + "token_acc": 0.29077665782766254 + }, + { + "epoch": 2.4028144239226035, + "grad_norm": 0.35754848630682295, + "learning_rate": 0.00013812569505616677, + "loss": 3.0876035690307617, + "step": 4099, + "token_acc": 0.2894302962336122 + }, + { + "epoch": 2.403400762239812, + "grad_norm": 0.34169924982136807, + "learning_rate": 0.00013812413527487016, + "loss": 3.0799460411071777, + "step": 4100, + "token_acc": 0.29011599868010257 + }, + { + "epoch": 2.4039871005570213, + "grad_norm": 0.3175183564288105, + "learning_rate": 0.0001381225748536391, + "loss": 3.095269203186035, + "step": 4101, + "token_acc": 0.2890923194682037 + }, + { + "epoch": 2.4045734388742304, + "grad_norm": 0.31458374342242673, + "learning_rate": 0.00013812101379248826, + "loss": 3.0431973934173584, + "step": 4102, + "token_acc": 0.295406758608326 + }, + { + "epoch": 2.4051597771914395, + "grad_norm": 0.39569875724962733, + "learning_rate": 0.00013811945209143227, + "loss": 3.0634074211120605, + "step": 4103, + "token_acc": 0.2914431758516673 + }, + { + "epoch": 2.4057461155086486, + "grad_norm": 0.38632586166988386, + "learning_rate": 0.00013811788975048582, + "loss": 3.1201887130737305, + "step": 4104, + "token_acc": 0.2855339129062381 + }, + { + "epoch": 2.4063324538258577, + "grad_norm": 0.3362628417464489, + "learning_rate": 0.00013811632676966358, + "loss": 3.095259189605713, + "step": 4105, + "token_acc": 0.2882208109276658 + }, + { + "epoch": 2.4069187921430664, + "grad_norm": 0.34542818907432, + "learning_rate": 0.00013811476314898026, + "loss": 3.045332431793213, + "step": 4106, + "token_acc": 0.2965409988210801 + }, + { + "epoch": 2.4075051304602755, + "grad_norm": 0.3366463327741982, + "learning_rate": 0.0001381131988884505, + "loss": 3.041891098022461, + "step": 4107, + "token_acc": 0.2958037809545309 + }, + { + "epoch": 2.4080914687774846, + "grad_norm": 0.3828885604448513, + "learning_rate": 0.00013811163398808903, + "loss": 3.033005714416504, + "step": 4108, + "token_acc": 0.2961006329399703 + }, + { + "epoch": 2.4086778070946937, + "grad_norm": 0.3408223977179087, + "learning_rate": 0.00013811006844791055, + "loss": 3.070070266723633, + "step": 4109, + "token_acc": 0.29177589888713346 + }, + { + "epoch": 2.409264145411903, + "grad_norm": 0.3506097798331669, + "learning_rate": 0.00013810850226792973, + "loss": 3.051692008972168, + "step": 4110, + "token_acc": 0.29312885992010723 + }, + { + "epoch": 2.4098504837291115, + "grad_norm": 0.4665151771964703, + "learning_rate": 0.00013810693544816135, + "loss": 3.0511651039123535, + "step": 4111, + "token_acc": 0.2935248322147651 + }, + { + "epoch": 2.4104368220463206, + "grad_norm": 0.38130740465766716, + "learning_rate": 0.00013810536798862006, + "loss": 3.0713272094726562, + "step": 4112, + "token_acc": 0.2899629213337984 + }, + { + "epoch": 2.4110231603635297, + "grad_norm": 0.33428071514048635, + "learning_rate": 0.00013810379988932062, + "loss": 3.0229110717773438, + "step": 4113, + "token_acc": 0.29891578512417644 + }, + { + "epoch": 2.411609498680739, + "grad_norm": 0.3781170500494203, + "learning_rate": 0.00013810223115027774, + "loss": 3.06805419921875, + "step": 4114, + "token_acc": 0.2924023240258633 + }, + { + "epoch": 2.412195836997948, + "grad_norm": 0.3042728702577096, + "learning_rate": 0.0001381006617715062, + "loss": 3.0791373252868652, + "step": 4115, + "token_acc": 0.29112361199945475 + }, + { + "epoch": 2.412782175315157, + "grad_norm": 0.3659165864863998, + "learning_rate": 0.00013809909175302066, + "loss": 3.0979957580566406, + "step": 4116, + "token_acc": 0.28708677766185214 + }, + { + "epoch": 2.4133685136323657, + "grad_norm": 0.3526604954457835, + "learning_rate": 0.00013809752109483596, + "loss": 3.056419610977173, + "step": 4117, + "token_acc": 0.29475044624649754 + }, + { + "epoch": 2.413954851949575, + "grad_norm": 0.3399916853247518, + "learning_rate": 0.00013809594979696677, + "loss": 3.0682530403137207, + "step": 4118, + "token_acc": 0.2909325390091659 + }, + { + "epoch": 2.414541190266784, + "grad_norm": 0.32480849818618657, + "learning_rate": 0.00013809437785942792, + "loss": 3.094308853149414, + "step": 4119, + "token_acc": 0.2887992829157878 + }, + { + "epoch": 2.415127528583993, + "grad_norm": 0.38793502750738823, + "learning_rate": 0.00013809280528223416, + "loss": 3.0941684246063232, + "step": 4120, + "token_acc": 0.2857783805279956 + }, + { + "epoch": 2.415713866901202, + "grad_norm": 0.34947814006673344, + "learning_rate": 0.00013809123206540022, + "loss": 3.06655216217041, + "step": 4121, + "token_acc": 0.29107961381285236 + }, + { + "epoch": 2.416300205218411, + "grad_norm": 0.3408482759064972, + "learning_rate": 0.00013808965820894092, + "loss": 3.0576772689819336, + "step": 4122, + "token_acc": 0.29238496102850015 + }, + { + "epoch": 2.41688654353562, + "grad_norm": 0.34873401508744295, + "learning_rate": 0.00013808808371287105, + "loss": 3.0357532501220703, + "step": 4123, + "token_acc": 0.2948350492903997 + }, + { + "epoch": 2.417472881852829, + "grad_norm": 0.3484309951695064, + "learning_rate": 0.00013808650857720535, + "loss": 3.0974361896514893, + "step": 4124, + "token_acc": 0.28813157481133006 + }, + { + "epoch": 2.418059220170038, + "grad_norm": 0.34779541211803133, + "learning_rate": 0.00013808493280195868, + "loss": 3.096229076385498, + "step": 4125, + "token_acc": 0.28604804334588474 + }, + { + "epoch": 2.4186455584872473, + "grad_norm": 0.3873329635330523, + "learning_rate": 0.00013808335638714581, + "loss": 3.074026107788086, + "step": 4126, + "token_acc": 0.29147395964774797 + }, + { + "epoch": 2.4192318968044564, + "grad_norm": 0.32389180456593564, + "learning_rate": 0.00013808177933278154, + "loss": 3.0681073665618896, + "step": 4127, + "token_acc": 0.2900738277419066 + }, + { + "epoch": 2.419818235121665, + "grad_norm": 0.318698966441261, + "learning_rate": 0.00013808020163888067, + "loss": 3.0403025150299072, + "step": 4128, + "token_acc": 0.2947781245478784 + }, + { + "epoch": 2.420404573438874, + "grad_norm": 0.3514803052091189, + "learning_rate": 0.00013807862330545808, + "loss": 3.067467212677002, + "step": 4129, + "token_acc": 0.29139676485576343 + }, + { + "epoch": 2.4209909117560833, + "grad_norm": 0.34673503270889744, + "learning_rate": 0.00013807704433252855, + "loss": 3.0547804832458496, + "step": 4130, + "token_acc": 0.29271940122178275 + }, + { + "epoch": 2.4215772500732924, + "grad_norm": 0.34038758080968573, + "learning_rate": 0.00013807546472010694, + "loss": 3.048649787902832, + "step": 4131, + "token_acc": 0.2934819118631204 + }, + { + "epoch": 2.4221635883905015, + "grad_norm": 0.3593713278239768, + "learning_rate": 0.00013807388446820804, + "loss": 3.041304111480713, + "step": 4132, + "token_acc": 0.29546170691084145 + }, + { + "epoch": 2.42274992670771, + "grad_norm": 0.2914221756299142, + "learning_rate": 0.00013807230357684675, + "loss": 3.077401638031006, + "step": 4133, + "token_acc": 0.2917599564426384 + }, + { + "epoch": 2.4233362650249193, + "grad_norm": 0.3841534764097119, + "learning_rate": 0.0001380707220460379, + "loss": 3.0268115997314453, + "step": 4134, + "token_acc": 0.2964310276635034 + }, + { + "epoch": 2.4239226033421284, + "grad_norm": 0.34832612159022314, + "learning_rate": 0.00013806913987579633, + "loss": 3.0552332401275635, + "step": 4135, + "token_acc": 0.29305657746969754 + }, + { + "epoch": 2.4245089416593375, + "grad_norm": 0.37900066728091414, + "learning_rate": 0.00013806755706613692, + "loss": 3.013362407684326, + "step": 4136, + "token_acc": 0.2992748307214118 + }, + { + "epoch": 2.4250952799765466, + "grad_norm": 0.3743038784304707, + "learning_rate": 0.00013806597361707454, + "loss": 3.106978416442871, + "step": 4137, + "token_acc": 0.2859262083789297 + }, + { + "epoch": 2.4256816182937557, + "grad_norm": 0.2796962271559677, + "learning_rate": 0.00013806438952862404, + "loss": 3.0778822898864746, + "step": 4138, + "token_acc": 0.290056378034085 + }, + { + "epoch": 2.4262679566109644, + "grad_norm": 0.35445996165094373, + "learning_rate": 0.00013806280480080032, + "loss": 3.062171697616577, + "step": 4139, + "token_acc": 0.29274304870834156 + }, + { + "epoch": 2.4268542949281735, + "grad_norm": 0.3197776807815522, + "learning_rate": 0.00013806121943361828, + "loss": 3.0503427982330322, + "step": 4140, + "token_acc": 0.2937312371209506 + }, + { + "epoch": 2.4274406332453826, + "grad_norm": 0.31685236906073194, + "learning_rate": 0.0001380596334270928, + "loss": 3.083651542663574, + "step": 4141, + "token_acc": 0.2886351545977086 + }, + { + "epoch": 2.4280269715625917, + "grad_norm": 0.3268678662638589, + "learning_rate": 0.00013805804678123875, + "loss": 3.045289993286133, + "step": 4142, + "token_acc": 0.2936968160773866 + }, + { + "epoch": 2.4286133098798004, + "grad_norm": 0.32024777989946906, + "learning_rate": 0.00013805645949607108, + "loss": 3.046217918395996, + "step": 4143, + "token_acc": 0.294906431559688 + }, + { + "epoch": 2.4291996481970095, + "grad_norm": 0.35472478216331943, + "learning_rate": 0.00013805487157160469, + "loss": 3.0657083988189697, + "step": 4144, + "token_acc": 0.29113440790428124 + }, + { + "epoch": 2.4297859865142186, + "grad_norm": 0.35457093820853947, + "learning_rate": 0.00013805328300785444, + "loss": 3.0784337520599365, + "step": 4145, + "token_acc": 0.2908242743236856 + }, + { + "epoch": 2.4303723248314277, + "grad_norm": 0.31114379962915933, + "learning_rate": 0.00013805169380483534, + "loss": 3.104710102081299, + "step": 4146, + "token_acc": 0.2857458625297893 + }, + { + "epoch": 2.430958663148637, + "grad_norm": 0.3363819020622537, + "learning_rate": 0.00013805010396256227, + "loss": 3.072657585144043, + "step": 4147, + "token_acc": 0.2898475601971248 + }, + { + "epoch": 2.431545001465846, + "grad_norm": 0.32003371158401184, + "learning_rate": 0.00013804851348105018, + "loss": 3.0489306449890137, + "step": 4148, + "token_acc": 0.29531111430531864 + }, + { + "epoch": 2.432131339783055, + "grad_norm": 0.35150323795537414, + "learning_rate": 0.00013804692236031398, + "loss": 3.1220552921295166, + "step": 4149, + "token_acc": 0.28486208911165095 + }, + { + "epoch": 2.4327176781002637, + "grad_norm": 0.3398683657955479, + "learning_rate": 0.00013804533060036867, + "loss": 3.079378604888916, + "step": 4150, + "token_acc": 0.2912951190093701 + }, + { + "epoch": 2.433304016417473, + "grad_norm": 0.3169236722930002, + "learning_rate": 0.00013804373820122914, + "loss": 3.1077394485473633, + "step": 4151, + "token_acc": 0.28583589574261714 + }, + { + "epoch": 2.433890354734682, + "grad_norm": 0.30439378510381315, + "learning_rate": 0.0001380421451629104, + "loss": 3.065969705581665, + "step": 4152, + "token_acc": 0.2930793322989956 + }, + { + "epoch": 2.434476693051891, + "grad_norm": 0.3326924502404188, + "learning_rate": 0.00013804055148542737, + "loss": 3.0168094635009766, + "step": 4153, + "token_acc": 0.29893113253050574 + }, + { + "epoch": 2.4350630313690997, + "grad_norm": 0.31292841762111473, + "learning_rate": 0.00013803895716879507, + "loss": 3.0405845642089844, + "step": 4154, + "token_acc": 0.2942930628622149 + }, + { + "epoch": 2.435649369686309, + "grad_norm": 0.2991690033583295, + "learning_rate": 0.00013803736221302846, + "loss": 3.082681179046631, + "step": 4155, + "token_acc": 0.2909080648498741 + }, + { + "epoch": 2.436235708003518, + "grad_norm": 0.3035620146367998, + "learning_rate": 0.00013803576661814248, + "loss": 3.0515425205230713, + "step": 4156, + "token_acc": 0.29372080018760033 + }, + { + "epoch": 2.436822046320727, + "grad_norm": 0.32100518437957365, + "learning_rate": 0.00013803417038415217, + "loss": 3.1041226387023926, + "step": 4157, + "token_acc": 0.28512866570551454 + }, + { + "epoch": 2.437408384637936, + "grad_norm": 0.32976823122775684, + "learning_rate": 0.0001380325735110725, + "loss": 3.061105251312256, + "step": 4158, + "token_acc": 0.29313765953393867 + }, + { + "epoch": 2.4379947229551453, + "grad_norm": 0.34293108290177926, + "learning_rate": 0.0001380309759989185, + "loss": 3.108186721801758, + "step": 4159, + "token_acc": 0.28545691384870736 + }, + { + "epoch": 2.438581061272354, + "grad_norm": 0.3534184961986147, + "learning_rate": 0.0001380293778477051, + "loss": 3.0628538131713867, + "step": 4160, + "token_acc": 0.2912351482300944 + }, + { + "epoch": 2.439167399589563, + "grad_norm": 0.3677103532815081, + "learning_rate": 0.00013802777905744742, + "loss": 3.053551435470581, + "step": 4161, + "token_acc": 0.29412439243689814 + }, + { + "epoch": 2.439753737906772, + "grad_norm": 0.37126677113923023, + "learning_rate": 0.0001380261796281604, + "loss": 3.0701839923858643, + "step": 4162, + "token_acc": 0.29078938947907335 + }, + { + "epoch": 2.4403400762239813, + "grad_norm": 0.42593488210705166, + "learning_rate": 0.00013802457955985908, + "loss": 3.0736584663391113, + "step": 4163, + "token_acc": 0.2903134353583871 + }, + { + "epoch": 2.4409264145411904, + "grad_norm": 0.3682124452188901, + "learning_rate": 0.0001380229788525585, + "loss": 3.065493583679199, + "step": 4164, + "token_acc": 0.29104621635347894 + }, + { + "epoch": 2.441512752858399, + "grad_norm": 0.338741231952482, + "learning_rate": 0.00013802137750627372, + "loss": 3.0811452865600586, + "step": 4165, + "token_acc": 0.291322581340693 + }, + { + "epoch": 2.442099091175608, + "grad_norm": 0.38782716445304827, + "learning_rate": 0.00013801977552101977, + "loss": 3.059845447540283, + "step": 4166, + "token_acc": 0.2933271547729379 + }, + { + "epoch": 2.4426854294928173, + "grad_norm": 0.29681801080742914, + "learning_rate": 0.00013801817289681165, + "loss": 3.1183009147644043, + "step": 4167, + "token_acc": 0.28458395821482135 + }, + { + "epoch": 2.4432717678100264, + "grad_norm": 0.3670902861080377, + "learning_rate": 0.00013801656963366446, + "loss": 3.0657567977905273, + "step": 4168, + "token_acc": 0.2926429497720558 + }, + { + "epoch": 2.4438581061272355, + "grad_norm": 0.3713852778755407, + "learning_rate": 0.00013801496573159327, + "loss": 3.1093242168426514, + "step": 4169, + "token_acc": 0.28587765033928 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 0.37071487779933704, + "learning_rate": 0.0001380133611906131, + "loss": 3.0888121128082275, + "step": 4170, + "token_acc": 0.2895343922349478 + }, + { + "epoch": 2.4450307827616533, + "grad_norm": 0.34995738420647576, + "learning_rate": 0.00013801175601073907, + "loss": 3.0730843544006348, + "step": 4171, + "token_acc": 0.29114552620961487 + }, + { + "epoch": 2.4456171210788624, + "grad_norm": 0.32570562175495255, + "learning_rate": 0.00013801015019198625, + "loss": 3.0866737365722656, + "step": 4172, + "token_acc": 0.2884402972899968 + }, + { + "epoch": 2.4462034593960715, + "grad_norm": 0.2835373071013756, + "learning_rate": 0.0001380085437343697, + "loss": 3.082204818725586, + "step": 4173, + "token_acc": 0.28977380800387714 + }, + { + "epoch": 2.4467897977132806, + "grad_norm": 0.35010643052180407, + "learning_rate": 0.00013800693663790453, + "loss": 3.092050790786743, + "step": 4174, + "token_acc": 0.28818902764248816 + }, + { + "epoch": 2.4473761360304898, + "grad_norm": 0.2870860082419632, + "learning_rate": 0.0001380053289026058, + "loss": 3.0260977745056152, + "step": 4175, + "token_acc": 0.29896858303394214 + }, + { + "epoch": 2.4479624743476984, + "grad_norm": 0.34355627229648456, + "learning_rate": 0.00013800372052848867, + "loss": 3.1050281524658203, + "step": 4176, + "token_acc": 0.28405991671328235 + }, + { + "epoch": 2.4485488126649075, + "grad_norm": 0.3586457357644718, + "learning_rate": 0.00013800211151556823, + "loss": 3.0616133213043213, + "step": 4177, + "token_acc": 0.2927519312582496 + }, + { + "epoch": 2.4491351509821166, + "grad_norm": 0.3453333050459176, + "learning_rate": 0.00013800050186385955, + "loss": 3.0728774070739746, + "step": 4178, + "token_acc": 0.2904125880523282 + }, + { + "epoch": 2.4497214892993258, + "grad_norm": 0.3650670633350832, + "learning_rate": 0.00013799889157337783, + "loss": 3.056148052215576, + "step": 4179, + "token_acc": 0.2930542741914226 + }, + { + "epoch": 2.450307827616535, + "grad_norm": 0.3082911038262733, + "learning_rate": 0.00013799728064413814, + "loss": 3.043065071105957, + "step": 4180, + "token_acc": 0.2955905180998006 + }, + { + "epoch": 2.450894165933744, + "grad_norm": 0.3087389667276047, + "learning_rate": 0.00013799566907615561, + "loss": 3.0644872188568115, + "step": 4181, + "token_acc": 0.2908530888045474 + }, + { + "epoch": 2.4514805042509527, + "grad_norm": 0.37281592886865744, + "learning_rate": 0.0001379940568694454, + "loss": 3.0889673233032227, + "step": 4182, + "token_acc": 0.28861430524441983 + }, + { + "epoch": 2.4520668425681618, + "grad_norm": 0.3629402595002331, + "learning_rate": 0.00013799244402402266, + "loss": 3.100498676300049, + "step": 4183, + "token_acc": 0.28660604494153213 + }, + { + "epoch": 2.452653180885371, + "grad_norm": 0.2949962127882119, + "learning_rate": 0.00013799083053990253, + "loss": 3.0804247856140137, + "step": 4184, + "token_acc": 0.2911146412069702 + }, + { + "epoch": 2.45323951920258, + "grad_norm": 0.30235329723241045, + "learning_rate": 0.00013798921641710015, + "loss": 3.0473012924194336, + "step": 4185, + "token_acc": 0.2922458091670044 + }, + { + "epoch": 2.453825857519789, + "grad_norm": 0.3237300262754433, + "learning_rate": 0.0001379876016556307, + "loss": 3.003633737564087, + "step": 4186, + "token_acc": 0.3018541302656411 + }, + { + "epoch": 2.4544121958369978, + "grad_norm": 0.3355090260537687, + "learning_rate": 0.00013798598625550936, + "loss": 3.0517759323120117, + "step": 4187, + "token_acc": 0.2939339760203214 + }, + { + "epoch": 2.454998534154207, + "grad_norm": 0.3345100329267295, + "learning_rate": 0.00013798437021675128, + "loss": 3.115407943725586, + "step": 4188, + "token_acc": 0.28513363392798796 + }, + { + "epoch": 2.455584872471416, + "grad_norm": 0.33651046669721235, + "learning_rate": 0.00013798275353937167, + "loss": 3.0267457962036133, + "step": 4189, + "token_acc": 0.297492838492154 + }, + { + "epoch": 2.456171210788625, + "grad_norm": 0.3347140999805152, + "learning_rate": 0.00013798113622338567, + "loss": 3.089869737625122, + "step": 4190, + "token_acc": 0.287722173687832 + }, + { + "epoch": 2.456757549105834, + "grad_norm": 0.3191418236261717, + "learning_rate": 0.00013797951826880855, + "loss": 2.9955966472625732, + "step": 4191, + "token_acc": 0.2995206632983547 + }, + { + "epoch": 2.4573438874230433, + "grad_norm": 0.3282229446733897, + "learning_rate": 0.0001379778996756554, + "loss": 3.0222959518432617, + "step": 4192, + "token_acc": 0.2986648738930849 + }, + { + "epoch": 2.457930225740252, + "grad_norm": 0.3467069275934962, + "learning_rate": 0.00013797628044394153, + "loss": 3.0708184242248535, + "step": 4193, + "token_acc": 0.29112522506278904 + }, + { + "epoch": 2.458516564057461, + "grad_norm": 0.38295441068961344, + "learning_rate": 0.0001379746605736821, + "loss": 3.0587100982666016, + "step": 4194, + "token_acc": 0.2913565488423632 + }, + { + "epoch": 2.45910290237467, + "grad_norm": 0.33147888650912616, + "learning_rate": 0.0001379730400648923, + "loss": 3.0722427368164062, + "step": 4195, + "token_acc": 0.2930065703022339 + }, + { + "epoch": 2.4596892406918793, + "grad_norm": 0.3835019517146763, + "learning_rate": 0.00013797141891758738, + "loss": 3.0927271842956543, + "step": 4196, + "token_acc": 0.28814234229693286 + }, + { + "epoch": 2.460275579009088, + "grad_norm": 0.3781963606717039, + "learning_rate": 0.00013796979713178259, + "loss": 3.069444179534912, + "step": 4197, + "token_acc": 0.2915625394251908 + }, + { + "epoch": 2.460861917326297, + "grad_norm": 0.4383793149311926, + "learning_rate": 0.00013796817470749316, + "loss": 3.0476956367492676, + "step": 4198, + "token_acc": 0.2950219135255517 + }, + { + "epoch": 2.4614482556435062, + "grad_norm": 0.3301844083927975, + "learning_rate": 0.00013796655164473431, + "loss": 3.073068141937256, + "step": 4199, + "token_acc": 0.28976582082442937 + }, + { + "epoch": 2.4620345939607153, + "grad_norm": 0.3167805654552109, + "learning_rate": 0.00013796492794352128, + "loss": 3.021474838256836, + "step": 4200, + "token_acc": 0.2974620437821783 + }, + { + "epoch": 2.4626209322779244, + "grad_norm": 0.3595840095340389, + "learning_rate": 0.00013796330360386935, + "loss": 3.061807155609131, + "step": 4201, + "token_acc": 0.2928636238327075 + }, + { + "epoch": 2.4632072705951336, + "grad_norm": 0.3277382270771204, + "learning_rate": 0.00013796167862579375, + "loss": 3.113220453262329, + "step": 4202, + "token_acc": 0.28578664035046497 + }, + { + "epoch": 2.4637936089123427, + "grad_norm": 0.3726005889623694, + "learning_rate": 0.00013796005300930977, + "loss": 3.129694700241089, + "step": 4203, + "token_acc": 0.28386860262637637 + }, + { + "epoch": 2.4643799472295513, + "grad_norm": 0.344278735875687, + "learning_rate": 0.00013795842675443266, + "loss": 3.055192708969116, + "step": 4204, + "token_acc": 0.2935059750352647 + }, + { + "epoch": 2.4649662855467604, + "grad_norm": 0.2731972046043155, + "learning_rate": 0.0001379567998611777, + "loss": 3.0766844749450684, + "step": 4205, + "token_acc": 0.28991060025542786 + }, + { + "epoch": 2.4655526238639696, + "grad_norm": 0.33538039210407306, + "learning_rate": 0.0001379551723295602, + "loss": 3.046630859375, + "step": 4206, + "token_acc": 0.29371758971167805 + }, + { + "epoch": 2.4661389621811787, + "grad_norm": 0.32186913923262134, + "learning_rate": 0.00013795354415959543, + "loss": 3.061422824859619, + "step": 4207, + "token_acc": 0.2916192995073815 + }, + { + "epoch": 2.4667253004983873, + "grad_norm": 0.325554838692534, + "learning_rate": 0.00013795191535129867, + "loss": 3.100823163986206, + "step": 4208, + "token_acc": 0.28755966331591776 + }, + { + "epoch": 2.4673116388155965, + "grad_norm": 0.30385234679839923, + "learning_rate": 0.00013795028590468523, + "loss": 3.0388662815093994, + "step": 4209, + "token_acc": 0.2945364843938463 + }, + { + "epoch": 2.4678979771328056, + "grad_norm": 0.2899638734538185, + "learning_rate": 0.0001379486558197704, + "loss": 3.061492919921875, + "step": 4210, + "token_acc": 0.29130047521224944 + }, + { + "epoch": 2.4684843154500147, + "grad_norm": 0.3441627767640145, + "learning_rate": 0.00013794702509656954, + "loss": 3.083706855773926, + "step": 4211, + "token_acc": 0.28896107270931864 + }, + { + "epoch": 2.469070653767224, + "grad_norm": 0.34356333370041703, + "learning_rate": 0.00013794539373509793, + "loss": 3.121530532836914, + "step": 4212, + "token_acc": 0.2831282601188637 + }, + { + "epoch": 2.469656992084433, + "grad_norm": 0.37041470260947146, + "learning_rate": 0.0001379437617353709, + "loss": 3.0889649391174316, + "step": 4213, + "token_acc": 0.28678788374195985 + }, + { + "epoch": 2.4702433304016416, + "grad_norm": 0.31331731876694996, + "learning_rate": 0.00013794212909740378, + "loss": 3.0927577018737793, + "step": 4214, + "token_acc": 0.28719045872914956 + }, + { + "epoch": 2.4708296687188507, + "grad_norm": 0.4463708742821819, + "learning_rate": 0.0001379404958212119, + "loss": 3.0862932205200195, + "step": 4215, + "token_acc": 0.2897759689627629 + }, + { + "epoch": 2.47141600703606, + "grad_norm": 0.4261334077994018, + "learning_rate": 0.00013793886190681065, + "loss": 3.037044048309326, + "step": 4216, + "token_acc": 0.2944011487500816 + }, + { + "epoch": 2.472002345353269, + "grad_norm": 0.40796280358072695, + "learning_rate": 0.00013793722735421532, + "loss": 3.0571494102478027, + "step": 4217, + "token_acc": 0.2939505625805196 + }, + { + "epoch": 2.472588683670478, + "grad_norm": 0.30887898669813973, + "learning_rate": 0.00013793559216344127, + "loss": 3.0305864810943604, + "step": 4218, + "token_acc": 0.2959044953141475 + }, + { + "epoch": 2.4731750219876867, + "grad_norm": 0.38995289323023974, + "learning_rate": 0.0001379339563345039, + "loss": 3.0861611366271973, + "step": 4219, + "token_acc": 0.2879450013474881 + }, + { + "epoch": 2.473761360304896, + "grad_norm": 0.36253828854885484, + "learning_rate": 0.00013793231986741853, + "loss": 3.0524117946624756, + "step": 4220, + "token_acc": 0.2953382423054511 + }, + { + "epoch": 2.474347698622105, + "grad_norm": 0.34332029062738817, + "learning_rate": 0.0001379306827622006, + "loss": 3.061717987060547, + "step": 4221, + "token_acc": 0.29321430066125387 + }, + { + "epoch": 2.474934036939314, + "grad_norm": 0.4183935803845294, + "learning_rate": 0.00013792904501886539, + "loss": 3.054370880126953, + "step": 4222, + "token_acc": 0.29172475814433646 + }, + { + "epoch": 2.475520375256523, + "grad_norm": 0.39349046268948934, + "learning_rate": 0.00013792740663742836, + "loss": 3.078822374343872, + "step": 4223, + "token_acc": 0.28989740094487537 + }, + { + "epoch": 2.4761067135737322, + "grad_norm": 0.38669689335923935, + "learning_rate": 0.00013792576761790487, + "loss": 3.0848100185394287, + "step": 4224, + "token_acc": 0.29057060071638796 + }, + { + "epoch": 2.476693051890941, + "grad_norm": 0.41822095385725067, + "learning_rate": 0.00013792412796031033, + "loss": 3.0765552520751953, + "step": 4225, + "token_acc": 0.2908542049172496 + }, + { + "epoch": 2.47727939020815, + "grad_norm": 0.3324041443033771, + "learning_rate": 0.00013792248766466013, + "loss": 3.041658878326416, + "step": 4226, + "token_acc": 0.2961091843321947 + }, + { + "epoch": 2.477865728525359, + "grad_norm": 0.3819159779147814, + "learning_rate": 0.0001379208467309697, + "loss": 3.069956064224243, + "step": 4227, + "token_acc": 0.29074014182830726 + }, + { + "epoch": 2.4784520668425682, + "grad_norm": 0.2911843360175998, + "learning_rate": 0.00013791920515925443, + "loss": 3.0632269382476807, + "step": 4228, + "token_acc": 0.29243242964270416 + }, + { + "epoch": 2.4790384051597774, + "grad_norm": 0.3641360918598509, + "learning_rate": 0.00013791756294952975, + "loss": 3.089081287384033, + "step": 4229, + "token_acc": 0.2892145005573902 + }, + { + "epoch": 2.479624743476986, + "grad_norm": 0.31224454446394195, + "learning_rate": 0.0001379159201018111, + "loss": 2.983107566833496, + "step": 4230, + "token_acc": 0.30269377030335065 + }, + { + "epoch": 2.480211081794195, + "grad_norm": 0.33907558095597723, + "learning_rate": 0.00013791427661611388, + "loss": 3.045147180557251, + "step": 4231, + "token_acc": 0.29418266915872365 + }, + { + "epoch": 2.4807974201114043, + "grad_norm": 0.29503397085956434, + "learning_rate": 0.00013791263249245356, + "loss": 3.080207586288452, + "step": 4232, + "token_acc": 0.2906566606114851 + }, + { + "epoch": 2.4813837584286134, + "grad_norm": 0.40526288667606025, + "learning_rate": 0.00013791098773084556, + "loss": 3.0536293983459473, + "step": 4233, + "token_acc": 0.2935106608885778 + }, + { + "epoch": 2.4819700967458225, + "grad_norm": 0.32034152184078535, + "learning_rate": 0.00013790934233130534, + "loss": 3.0912976264953613, + "step": 4234, + "token_acc": 0.2869190322153884 + }, + { + "epoch": 2.4825564350630316, + "grad_norm": 0.3282582083662335, + "learning_rate": 0.00013790769629384836, + "loss": 3.055314779281616, + "step": 4235, + "token_acc": 0.2945630239260668 + }, + { + "epoch": 2.4831427733802403, + "grad_norm": 0.30870432327588987, + "learning_rate": 0.0001379060496184901, + "loss": 3.088980197906494, + "step": 4236, + "token_acc": 0.28895415817392295 + }, + { + "epoch": 2.4837291116974494, + "grad_norm": 0.3173176333099791, + "learning_rate": 0.00013790440230524597, + "loss": 3.051692485809326, + "step": 4237, + "token_acc": 0.2938765194177441 + }, + { + "epoch": 2.4843154500146585, + "grad_norm": 0.31368686713275207, + "learning_rate": 0.0001379027543541315, + "loss": 3.049684524536133, + "step": 4238, + "token_acc": 0.2953380117975514 + }, + { + "epoch": 2.4849017883318676, + "grad_norm": 0.3149809801837422, + "learning_rate": 0.00013790110576516218, + "loss": 3.0769524574279785, + "step": 4239, + "token_acc": 0.2891963342448858 + }, + { + "epoch": 2.4854881266490767, + "grad_norm": 0.3462843572435515, + "learning_rate": 0.00013789945653835346, + "loss": 3.114223003387451, + "step": 4240, + "token_acc": 0.2855528626980354 + }, + { + "epoch": 2.4860744649662854, + "grad_norm": 0.28886109673806776, + "learning_rate": 0.00013789780667372082, + "loss": 3.0680408477783203, + "step": 4241, + "token_acc": 0.29057683042781146 + }, + { + "epoch": 2.4866608032834945, + "grad_norm": 0.349299940684079, + "learning_rate": 0.00013789615617127977, + "loss": 3.0425429344177246, + "step": 4242, + "token_acc": 0.296533724071615 + }, + { + "epoch": 2.4872471416007036, + "grad_norm": 0.31346690884541845, + "learning_rate": 0.00013789450503104585, + "loss": 3.055014133453369, + "step": 4243, + "token_acc": 0.29257387398356083 + }, + { + "epoch": 2.4878334799179127, + "grad_norm": 0.3650110410265722, + "learning_rate": 0.00013789285325303453, + "loss": 3.0652599334716797, + "step": 4244, + "token_acc": 0.2924835504680499 + }, + { + "epoch": 2.488419818235122, + "grad_norm": 0.3465783492132816, + "learning_rate": 0.00013789120083726133, + "loss": 3.0709056854248047, + "step": 4245, + "token_acc": 0.29106642147532985 + }, + { + "epoch": 2.489006156552331, + "grad_norm": 0.3420653267437592, + "learning_rate": 0.00013788954778374182, + "loss": 3.0507068634033203, + "step": 4246, + "token_acc": 0.29285746173218175 + }, + { + "epoch": 2.4895924948695396, + "grad_norm": 0.3499246183981242, + "learning_rate": 0.00013788789409249148, + "loss": 3.08219838142395, + "step": 4247, + "token_acc": 0.2891330973883964 + }, + { + "epoch": 2.4901788331867487, + "grad_norm": 0.3282782280895836, + "learning_rate": 0.00013788623976352583, + "loss": 3.064849376678467, + "step": 4248, + "token_acc": 0.2926813604947059 + }, + { + "epoch": 2.490765171503958, + "grad_norm": 0.34318138967780115, + "learning_rate": 0.00013788458479686044, + "loss": 3.058095932006836, + "step": 4249, + "token_acc": 0.2913211778210844 + }, + { + "epoch": 2.491351509821167, + "grad_norm": 0.30523612056741006, + "learning_rate": 0.00013788292919251086, + "loss": 3.0718131065368652, + "step": 4250, + "token_acc": 0.29100949209334837 + }, + { + "epoch": 2.4919378481383756, + "grad_norm": 0.3435281735714235, + "learning_rate": 0.00013788127295049266, + "loss": 3.0578694343566895, + "step": 4251, + "token_acc": 0.2934601132060105 + }, + { + "epoch": 2.4925241864555847, + "grad_norm": 0.3066973459152552, + "learning_rate": 0.00013787961607082135, + "loss": 3.060603618621826, + "step": 4252, + "token_acc": 0.29185448265582514 + }, + { + "epoch": 2.493110524772794, + "grad_norm": 0.3260400895731516, + "learning_rate": 0.0001378779585535125, + "loss": 3.0854859352111816, + "step": 4253, + "token_acc": 0.28763019131041095 + }, + { + "epoch": 2.493696863090003, + "grad_norm": 0.34171137484417746, + "learning_rate": 0.0001378763003985817, + "loss": 3.055359125137329, + "step": 4254, + "token_acc": 0.2943689986650297 + }, + { + "epoch": 2.494283201407212, + "grad_norm": 0.3255718948641697, + "learning_rate": 0.00013787464160604454, + "loss": 3.0776758193969727, + "step": 4255, + "token_acc": 0.2912002894801509 + }, + { + "epoch": 2.494869539724421, + "grad_norm": 0.40041071378112875, + "learning_rate": 0.0001378729821759166, + "loss": 3.0606276988983154, + "step": 4256, + "token_acc": 0.29183415865618023 + }, + { + "epoch": 2.49545587804163, + "grad_norm": 0.33064984618074594, + "learning_rate": 0.00013787132210821342, + "loss": 3.0584356784820557, + "step": 4257, + "token_acc": 0.29300502640453013 + }, + { + "epoch": 2.496042216358839, + "grad_norm": 0.3348112213850654, + "learning_rate": 0.00013786966140295063, + "loss": 3.060304641723633, + "step": 4258, + "token_acc": 0.29230683942091423 + }, + { + "epoch": 2.496628554676048, + "grad_norm": 0.3233111763876964, + "learning_rate": 0.00013786800006014384, + "loss": 3.010420322418213, + "step": 4259, + "token_acc": 0.29789607905642335 + }, + { + "epoch": 2.497214892993257, + "grad_norm": 0.3513053485669945, + "learning_rate": 0.00013786633807980864, + "loss": 3.024261951446533, + "step": 4260, + "token_acc": 0.2988729363715654 + }, + { + "epoch": 2.4978012313104663, + "grad_norm": 0.33377933403005405, + "learning_rate": 0.0001378646754619606, + "loss": 3.1106936931610107, + "step": 4261, + "token_acc": 0.2863433497307132 + }, + { + "epoch": 2.498387569627675, + "grad_norm": 0.32618028403367255, + "learning_rate": 0.00013786301220661544, + "loss": 3.039785861968994, + "step": 4262, + "token_acc": 0.29525250714443896 + }, + { + "epoch": 2.498973907944884, + "grad_norm": 0.32642125152802226, + "learning_rate": 0.0001378613483137887, + "loss": 3.0649285316467285, + "step": 4263, + "token_acc": 0.2920402547809539 + }, + { + "epoch": 2.499560246262093, + "grad_norm": 0.3279319853392137, + "learning_rate": 0.00013785968378349608, + "loss": 3.109575033187866, + "step": 4264, + "token_acc": 0.2874243555102476 + }, + { + "epoch": 2.5001465845793023, + "grad_norm": 0.34062724187402776, + "learning_rate": 0.00013785801861575312, + "loss": 3.059476613998413, + "step": 4265, + "token_acc": 0.2932670943639223 + }, + { + "epoch": 2.5007329228965114, + "grad_norm": 0.35822122953879704, + "learning_rate": 0.00013785635281057552, + "loss": 3.057438373565674, + "step": 4266, + "token_acc": 0.29250561493031696 + }, + { + "epoch": 2.5013192612137205, + "grad_norm": 0.2878873626445056, + "learning_rate": 0.00013785468636797894, + "loss": 3.0433664321899414, + "step": 4267, + "token_acc": 0.29565203337876494 + }, + { + "epoch": 2.5019055995309296, + "grad_norm": 0.34055077064709366, + "learning_rate": 0.00013785301928797902, + "loss": 3.068225145339966, + "step": 4268, + "token_acc": 0.29149898173607247 + }, + { + "epoch": 2.5024919378481383, + "grad_norm": 0.2897239315356686, + "learning_rate": 0.0001378513515705914, + "loss": 3.0642426013946533, + "step": 4269, + "token_acc": 0.2906341777380225 + }, + { + "epoch": 2.5030782761653474, + "grad_norm": 0.3189977555595917, + "learning_rate": 0.00013784968321583178, + "loss": 3.09875750541687, + "step": 4270, + "token_acc": 0.28753457743280625 + }, + { + "epoch": 2.5036646144825565, + "grad_norm": 0.34650821922634417, + "learning_rate": 0.00013784801422371583, + "loss": 3.0858874320983887, + "step": 4271, + "token_acc": 0.2894563405059801 + }, + { + "epoch": 2.5042509527997656, + "grad_norm": 0.33144367372316236, + "learning_rate": 0.00013784634459425917, + "loss": 3.0393478870391846, + "step": 4272, + "token_acc": 0.2951814964532814 + }, + { + "epoch": 2.5048372911169743, + "grad_norm": 0.34466773472240236, + "learning_rate": 0.00013784467432747757, + "loss": 3.0754594802856445, + "step": 4273, + "token_acc": 0.28756313289317587 + }, + { + "epoch": 2.5054236294341834, + "grad_norm": 0.3910145280389394, + "learning_rate": 0.00013784300342338662, + "loss": 3.0994250774383545, + "step": 4274, + "token_acc": 0.28978384818313335 + }, + { + "epoch": 2.5060099677513925, + "grad_norm": 0.3157179757155058, + "learning_rate": 0.00013784133188200214, + "loss": 3.048058032989502, + "step": 4275, + "token_acc": 0.2947905013864074 + }, + { + "epoch": 2.5065963060686016, + "grad_norm": 0.3567279973630884, + "learning_rate": 0.0001378396597033397, + "loss": 3.0974514484405518, + "step": 4276, + "token_acc": 0.28893859998024607 + }, + { + "epoch": 2.5071826443858107, + "grad_norm": 0.43479296556506014, + "learning_rate": 0.0001378379868874151, + "loss": 3.068594455718994, + "step": 4277, + "token_acc": 0.2904294314801297 + }, + { + "epoch": 2.50776898270302, + "grad_norm": 0.379952031943408, + "learning_rate": 0.00013783631343424404, + "loss": 3.087218761444092, + "step": 4278, + "token_acc": 0.28977908627195903 + }, + { + "epoch": 2.5083553210202285, + "grad_norm": 0.323089690761266, + "learning_rate": 0.00013783463934384223, + "loss": 3.067884922027588, + "step": 4279, + "token_acc": 0.2924608166591504 + }, + { + "epoch": 2.5089416593374376, + "grad_norm": 0.38301383582038023, + "learning_rate": 0.00013783296461622536, + "loss": 3.068693161010742, + "step": 4280, + "token_acc": 0.2913203074416759 + }, + { + "epoch": 2.5095279976546467, + "grad_norm": 0.3335160923088287, + "learning_rate": 0.00013783128925140922, + "loss": 3.076441764831543, + "step": 4281, + "token_acc": 0.29036081954779547 + }, + { + "epoch": 2.510114335971856, + "grad_norm": 0.32782932537831677, + "learning_rate": 0.00013782961324940952, + "loss": 3.0684382915496826, + "step": 4282, + "token_acc": 0.2917029886245009 + }, + { + "epoch": 2.5107006742890645, + "grad_norm": 0.3482103885560477, + "learning_rate": 0.00013782793661024198, + "loss": 3.0610227584838867, + "step": 4283, + "token_acc": 0.29191147460573036 + }, + { + "epoch": 2.5112870126062736, + "grad_norm": 0.28217670080206, + "learning_rate": 0.00013782625933392238, + "loss": 3.080874443054199, + "step": 4284, + "token_acc": 0.28883241914324376 + }, + { + "epoch": 2.5118733509234827, + "grad_norm": 0.3524378323998762, + "learning_rate": 0.0001378245814204665, + "loss": 3.096837043762207, + "step": 4285, + "token_acc": 0.2887206501310321 + }, + { + "epoch": 2.512459689240692, + "grad_norm": 0.29302820360266363, + "learning_rate": 0.00013782290286989005, + "loss": 3.076169013977051, + "step": 4286, + "token_acc": 0.2922551717671974 + }, + { + "epoch": 2.513046027557901, + "grad_norm": 0.3516223373681763, + "learning_rate": 0.00013782122368220882, + "loss": 3.0763235092163086, + "step": 4287, + "token_acc": 0.2910851470418489 + }, + { + "epoch": 2.51363236587511, + "grad_norm": 0.3276716543671828, + "learning_rate": 0.0001378195438574386, + "loss": 3.082284688949585, + "step": 4288, + "token_acc": 0.28893827226602764 + }, + { + "epoch": 2.514218704192319, + "grad_norm": 0.3058665190306901, + "learning_rate": 0.00013781786339559513, + "loss": 3.071730136871338, + "step": 4289, + "token_acc": 0.29066122692671276 + }, + { + "epoch": 2.514805042509528, + "grad_norm": 0.30779023975600356, + "learning_rate": 0.00013781618229669423, + "loss": 3.0408101081848145, + "step": 4290, + "token_acc": 0.2950144008919262 + }, + { + "epoch": 2.515391380826737, + "grad_norm": 0.3202775836283203, + "learning_rate": 0.00013781450056075167, + "loss": 3.051098108291626, + "step": 4291, + "token_acc": 0.2926012300491859 + }, + { + "epoch": 2.515977719143946, + "grad_norm": 0.3006103798600977, + "learning_rate": 0.0001378128181877833, + "loss": 3.074449062347412, + "step": 4292, + "token_acc": 0.2913724072017152 + }, + { + "epoch": 2.516564057461155, + "grad_norm": 0.2997866337920747, + "learning_rate": 0.00013781113517780482, + "loss": 3.071866035461426, + "step": 4293, + "token_acc": 0.2908849892036599 + }, + { + "epoch": 2.517150395778364, + "grad_norm": 0.3612803089453866, + "learning_rate": 0.00013780945153083214, + "loss": 3.0768675804138184, + "step": 4294, + "token_acc": 0.2905536803683768 + }, + { + "epoch": 2.517736734095573, + "grad_norm": 0.3492051065809907, + "learning_rate": 0.00013780776724688104, + "loss": 3.0407323837280273, + "step": 4295, + "token_acc": 0.2955518063254867 + }, + { + "epoch": 2.518323072412782, + "grad_norm": 0.32233859543079346, + "learning_rate": 0.00013780608232596733, + "loss": 3.0610733032226562, + "step": 4296, + "token_acc": 0.29308770439583776 + }, + { + "epoch": 2.518909410729991, + "grad_norm": 0.2898712553405986, + "learning_rate": 0.00013780439676810684, + "loss": 3.0600483417510986, + "step": 4297, + "token_acc": 0.2937109165624442 + }, + { + "epoch": 2.5194957490472003, + "grad_norm": 0.357939894791837, + "learning_rate": 0.0001378027105733154, + "loss": 3.102938175201416, + "step": 4298, + "token_acc": 0.28729378278539025 + }, + { + "epoch": 2.5200820873644094, + "grad_norm": 0.34509748601686585, + "learning_rate": 0.0001378010237416089, + "loss": 3.080904722213745, + "step": 4299, + "token_acc": 0.28989886547981897 + }, + { + "epoch": 2.5206684256816185, + "grad_norm": 0.31208613715378564, + "learning_rate": 0.00013779933627300312, + "loss": 3.067377805709839, + "step": 4300, + "token_acc": 0.290322498878364 + }, + { + "epoch": 2.521254763998827, + "grad_norm": 0.3994709338086788, + "learning_rate": 0.00013779764816751393, + "loss": 3.050339698791504, + "step": 4301, + "token_acc": 0.2936385796307557 + }, + { + "epoch": 2.5218411023160363, + "grad_norm": 0.2721892782996403, + "learning_rate": 0.0001377959594251572, + "loss": 3.055325508117676, + "step": 4302, + "token_acc": 0.29275273794710127 + }, + { + "epoch": 2.5224274406332454, + "grad_norm": 0.32841498187703594, + "learning_rate": 0.0001377942700459488, + "loss": 3.057478427886963, + "step": 4303, + "token_acc": 0.2921735958404406 + }, + { + "epoch": 2.5230137789504545, + "grad_norm": 0.3451368924238096, + "learning_rate": 0.00013779258002990456, + "loss": 3.0445172786712646, + "step": 4304, + "token_acc": 0.29383035599467006 + }, + { + "epoch": 2.523600117267663, + "grad_norm": 0.32853011091045264, + "learning_rate": 0.0001377908893770404, + "loss": 3.054691791534424, + "step": 4305, + "token_acc": 0.29331485205548496 + }, + { + "epoch": 2.5241864555848723, + "grad_norm": 0.3054309590948566, + "learning_rate": 0.00013778919808737217, + "loss": 3.0670371055603027, + "step": 4306, + "token_acc": 0.29250297038105744 + }, + { + "epoch": 2.5247727939020814, + "grad_norm": 0.3418705175986472, + "learning_rate": 0.00013778750616091578, + "loss": 3.0597779750823975, + "step": 4307, + "token_acc": 0.2910893950140542 + }, + { + "epoch": 2.5253591322192905, + "grad_norm": 0.2898373075052804, + "learning_rate": 0.00013778581359768713, + "loss": 3.0802512168884277, + "step": 4308, + "token_acc": 0.2887073114304818 + }, + { + "epoch": 2.5259454705364996, + "grad_norm": 0.40397109090333105, + "learning_rate": 0.0001377841203977021, + "loss": 3.0814902782440186, + "step": 4309, + "token_acc": 0.28875920544149664 + }, + { + "epoch": 2.5265318088537088, + "grad_norm": 0.411873298303554, + "learning_rate": 0.00013778242656097657, + "loss": 3.0762405395507812, + "step": 4310, + "token_acc": 0.2892339643486944 + }, + { + "epoch": 2.527118147170918, + "grad_norm": 0.35473920263633446, + "learning_rate": 0.0001377807320875265, + "loss": 3.0415313243865967, + "step": 4311, + "token_acc": 0.29668444817921213 + }, + { + "epoch": 2.5277044854881265, + "grad_norm": 0.3793501293213284, + "learning_rate": 0.0001377790369773678, + "loss": 3.0777368545532227, + "step": 4312, + "token_acc": 0.2898947193898698 + }, + { + "epoch": 2.5282908238053357, + "grad_norm": 0.3204294208503375, + "learning_rate": 0.00013777734123051634, + "loss": 3.054312229156494, + "step": 4313, + "token_acc": 0.293183333874517 + }, + { + "epoch": 2.5288771621225448, + "grad_norm": 0.34049830855116353, + "learning_rate": 0.00013777564484698815, + "loss": 3.134559154510498, + "step": 4314, + "token_acc": 0.28249032152792214 + }, + { + "epoch": 2.529463500439754, + "grad_norm": 0.31651240769526845, + "learning_rate": 0.00013777394782679906, + "loss": 3.0874271392822266, + "step": 4315, + "token_acc": 0.28961005643919957 + }, + { + "epoch": 2.5300498387569625, + "grad_norm": 0.366071591727774, + "learning_rate": 0.00013777225016996507, + "loss": 3.068683624267578, + "step": 4316, + "token_acc": 0.29148187716601087 + }, + { + "epoch": 2.5306361770741717, + "grad_norm": 0.3357283503405669, + "learning_rate": 0.00013777055187650213, + "loss": 3.119310140609741, + "step": 4317, + "token_acc": 0.2856781330306944 + }, + { + "epoch": 2.5312225153913808, + "grad_norm": 0.31886789637334434, + "learning_rate": 0.00013776885294642616, + "loss": 3.0851244926452637, + "step": 4318, + "token_acc": 0.2885755906077981 + }, + { + "epoch": 2.53180885370859, + "grad_norm": 0.32345173941705485, + "learning_rate": 0.00013776715337975314, + "loss": 3.085324764251709, + "step": 4319, + "token_acc": 0.28887107971251225 + }, + { + "epoch": 2.532395192025799, + "grad_norm": 0.41026587487479954, + "learning_rate": 0.00013776545317649902, + "loss": 3.040525197982788, + "step": 4320, + "token_acc": 0.29548629808945015 + }, + { + "epoch": 2.532981530343008, + "grad_norm": 0.29701460776718747, + "learning_rate": 0.0001377637523366798, + "loss": 3.042804718017578, + "step": 4321, + "token_acc": 0.2950354609929078 + }, + { + "epoch": 2.533567868660217, + "grad_norm": 0.3341609948907965, + "learning_rate": 0.00013776205086031142, + "loss": 3.0456385612487793, + "step": 4322, + "token_acc": 0.29432571651446476 + }, + { + "epoch": 2.534154206977426, + "grad_norm": 0.3270859982234725, + "learning_rate": 0.0001377603487474099, + "loss": 3.0311717987060547, + "step": 4323, + "token_acc": 0.2976740437634372 + }, + { + "epoch": 2.534740545294635, + "grad_norm": 0.34589640576174213, + "learning_rate": 0.00013775864599799122, + "loss": 3.078972816467285, + "step": 4324, + "token_acc": 0.2886036429434313 + }, + { + "epoch": 2.535326883611844, + "grad_norm": 0.3798025012938151, + "learning_rate": 0.00013775694261207132, + "loss": 3.035907030105591, + "step": 4325, + "token_acc": 0.2963933913587325 + }, + { + "epoch": 2.535913221929053, + "grad_norm": 0.3411297394985252, + "learning_rate": 0.0001377552385896663, + "loss": 3.0305399894714355, + "step": 4326, + "token_acc": 0.295746139780577 + }, + { + "epoch": 2.536499560246262, + "grad_norm": 0.36270059116668296, + "learning_rate": 0.00013775353393079208, + "loss": 3.0688321590423584, + "step": 4327, + "token_acc": 0.29262879857055923 + }, + { + "epoch": 2.537085898563471, + "grad_norm": 0.3178199827036867, + "learning_rate": 0.00013775182863546472, + "loss": 3.0445261001586914, + "step": 4328, + "token_acc": 0.29479870778572076 + }, + { + "epoch": 2.53767223688068, + "grad_norm": 0.3572778037203079, + "learning_rate": 0.00013775012270370024, + "loss": 3.067387819290161, + "step": 4329, + "token_acc": 0.2911311900756086 + }, + { + "epoch": 2.538258575197889, + "grad_norm": 0.3756105965781068, + "learning_rate": 0.00013774841613551462, + "loss": 3.101994514465332, + "step": 4330, + "token_acc": 0.28699396668822347 + }, + { + "epoch": 2.5388449135150983, + "grad_norm": 0.31387746581542386, + "learning_rate": 0.00013774670893092395, + "loss": 3.0922460556030273, + "step": 4331, + "token_acc": 0.2873343540194689 + }, + { + "epoch": 2.5394312518323074, + "grad_norm": 0.34828542273918517, + "learning_rate": 0.00013774500108994426, + "loss": 3.057373523712158, + "step": 4332, + "token_acc": 0.2925273236743204 + }, + { + "epoch": 2.540017590149516, + "grad_norm": 0.34236386419673176, + "learning_rate": 0.00013774329261259153, + "loss": 3.0289864540100098, + "step": 4333, + "token_acc": 0.2971059136912446 + }, + { + "epoch": 2.5406039284667252, + "grad_norm": 0.306711554906487, + "learning_rate": 0.00013774158349888187, + "loss": 3.0758490562438965, + "step": 4334, + "token_acc": 0.289871342810008 + }, + { + "epoch": 2.5411902667839343, + "grad_norm": 0.2958320791178048, + "learning_rate": 0.00013773987374883132, + "loss": 3.0591182708740234, + "step": 4335, + "token_acc": 0.29313643748931073 + }, + { + "epoch": 2.5417766051011434, + "grad_norm": 0.30292561541597957, + "learning_rate": 0.0001377381633624559, + "loss": 3.087907314300537, + "step": 4336, + "token_acc": 0.28757834066277427 + }, + { + "epoch": 2.542362943418352, + "grad_norm": 0.3095580043840721, + "learning_rate": 0.00013773645233977177, + "loss": 3.035019636154175, + "step": 4337, + "token_acc": 0.2948293623271184 + }, + { + "epoch": 2.5429492817355612, + "grad_norm": 0.34191527654712034, + "learning_rate": 0.00013773474068079492, + "loss": 3.115413188934326, + "step": 4338, + "token_acc": 0.28637658596513527 + }, + { + "epoch": 2.5435356200527703, + "grad_norm": 0.4868577647090121, + "learning_rate": 0.00013773302838554144, + "loss": 3.0621137619018555, + "step": 4339, + "token_acc": 0.2913441723760179 + }, + { + "epoch": 2.5441219583699795, + "grad_norm": 0.45508902422407604, + "learning_rate": 0.00013773131545402744, + "loss": 3.048271656036377, + "step": 4340, + "token_acc": 0.2947804487010453 + }, + { + "epoch": 2.5447082966871886, + "grad_norm": 0.34032648949072936, + "learning_rate": 0.000137729601886269, + "loss": 3.0537521839141846, + "step": 4341, + "token_acc": 0.294519686030233 + }, + { + "epoch": 2.5452946350043977, + "grad_norm": 0.4238679292428818, + "learning_rate": 0.00013772788768228223, + "loss": 3.075558662414551, + "step": 4342, + "token_acc": 0.2908509224505294 + }, + { + "epoch": 2.545880973321607, + "grad_norm": 0.3412280308881297, + "learning_rate": 0.00013772617284208322, + "loss": 3.08547306060791, + "step": 4343, + "token_acc": 0.28871251388374675 + }, + { + "epoch": 2.5464673116388155, + "grad_norm": 0.3445728552098471, + "learning_rate": 0.00013772445736568806, + "loss": 3.0621161460876465, + "step": 4344, + "token_acc": 0.292539300267252 + }, + { + "epoch": 2.5470536499560246, + "grad_norm": 0.39660816859009007, + "learning_rate": 0.0001377227412531129, + "loss": 3.058565616607666, + "step": 4345, + "token_acc": 0.2931757312086949 + }, + { + "epoch": 2.5476399882732337, + "grad_norm": 0.31858294672020915, + "learning_rate": 0.00013772102450437384, + "loss": 3.098022699356079, + "step": 4346, + "token_acc": 0.2871662657022624 + }, + { + "epoch": 2.548226326590443, + "grad_norm": 0.3704959565672287, + "learning_rate": 0.000137719307119487, + "loss": 3.129561424255371, + "step": 4347, + "token_acc": 0.2836818821636839 + }, + { + "epoch": 2.5488126649076515, + "grad_norm": 0.30503791512508854, + "learning_rate": 0.00013771758909846853, + "loss": 3.0888030529022217, + "step": 4348, + "token_acc": 0.28812711252648393 + }, + { + "epoch": 2.5493990032248606, + "grad_norm": 0.3249582948008529, + "learning_rate": 0.00013771587044133458, + "loss": 3.0452113151550293, + "step": 4349, + "token_acc": 0.29335038696962973 + }, + { + "epoch": 2.5499853415420697, + "grad_norm": 0.3627027210169185, + "learning_rate": 0.00013771415114810124, + "loss": 3.0877909660339355, + "step": 4350, + "token_acc": 0.2896060295726454 + }, + { + "epoch": 2.550571679859279, + "grad_norm": 0.2863728945010151, + "learning_rate": 0.0001377124312187847, + "loss": 3.0836799144744873, + "step": 4351, + "token_acc": 0.28822915703701746 + }, + { + "epoch": 2.551158018176488, + "grad_norm": 0.32658920826979826, + "learning_rate": 0.00013771071065340112, + "loss": 3.0722928047180176, + "step": 4352, + "token_acc": 0.2915955693473332 + }, + { + "epoch": 2.551744356493697, + "grad_norm": 0.30630292575854784, + "learning_rate": 0.00013770898945196665, + "loss": 3.0500426292419434, + "step": 4353, + "token_acc": 0.2935960199795438 + }, + { + "epoch": 2.552330694810906, + "grad_norm": 0.33534661856836845, + "learning_rate": 0.00013770726761449747, + "loss": 3.0550284385681152, + "step": 4354, + "token_acc": 0.2929844358919804 + }, + { + "epoch": 2.552917033128115, + "grad_norm": 0.3354512800164192, + "learning_rate": 0.00013770554514100974, + "loss": 3.109459400177002, + "step": 4355, + "token_acc": 0.2850557203559689 + }, + { + "epoch": 2.553503371445324, + "grad_norm": 0.28807130552191323, + "learning_rate": 0.00013770382203151968, + "loss": 3.05765438079834, + "step": 4356, + "token_acc": 0.29253885431328774 + }, + { + "epoch": 2.554089709762533, + "grad_norm": 0.32104397198699836, + "learning_rate": 0.0001377020982860434, + "loss": 3.0617847442626953, + "step": 4357, + "token_acc": 0.2930197893376279 + }, + { + "epoch": 2.554676048079742, + "grad_norm": 0.33569226700335814, + "learning_rate": 0.00013770037390459718, + "loss": 3.030935287475586, + "step": 4358, + "token_acc": 0.2970840059775139 + }, + { + "epoch": 2.555262386396951, + "grad_norm": 0.26900104899872107, + "learning_rate": 0.00013769864888719716, + "loss": 3.08833646774292, + "step": 4359, + "token_acc": 0.2887131646276103 + }, + { + "epoch": 2.55584872471416, + "grad_norm": 0.3346223539889032, + "learning_rate": 0.00013769692323385954, + "loss": 3.0506608486175537, + "step": 4360, + "token_acc": 0.29234854762995344 + }, + { + "epoch": 2.556435063031369, + "grad_norm": 0.33583176041228646, + "learning_rate": 0.00013769519694460056, + "loss": 3.034132957458496, + "step": 4361, + "token_acc": 0.29706054216102235 + }, + { + "epoch": 2.557021401348578, + "grad_norm": 0.29859857114913757, + "learning_rate": 0.00013769347001943641, + "loss": 3.113027811050415, + "step": 4362, + "token_acc": 0.2839442084840858 + }, + { + "epoch": 2.5576077396657872, + "grad_norm": 0.3481407533014389, + "learning_rate": 0.00013769174245838333, + "loss": 3.1086015701293945, + "step": 4363, + "token_acc": 0.2844130172333736 + }, + { + "epoch": 2.5581940779829964, + "grad_norm": 0.32083260961973054, + "learning_rate": 0.00013769001426145757, + "loss": 3.0893335342407227, + "step": 4364, + "token_acc": 0.2881651092336576 + }, + { + "epoch": 2.5587804163002055, + "grad_norm": 0.3464046346646123, + "learning_rate": 0.00013768828542867534, + "loss": 3.066728115081787, + "step": 4365, + "token_acc": 0.29186022203096706 + }, + { + "epoch": 2.559366754617414, + "grad_norm": 0.3557937497058574, + "learning_rate": 0.00013768655596005285, + "loss": 3.0987534523010254, + "step": 4366, + "token_acc": 0.285965893837859 + }, + { + "epoch": 2.5599530929346233, + "grad_norm": 0.33181976637690075, + "learning_rate": 0.0001376848258556064, + "loss": 3.0534462928771973, + "step": 4367, + "token_acc": 0.2924701525889038 + }, + { + "epoch": 2.5605394312518324, + "grad_norm": 0.35568632908601167, + "learning_rate": 0.00013768309511535222, + "loss": 3.087346315383911, + "step": 4368, + "token_acc": 0.2880468422418785 + }, + { + "epoch": 2.5611257695690415, + "grad_norm": 0.30334467315122554, + "learning_rate": 0.00013768136373930654, + "loss": 3.0978758335113525, + "step": 4369, + "token_acc": 0.2873987975951904 + }, + { + "epoch": 2.56171210788625, + "grad_norm": 0.30311635466958564, + "learning_rate": 0.00013767963172748565, + "loss": 3.102729558944702, + "step": 4370, + "token_acc": 0.28758861664955937 + }, + { + "epoch": 2.5622984462034593, + "grad_norm": 0.3505588600232638, + "learning_rate": 0.00013767789907990582, + "loss": 3.0719733238220215, + "step": 4371, + "token_acc": 0.2896677532532818 + }, + { + "epoch": 2.5628847845206684, + "grad_norm": 0.31030622273303227, + "learning_rate": 0.00013767616579658335, + "loss": 3.068525552749634, + "step": 4372, + "token_acc": 0.2900499979339697 + }, + { + "epoch": 2.5634711228378775, + "grad_norm": 0.31996443758474635, + "learning_rate": 0.0001376744318775345, + "loss": 3.091991662979126, + "step": 4373, + "token_acc": 0.28940168478652173 + }, + { + "epoch": 2.5640574611550866, + "grad_norm": 0.33831918930364835, + "learning_rate": 0.00013767269732277554, + "loss": 3.077623128890991, + "step": 4374, + "token_acc": 0.29023333289954845 + }, + { + "epoch": 2.5646437994722957, + "grad_norm": 0.3533177035662115, + "learning_rate": 0.00013767096213232276, + "loss": 3.0500428676605225, + "step": 4375, + "token_acc": 0.29363014658919195 + }, + { + "epoch": 2.565230137789505, + "grad_norm": 0.3759404470112113, + "learning_rate": 0.00013766922630619252, + "loss": 3.113585948944092, + "step": 4376, + "token_acc": 0.28521281838868323 + }, + { + "epoch": 2.5658164761067135, + "grad_norm": 0.28073353947347796, + "learning_rate": 0.00013766748984440105, + "loss": 3.0696945190429688, + "step": 4377, + "token_acc": 0.2909410936399906 + }, + { + "epoch": 2.5664028144239226, + "grad_norm": 0.3388425200840157, + "learning_rate": 0.0001376657527469647, + "loss": 3.0615196228027344, + "step": 4378, + "token_acc": 0.29084006733150597 + }, + { + "epoch": 2.5669891527411317, + "grad_norm": 0.3225644700019978, + "learning_rate": 0.0001376640150138998, + "loss": 3.0399904251098633, + "step": 4379, + "token_acc": 0.2940290073460162 + }, + { + "epoch": 2.567575491058341, + "grad_norm": 0.2794387992121348, + "learning_rate": 0.00013766227664522266, + "loss": 3.0509774684906006, + "step": 4380, + "token_acc": 0.29323887303964896 + }, + { + "epoch": 2.5681618293755495, + "grad_norm": 0.29651086689929507, + "learning_rate": 0.0001376605376409496, + "loss": 3.104763984680176, + "step": 4381, + "token_acc": 0.28595544492969993 + }, + { + "epoch": 2.5687481676927586, + "grad_norm": 0.28481032604770207, + "learning_rate": 0.00013765879800109695, + "loss": 3.037874460220337, + "step": 4382, + "token_acc": 0.2940751099318874 + }, + { + "epoch": 2.5693345060099677, + "grad_norm": 0.3023541757431778, + "learning_rate": 0.00013765705772568107, + "loss": 3.062717914581299, + "step": 4383, + "token_acc": 0.292416659887314 + }, + { + "epoch": 2.569920844327177, + "grad_norm": 0.3484368017764343, + "learning_rate": 0.0001376553168147183, + "loss": 3.0805723667144775, + "step": 4384, + "token_acc": 0.28964922606693305 + }, + { + "epoch": 2.570507182644386, + "grad_norm": 0.2978548722386638, + "learning_rate": 0.000137653575268225, + "loss": 3.062324047088623, + "step": 4385, + "token_acc": 0.29283459244429827 + }, + { + "epoch": 2.571093520961595, + "grad_norm": 0.3597604048050465, + "learning_rate": 0.00013765183308621752, + "loss": 3.0830540657043457, + "step": 4386, + "token_acc": 0.2922269150288201 + }, + { + "epoch": 2.5716798592788037, + "grad_norm": 0.3932886884573602, + "learning_rate": 0.00013765009026871225, + "loss": 3.0906801223754883, + "step": 4387, + "token_acc": 0.28683908419433796 + }, + { + "epoch": 2.572266197596013, + "grad_norm": 0.44127435472880217, + "learning_rate": 0.00013764834681572553, + "loss": 3.078768253326416, + "step": 4388, + "token_acc": 0.2906192997864125 + }, + { + "epoch": 2.572852535913222, + "grad_norm": 0.3287074289209757, + "learning_rate": 0.00013764660272727375, + "loss": 3.0593152046203613, + "step": 4389, + "token_acc": 0.29274742398365383 + }, + { + "epoch": 2.573438874230431, + "grad_norm": 0.34523479112255184, + "learning_rate": 0.00013764485800337326, + "loss": 3.093247175216675, + "step": 4390, + "token_acc": 0.28899363064011147 + }, + { + "epoch": 2.5740252125476397, + "grad_norm": 0.35931216919659087, + "learning_rate": 0.0001376431126440405, + "loss": 3.0674726963043213, + "step": 4391, + "token_acc": 0.2914914151279091 + }, + { + "epoch": 2.574611550864849, + "grad_norm": 0.32560710316110114, + "learning_rate": 0.00013764136664929185, + "loss": 3.068941593170166, + "step": 4392, + "token_acc": 0.2888528913564522 + }, + { + "epoch": 2.575197889182058, + "grad_norm": 0.31078626938133513, + "learning_rate": 0.00013763962001914372, + "loss": 3.033418893814087, + "step": 4393, + "token_acc": 0.2967011077365646 + }, + { + "epoch": 2.575784227499267, + "grad_norm": 0.29736251295161276, + "learning_rate": 0.00013763787275361251, + "loss": 3.080585479736328, + "step": 4394, + "token_acc": 0.291277608484794 + }, + { + "epoch": 2.576370565816476, + "grad_norm": 0.36419033723368166, + "learning_rate": 0.00013763612485271463, + "loss": 3.06382417678833, + "step": 4395, + "token_acc": 0.2932593747184923 + }, + { + "epoch": 2.5769569041336853, + "grad_norm": 0.33271998331515396, + "learning_rate": 0.0001376343763164665, + "loss": 3.0995993614196777, + "step": 4396, + "token_acc": 0.2853398844997897 + }, + { + "epoch": 2.5775432424508944, + "grad_norm": 0.3288883294718207, + "learning_rate": 0.0001376326271448845, + "loss": 3.1448240280151367, + "step": 4397, + "token_acc": 0.2794351218838478 + }, + { + "epoch": 2.578129580768103, + "grad_norm": 0.39622551231249903, + "learning_rate": 0.00013763087733798513, + "loss": 3.0747480392456055, + "step": 4398, + "token_acc": 0.29000966898704983 + }, + { + "epoch": 2.578715919085312, + "grad_norm": 0.32933547860736917, + "learning_rate": 0.0001376291268957848, + "loss": 3.082432985305786, + "step": 4399, + "token_acc": 0.2898494915254237 + }, + { + "epoch": 2.5793022574025213, + "grad_norm": 0.31636511042155024, + "learning_rate": 0.00013762737581829998, + "loss": 3.08817720413208, + "step": 4400, + "token_acc": 0.2888427965578293 + }, + { + "epoch": 2.5798885957197304, + "grad_norm": 0.3799530777196542, + "learning_rate": 0.00013762562410554705, + "loss": 3.0421314239501953, + "step": 4401, + "token_acc": 0.2951059971233819 + }, + { + "epoch": 2.580474934036939, + "grad_norm": 0.3271510053089545, + "learning_rate": 0.00013762387175754253, + "loss": 3.0729103088378906, + "step": 4402, + "token_acc": 0.29212287893726324 + }, + { + "epoch": 2.581061272354148, + "grad_norm": 0.30043137893886745, + "learning_rate": 0.00013762211877430284, + "loss": 3.0770773887634277, + "step": 4403, + "token_acc": 0.2898860148939099 + }, + { + "epoch": 2.5816476106713573, + "grad_norm": 0.30752389383480555, + "learning_rate": 0.00013762036515584453, + "loss": 3.0742204189300537, + "step": 4404, + "token_acc": 0.29042238432892925 + }, + { + "epoch": 2.5822339489885664, + "grad_norm": 0.3423163134917392, + "learning_rate": 0.00013761861090218395, + "loss": 3.0139851570129395, + "step": 4405, + "token_acc": 0.2978366013407122 + }, + { + "epoch": 2.5828202873057755, + "grad_norm": 0.3306011378717173, + "learning_rate": 0.00013761685601333765, + "loss": 3.0308685302734375, + "step": 4406, + "token_acc": 0.29587622558392096 + }, + { + "epoch": 2.5834066256229846, + "grad_norm": 0.3382412141939098, + "learning_rate": 0.0001376151004893221, + "loss": 3.030334949493408, + "step": 4407, + "token_acc": 0.29731246276912604 + }, + { + "epoch": 2.5839929639401937, + "grad_norm": 0.34161288647340077, + "learning_rate": 0.0001376133443301538, + "loss": 3.067458152770996, + "step": 4408, + "token_acc": 0.29262269107175126 + }, + { + "epoch": 2.5845793022574024, + "grad_norm": 0.34660139986282135, + "learning_rate": 0.00013761158753584923, + "loss": 3.044356107711792, + "step": 4409, + "token_acc": 0.2941715928980975 + }, + { + "epoch": 2.5851656405746115, + "grad_norm": 0.44300900563454804, + "learning_rate": 0.0001376098301064249, + "loss": 3.1110329627990723, + "step": 4410, + "token_acc": 0.2852833491892515 + }, + { + "epoch": 2.5857519788918206, + "grad_norm": 0.4185773792082414, + "learning_rate": 0.00013760807204189735, + "loss": 3.0119833946228027, + "step": 4411, + "token_acc": 0.2983083851206553 + }, + { + "epoch": 2.5863383172090297, + "grad_norm": 0.3245734483685519, + "learning_rate": 0.00013760631334228305, + "loss": 3.1030757427215576, + "step": 4412, + "token_acc": 0.2866843719828082 + }, + { + "epoch": 2.5869246555262384, + "grad_norm": 0.3918930447489074, + "learning_rate": 0.00013760455400759855, + "loss": 3.040175437927246, + "step": 4413, + "token_acc": 0.2951974680776859 + }, + { + "epoch": 2.5875109938434475, + "grad_norm": 0.3280369382564077, + "learning_rate": 0.00013760279403786033, + "loss": 3.0834498405456543, + "step": 4414, + "token_acc": 0.28941022860091187 + }, + { + "epoch": 2.5880973321606566, + "grad_norm": 0.3721016978335643, + "learning_rate": 0.00013760103343308497, + "loss": 3.050204277038574, + "step": 4415, + "token_acc": 0.2933115954617739 + }, + { + "epoch": 2.5886836704778657, + "grad_norm": 0.36765475093283806, + "learning_rate": 0.000137599272193289, + "loss": 3.0471296310424805, + "step": 4416, + "token_acc": 0.29502397064944275 + }, + { + "epoch": 2.589270008795075, + "grad_norm": 0.3164681351459282, + "learning_rate": 0.00013759751031848898, + "loss": 3.0877580642700195, + "step": 4417, + "token_acc": 0.28837652965171007 + }, + { + "epoch": 2.589856347112284, + "grad_norm": 0.38515495908202785, + "learning_rate": 0.0001375957478087014, + "loss": 3.120756149291992, + "step": 4418, + "token_acc": 0.2851830119052276 + }, + { + "epoch": 2.590442685429493, + "grad_norm": 0.32897531029187227, + "learning_rate": 0.00013759398466394287, + "loss": 3.0457823276519775, + "step": 4419, + "token_acc": 0.2943217151097315 + }, + { + "epoch": 2.5910290237467017, + "grad_norm": 0.3476061476502033, + "learning_rate": 0.00013759222088422993, + "loss": 3.0698466300964355, + "step": 4420, + "token_acc": 0.29171058117343013 + }, + { + "epoch": 2.591615362063911, + "grad_norm": 0.32636911759625703, + "learning_rate": 0.0001375904564695792, + "loss": 3.059525966644287, + "step": 4421, + "token_acc": 0.2919048731728847 + }, + { + "epoch": 2.59220170038112, + "grad_norm": 0.2797280582960858, + "learning_rate": 0.00013758869142000714, + "loss": 3.068411350250244, + "step": 4422, + "token_acc": 0.29225366165143474 + }, + { + "epoch": 2.592788038698329, + "grad_norm": 0.3190398761294155, + "learning_rate": 0.00013758692573553048, + "loss": 3.0230913162231445, + "step": 4423, + "token_acc": 0.2979669297058839 + }, + { + "epoch": 2.5933743770155377, + "grad_norm": 0.3234341558786688, + "learning_rate": 0.00013758515941616567, + "loss": 3.0942068099975586, + "step": 4424, + "token_acc": 0.2883273609039131 + }, + { + "epoch": 2.593960715332747, + "grad_norm": 0.33706690408519574, + "learning_rate": 0.00013758339246192937, + "loss": 3.071619987487793, + "step": 4425, + "token_acc": 0.29056958058674753 + }, + { + "epoch": 2.594547053649956, + "grad_norm": 0.3033290298802906, + "learning_rate": 0.00013758162487283816, + "loss": 3.081899404525757, + "step": 4426, + "token_acc": 0.2907523591975111 + }, + { + "epoch": 2.595133391967165, + "grad_norm": 0.3345095515346861, + "learning_rate": 0.00013757985664890866, + "loss": 3.038045644760132, + "step": 4427, + "token_acc": 0.2950555972013993 + }, + { + "epoch": 2.595719730284374, + "grad_norm": 0.32593143230337646, + "learning_rate": 0.00013757808779015748, + "loss": 3.0481486320495605, + "step": 4428, + "token_acc": 0.29386033713499543 + }, + { + "epoch": 2.5963060686015833, + "grad_norm": 0.30887006936395417, + "learning_rate": 0.00013757631829660125, + "loss": 3.048729181289673, + "step": 4429, + "token_acc": 0.2955832829970524 + }, + { + "epoch": 2.5968924069187924, + "grad_norm": 0.4095987781046983, + "learning_rate": 0.00013757454816825654, + "loss": 3.075457811355591, + "step": 4430, + "token_acc": 0.2919607656315585 + }, + { + "epoch": 2.597478745236001, + "grad_norm": 0.28132534412322546, + "learning_rate": 0.00013757277740514, + "loss": 3.0803680419921875, + "step": 4431, + "token_acc": 0.289418904138588 + }, + { + "epoch": 2.59806508355321, + "grad_norm": 0.37715632161876317, + "learning_rate": 0.00013757100600726827, + "loss": 3.0984158515930176, + "step": 4432, + "token_acc": 0.284995511820597 + }, + { + "epoch": 2.5986514218704193, + "grad_norm": 0.364524089161537, + "learning_rate": 0.00013756923397465802, + "loss": 3.0697059631347656, + "step": 4433, + "token_acc": 0.2905751769119277 + }, + { + "epoch": 2.5992377601876284, + "grad_norm": 0.3695863138738358, + "learning_rate": 0.00013756746130732587, + "loss": 3.063328981399536, + "step": 4434, + "token_acc": 0.29063575697287175 + }, + { + "epoch": 2.599824098504837, + "grad_norm": 0.3057216796050019, + "learning_rate": 0.00013756568800528843, + "loss": 3.055208921432495, + "step": 4435, + "token_acc": 0.29375575458274045 + }, + { + "epoch": 2.600410436822046, + "grad_norm": 0.39354702922972107, + "learning_rate": 0.00013756391406856243, + "loss": 3.0608880519866943, + "step": 4436, + "token_acc": 0.29189058711809734 + }, + { + "epoch": 2.6009967751392553, + "grad_norm": 0.3220702758052491, + "learning_rate": 0.00013756213949716448, + "loss": 3.0279791355133057, + "step": 4437, + "token_acc": 0.2958560226815017 + }, + { + "epoch": 2.6015831134564644, + "grad_norm": 0.3077477488023142, + "learning_rate": 0.0001375603642911113, + "loss": 3.069695472717285, + "step": 4438, + "token_acc": 0.2913389516765209 + }, + { + "epoch": 2.6021694517736735, + "grad_norm": 0.32640741989334876, + "learning_rate": 0.0001375585884504195, + "loss": 3.0521297454833984, + "step": 4439, + "token_acc": 0.2927884900088313 + }, + { + "epoch": 2.6027557900908826, + "grad_norm": 0.2758617142478373, + "learning_rate": 0.00013755681197510583, + "loss": 3.0593180656433105, + "step": 4440, + "token_acc": 0.29155545291966295 + }, + { + "epoch": 2.6033421284080913, + "grad_norm": 0.3163299292395558, + "learning_rate": 0.00013755503486518692, + "loss": 3.076005458831787, + "step": 4441, + "token_acc": 0.29140460661224815 + }, + { + "epoch": 2.6039284667253004, + "grad_norm": 0.3315894656420227, + "learning_rate": 0.00013755325712067951, + "loss": 3.105449914932251, + "step": 4442, + "token_acc": 0.2865914345649145 + }, + { + "epoch": 2.6045148050425095, + "grad_norm": 0.28247757609454416, + "learning_rate": 0.00013755147874160026, + "loss": 3.0808606147766113, + "step": 4443, + "token_acc": 0.28992357850067063 + }, + { + "epoch": 2.6051011433597187, + "grad_norm": 0.33773511037182347, + "learning_rate": 0.0001375496997279659, + "loss": 3.0509166717529297, + "step": 4444, + "token_acc": 0.29406827834741944 + }, + { + "epoch": 2.6056874816769273, + "grad_norm": 0.30187671189376414, + "learning_rate": 0.00013754792007979313, + "loss": 3.1000943183898926, + "step": 4445, + "token_acc": 0.28549702458230225 + }, + { + "epoch": 2.6062738199941364, + "grad_norm": 0.30011680398563206, + "learning_rate": 0.00013754613979709868, + "loss": 3.0710039138793945, + "step": 4446, + "token_acc": 0.28889464549387356 + }, + { + "epoch": 2.6068601583113455, + "grad_norm": 0.307711419659515, + "learning_rate": 0.00013754435887989926, + "loss": 3.0676560401916504, + "step": 4447, + "token_acc": 0.29186240537301117 + }, + { + "epoch": 2.6074464966285547, + "grad_norm": 0.2868450376223833, + "learning_rate": 0.00013754257732821162, + "loss": 3.095599889755249, + "step": 4448, + "token_acc": 0.2883357052561279 + }, + { + "epoch": 2.6080328349457638, + "grad_norm": 0.32378027935979525, + "learning_rate": 0.00013754079514205248, + "loss": 3.037041187286377, + "step": 4449, + "token_acc": 0.29625375439467905 + }, + { + "epoch": 2.608619173262973, + "grad_norm": 0.33579876256775354, + "learning_rate": 0.00013753901232143857, + "loss": 3.047914981842041, + "step": 4450, + "token_acc": 0.29306351620072524 + }, + { + "epoch": 2.609205511580182, + "grad_norm": 0.324412759674356, + "learning_rate": 0.00013753722886638664, + "loss": 3.067213535308838, + "step": 4451, + "token_acc": 0.29269742359394124 + }, + { + "epoch": 2.6097918498973907, + "grad_norm": 0.3606715210221491, + "learning_rate": 0.00013753544477691348, + "loss": 3.0669684410095215, + "step": 4452, + "token_acc": 0.2927171373909994 + }, + { + "epoch": 2.6103781882145998, + "grad_norm": 0.33450755618091776, + "learning_rate": 0.00013753366005303581, + "loss": 3.0537023544311523, + "step": 4453, + "token_acc": 0.2930190654106549 + }, + { + "epoch": 2.610964526531809, + "grad_norm": 0.3109836402167432, + "learning_rate": 0.0001375318746947704, + "loss": 3.071597099304199, + "step": 4454, + "token_acc": 0.29061296115337587 + }, + { + "epoch": 2.611550864849018, + "grad_norm": 0.37296756072227977, + "learning_rate": 0.00013753008870213404, + "loss": 3.047229766845703, + "step": 4455, + "token_acc": 0.29453239254590785 + }, + { + "epoch": 2.6121372031662267, + "grad_norm": 0.33034906505470774, + "learning_rate": 0.00013752830207514348, + "loss": 3.0468077659606934, + "step": 4456, + "token_acc": 0.2935726126423417 + }, + { + "epoch": 2.6127235414834358, + "grad_norm": 0.39796619993887566, + "learning_rate": 0.00013752651481381553, + "loss": 3.0859241485595703, + "step": 4457, + "token_acc": 0.28787343161360446 + }, + { + "epoch": 2.613309879800645, + "grad_norm": 0.35352470214403897, + "learning_rate": 0.00013752472691816694, + "loss": 3.0704493522644043, + "step": 4458, + "token_acc": 0.2898268877820356 + }, + { + "epoch": 2.613896218117854, + "grad_norm": 0.35075359172371434, + "learning_rate": 0.00013752293838821457, + "loss": 3.0824742317199707, + "step": 4459, + "token_acc": 0.28955281945598116 + }, + { + "epoch": 2.614482556435063, + "grad_norm": 0.42655089327280327, + "learning_rate": 0.00013752114922397515, + "loss": 3.1209373474121094, + "step": 4460, + "token_acc": 0.2851773182565961 + }, + { + "epoch": 2.615068894752272, + "grad_norm": 0.32057052336688263, + "learning_rate": 0.00013751935942546556, + "loss": 3.0794143676757812, + "step": 4461, + "token_acc": 0.2891218949196332 + }, + { + "epoch": 2.6156552330694813, + "grad_norm": 0.3858109294160279, + "learning_rate": 0.00013751756899270251, + "loss": 3.050518035888672, + "step": 4462, + "token_acc": 0.29322694502464486 + }, + { + "epoch": 2.61624157138669, + "grad_norm": 0.272830021710739, + "learning_rate": 0.00013751577792570294, + "loss": 3.029107093811035, + "step": 4463, + "token_acc": 0.29660354163178926 + }, + { + "epoch": 2.616827909703899, + "grad_norm": 0.3473448606413895, + "learning_rate": 0.00013751398622448359, + "loss": 3.011425495147705, + "step": 4464, + "token_acc": 0.29936184826724943 + }, + { + "epoch": 2.6174142480211082, + "grad_norm": 0.31838458676233844, + "learning_rate": 0.0001375121938890613, + "loss": 3.0856447219848633, + "step": 4465, + "token_acc": 0.28835524913298705 + }, + { + "epoch": 2.6180005863383173, + "grad_norm": 0.3380809321529088, + "learning_rate": 0.00013751040091945294, + "loss": 3.077059745788574, + "step": 4466, + "token_acc": 0.2920450274522378 + }, + { + "epoch": 2.618586924655526, + "grad_norm": 0.3051797385265674, + "learning_rate": 0.00013750860731567534, + "loss": 3.0890941619873047, + "step": 4467, + "token_acc": 0.2890020382503846 + }, + { + "epoch": 2.619173262972735, + "grad_norm": 0.306081934016556, + "learning_rate": 0.00013750681307774532, + "loss": 3.0404391288757324, + "step": 4468, + "token_acc": 0.29479730043257013 + }, + { + "epoch": 2.6197596012899442, + "grad_norm": 0.316741423511763, + "learning_rate": 0.00013750501820567978, + "loss": 3.0462069511413574, + "step": 4469, + "token_acc": 0.2945326594129943 + }, + { + "epoch": 2.6203459396071533, + "grad_norm": 0.33421666301891484, + "learning_rate": 0.00013750322269949556, + "loss": 3.0583081245422363, + "step": 4470, + "token_acc": 0.29159750383459754 + }, + { + "epoch": 2.6209322779243625, + "grad_norm": 0.3160489514279233, + "learning_rate": 0.00013750142655920953, + "loss": 3.039483070373535, + "step": 4471, + "token_acc": 0.2962294702752718 + }, + { + "epoch": 2.6215186162415716, + "grad_norm": 0.2779608535115818, + "learning_rate": 0.00013749962978483853, + "loss": 3.030332565307617, + "step": 4472, + "token_acc": 0.29782767121197373 + }, + { + "epoch": 2.6221049545587807, + "grad_norm": 0.29113334568278476, + "learning_rate": 0.00013749783237639948, + "loss": 3.060206413269043, + "step": 4473, + "token_acc": 0.29365464274072367 + }, + { + "epoch": 2.6226912928759893, + "grad_norm": 0.3057461016192328, + "learning_rate": 0.00013749603433390925, + "loss": 3.1124448776245117, + "step": 4474, + "token_acc": 0.28595810307158465 + }, + { + "epoch": 2.6232776311931985, + "grad_norm": 0.26337725656864613, + "learning_rate": 0.00013749423565738471, + "loss": 3.0445902347564697, + "step": 4475, + "token_acc": 0.2937385512517298 + }, + { + "epoch": 2.6238639695104076, + "grad_norm": 0.29607821166623827, + "learning_rate": 0.0001374924363468428, + "loss": 3.070188045501709, + "step": 4476, + "token_acc": 0.2901865209510495 + }, + { + "epoch": 2.6244503078276167, + "grad_norm": 0.28914789226147614, + "learning_rate": 0.0001374906364023004, + "loss": 3.081263780593872, + "step": 4477, + "token_acc": 0.28879240822952995 + }, + { + "epoch": 2.6250366461448253, + "grad_norm": 0.2727191568924578, + "learning_rate": 0.0001374888358237744, + "loss": 3.0323848724365234, + "step": 4478, + "token_acc": 0.2962986572560699 + }, + { + "epoch": 2.6256229844620345, + "grad_norm": 0.28170033082298745, + "learning_rate": 0.00013748703461128174, + "loss": 3.0561113357543945, + "step": 4479, + "token_acc": 0.2924859420051294 + }, + { + "epoch": 2.6262093227792436, + "grad_norm": 0.2577371445828078, + "learning_rate": 0.00013748523276483932, + "loss": 3.0492372512817383, + "step": 4480, + "token_acc": 0.2936592405519978 + }, + { + "epoch": 2.6267956610964527, + "grad_norm": 0.2572849648200553, + "learning_rate": 0.00013748343028446408, + "loss": 3.0889220237731934, + "step": 4481, + "token_acc": 0.2887773722627737 + }, + { + "epoch": 2.627381999413662, + "grad_norm": 0.29117434856664287, + "learning_rate": 0.00013748162717017293, + "loss": 3.0544140338897705, + "step": 4482, + "token_acc": 0.2933192810191257 + }, + { + "epoch": 2.627968337730871, + "grad_norm": 0.30358482516992324, + "learning_rate": 0.00013747982342198284, + "loss": 3.0444793701171875, + "step": 4483, + "token_acc": 0.2931739951069783 + }, + { + "epoch": 2.62855467604808, + "grad_norm": 0.300915252790484, + "learning_rate": 0.00013747801903991075, + "loss": 3.068734645843506, + "step": 4484, + "token_acc": 0.2921289370896701 + }, + { + "epoch": 2.6291410143652887, + "grad_norm": 0.307208419764696, + "learning_rate": 0.0001374762140239736, + "loss": 3.096848487854004, + "step": 4485, + "token_acc": 0.28724592818851796 + }, + { + "epoch": 2.629727352682498, + "grad_norm": 0.3692342632576996, + "learning_rate": 0.00013747440837418834, + "loss": 3.0606870651245117, + "step": 4486, + "token_acc": 0.29231684567699157 + }, + { + "epoch": 2.630313690999707, + "grad_norm": 0.3472495600363577, + "learning_rate": 0.00013747260209057193, + "loss": 3.0597872734069824, + "step": 4487, + "token_acc": 0.2942744754076003 + }, + { + "epoch": 2.630900029316916, + "grad_norm": 0.2932074819906137, + "learning_rate": 0.00013747079517314133, + "loss": 3.0616931915283203, + "step": 4488, + "token_acc": 0.29272130158225546 + }, + { + "epoch": 2.6314863676341247, + "grad_norm": 0.4343772973672611, + "learning_rate": 0.00013746898762191355, + "loss": 3.0955214500427246, + "step": 4489, + "token_acc": 0.2870414856144803 + }, + { + "epoch": 2.632072705951334, + "grad_norm": 0.3402355524762752, + "learning_rate": 0.00013746717943690555, + "loss": 3.0270819664001465, + "step": 4490, + "token_acc": 0.29558606573718416 + }, + { + "epoch": 2.632659044268543, + "grad_norm": 0.3213517946158592, + "learning_rate": 0.0001374653706181343, + "loss": 3.103055953979492, + "step": 4491, + "token_acc": 0.2873525911500595 + }, + { + "epoch": 2.633245382585752, + "grad_norm": 0.3312927647702127, + "learning_rate": 0.00013746356116561682, + "loss": 3.144172430038452, + "step": 4492, + "token_acc": 0.28198694389205287 + }, + { + "epoch": 2.633831720902961, + "grad_norm": 0.34072900136920803, + "learning_rate": 0.00013746175107937005, + "loss": 3.0486061573028564, + "step": 4493, + "token_acc": 0.29520381059940604 + }, + { + "epoch": 2.6344180592201702, + "grad_norm": 0.29328336667025795, + "learning_rate": 0.0001374599403594111, + "loss": 3.0564751625061035, + "step": 4494, + "token_acc": 0.29197452744160435 + }, + { + "epoch": 2.635004397537379, + "grad_norm": 0.32864576559021946, + "learning_rate": 0.00013745812900575687, + "loss": 3.080198287963867, + "step": 4495, + "token_acc": 0.2895390402983495 + }, + { + "epoch": 2.635590735854588, + "grad_norm": 0.2732465883518336, + "learning_rate": 0.00013745631701842442, + "loss": 3.0655946731567383, + "step": 4496, + "token_acc": 0.2917062689653501 + }, + { + "epoch": 2.636177074171797, + "grad_norm": 0.34501286930516945, + "learning_rate": 0.00013745450439743077, + "loss": 3.0551295280456543, + "step": 4497, + "token_acc": 0.29316107781197687 + }, + { + "epoch": 2.6367634124890063, + "grad_norm": 0.2881453255632399, + "learning_rate": 0.00013745269114279294, + "loss": 3.056288242340088, + "step": 4498, + "token_acc": 0.29484974408747083 + }, + { + "epoch": 2.637349750806215, + "grad_norm": 0.30700088102205325, + "learning_rate": 0.00013745087725452798, + "loss": 3.066913604736328, + "step": 4499, + "token_acc": 0.2908676264492292 + }, + { + "epoch": 2.637936089123424, + "grad_norm": 0.2692430028490861, + "learning_rate": 0.00013744906273265294, + "loss": 3.0516200065612793, + "step": 4500, + "token_acc": 0.2940430129467871 + }, + { + "epoch": 2.638522427440633, + "grad_norm": 0.3371302974384048, + "learning_rate": 0.00013744724757718484, + "loss": 3.1064674854278564, + "step": 4501, + "token_acc": 0.28612330648853934 + }, + { + "epoch": 2.6391087657578423, + "grad_norm": 0.2705574252096661, + "learning_rate": 0.0001374454317881407, + "loss": 3.0408406257629395, + "step": 4502, + "token_acc": 0.2951778955544638 + }, + { + "epoch": 2.6396951040750514, + "grad_norm": 0.33431306198481364, + "learning_rate": 0.00013744361536553764, + "loss": 3.070269823074341, + "step": 4503, + "token_acc": 0.2926407530608605 + }, + { + "epoch": 2.6402814423922605, + "grad_norm": 0.2829075479525803, + "learning_rate": 0.00013744179830939269, + "loss": 3.0330653190612793, + "step": 4504, + "token_acc": 0.2955782740245192 + }, + { + "epoch": 2.6408677807094696, + "grad_norm": 0.2997351415327728, + "learning_rate": 0.00013743998061972293, + "loss": 3.0734310150146484, + "step": 4505, + "token_acc": 0.2900255995971211 + }, + { + "epoch": 2.6414541190266783, + "grad_norm": 0.27911715752868643, + "learning_rate": 0.0001374381622965454, + "loss": 3.0682992935180664, + "step": 4506, + "token_acc": 0.2897983102468502 + }, + { + "epoch": 2.6420404573438874, + "grad_norm": 0.2989643405773793, + "learning_rate": 0.00013743634333987726, + "loss": 3.0760579109191895, + "step": 4507, + "token_acc": 0.28815674371585664 + }, + { + "epoch": 2.6426267956610965, + "grad_norm": 0.27972314388874864, + "learning_rate": 0.0001374345237497355, + "loss": 3.0545825958251953, + "step": 4508, + "token_acc": 0.2933210485145344 + }, + { + "epoch": 2.6432131339783056, + "grad_norm": 0.319858181247578, + "learning_rate": 0.00013743270352613727, + "loss": 3.088914632797241, + "step": 4509, + "token_acc": 0.28934416389900364 + }, + { + "epoch": 2.6437994722955143, + "grad_norm": 0.3012741651006681, + "learning_rate": 0.00013743088266909967, + "loss": 3.0659196376800537, + "step": 4510, + "token_acc": 0.29071482844065205 + }, + { + "epoch": 2.6443858106127234, + "grad_norm": 0.33819822899657687, + "learning_rate": 0.0001374290611786398, + "loss": 3.09185791015625, + "step": 4511, + "token_acc": 0.2878797493254682 + }, + { + "epoch": 2.6449721489299325, + "grad_norm": 0.3544651689972899, + "learning_rate": 0.00013742723905477472, + "loss": 3.0865135192871094, + "step": 4512, + "token_acc": 0.2891197809369042 + }, + { + "epoch": 2.6455584872471416, + "grad_norm": 0.3543055835806596, + "learning_rate": 0.00013742541629752162, + "loss": 3.0942201614379883, + "step": 4513, + "token_acc": 0.2874524504238238 + }, + { + "epoch": 2.6461448255643507, + "grad_norm": 0.37593491510183574, + "learning_rate": 0.00013742359290689759, + "loss": 3.073152542114258, + "step": 4514, + "token_acc": 0.2879362786205141 + }, + { + "epoch": 2.64673116388156, + "grad_norm": 0.2948952297093295, + "learning_rate": 0.00013742176888291975, + "loss": 3.0756428241729736, + "step": 4515, + "token_acc": 0.2893849421583674 + }, + { + "epoch": 2.647317502198769, + "grad_norm": 0.37272138290929685, + "learning_rate": 0.00013741994422560524, + "loss": 3.086709499359131, + "step": 4516, + "token_acc": 0.2891275546309253 + }, + { + "epoch": 2.6479038405159776, + "grad_norm": 0.3702762574763534, + "learning_rate": 0.0001374181189349712, + "loss": 3.0641942024230957, + "step": 4517, + "token_acc": 0.2911838899201857 + }, + { + "epoch": 2.6484901788331867, + "grad_norm": 0.3714521513994398, + "learning_rate": 0.0001374162930110348, + "loss": 3.080141067504883, + "step": 4518, + "token_acc": 0.2902737611057273 + }, + { + "epoch": 2.649076517150396, + "grad_norm": 0.3661789248513293, + "learning_rate": 0.00013741446645381317, + "loss": 3.0809543132781982, + "step": 4519, + "token_acc": 0.28962052248926295 + }, + { + "epoch": 2.649662855467605, + "grad_norm": 0.315240988394481, + "learning_rate": 0.00013741263926332346, + "loss": 3.0562877655029297, + "step": 4520, + "token_acc": 0.29198376329773723 + }, + { + "epoch": 2.6502491937848136, + "grad_norm": 0.3712949001031072, + "learning_rate": 0.00013741081143958284, + "loss": 3.0590438842773438, + "step": 4521, + "token_acc": 0.2934216090427257 + }, + { + "epoch": 2.6508355321020227, + "grad_norm": 0.3658295166517285, + "learning_rate": 0.0001374089829826085, + "loss": 3.073647975921631, + "step": 4522, + "token_acc": 0.2888377357336229 + }, + { + "epoch": 2.651421870419232, + "grad_norm": 0.3197818920745739, + "learning_rate": 0.0001374071538924176, + "loss": 3.09137225151062, + "step": 4523, + "token_acc": 0.28804149775949295 + }, + { + "epoch": 2.652008208736441, + "grad_norm": 0.3004364801907915, + "learning_rate": 0.0001374053241690273, + "loss": 3.04290771484375, + "step": 4524, + "token_acc": 0.29553432606151925 + }, + { + "epoch": 2.65259454705365, + "grad_norm": 0.31207437644952923, + "learning_rate": 0.00013740349381245485, + "loss": 3.0680551528930664, + "step": 4525, + "token_acc": 0.29005338737218916 + }, + { + "epoch": 2.653180885370859, + "grad_norm": 0.3246646728601106, + "learning_rate": 0.00013740166282271733, + "loss": 3.035292863845825, + "step": 4526, + "token_acc": 0.2955817630218348 + }, + { + "epoch": 2.6537672236880683, + "grad_norm": 0.3527508068069239, + "learning_rate": 0.00013739983119983207, + "loss": 3.0933501720428467, + "step": 4527, + "token_acc": 0.28773853013932216 + }, + { + "epoch": 2.654353562005277, + "grad_norm": 0.3478749265821684, + "learning_rate": 0.00013739799894381622, + "loss": 3.1205596923828125, + "step": 4528, + "token_acc": 0.2843326681100424 + }, + { + "epoch": 2.654939900322486, + "grad_norm": 0.2979163614673156, + "learning_rate": 0.00013739616605468698, + "loss": 3.0574023723602295, + "step": 4529, + "token_acc": 0.29246849935464564 + }, + { + "epoch": 2.655526238639695, + "grad_norm": 0.3082733700046681, + "learning_rate": 0.00013739433253246155, + "loss": 3.064596176147461, + "step": 4530, + "token_acc": 0.2941308830809224 + }, + { + "epoch": 2.6561125769569043, + "grad_norm": 0.29164374333019905, + "learning_rate": 0.0001373924983771572, + "loss": 3.057131052017212, + "step": 4531, + "token_acc": 0.2916910708296414 + }, + { + "epoch": 2.656698915274113, + "grad_norm": 0.2884205576629347, + "learning_rate": 0.00013739066358879113, + "loss": 3.112107276916504, + "step": 4532, + "token_acc": 0.2869369888570502 + }, + { + "epoch": 2.657285253591322, + "grad_norm": 0.26717993918944666, + "learning_rate": 0.0001373888281673806, + "loss": 3.0781047344207764, + "step": 4533, + "token_acc": 0.2894382033970287 + }, + { + "epoch": 2.657871591908531, + "grad_norm": 0.29665961866738705, + "learning_rate": 0.00013738699211294285, + "loss": 3.070359230041504, + "step": 4534, + "token_acc": 0.29100672568253716 + }, + { + "epoch": 2.6584579302257403, + "grad_norm": 0.2755225384393486, + "learning_rate": 0.0001373851554254951, + "loss": 3.1055407524108887, + "step": 4535, + "token_acc": 0.2872393670266011 + }, + { + "epoch": 2.6590442685429494, + "grad_norm": 0.37738379992075227, + "learning_rate": 0.00013738331810505462, + "loss": 3.031566858291626, + "step": 4536, + "token_acc": 0.2966010752319606 + }, + { + "epoch": 2.6596306068601585, + "grad_norm": 0.37504910233581834, + "learning_rate": 0.00013738148015163867, + "loss": 3.0826618671417236, + "step": 4537, + "token_acc": 0.28885332598253166 + }, + { + "epoch": 2.660216945177367, + "grad_norm": 0.2771024387116616, + "learning_rate": 0.0001373796415652645, + "loss": 3.063094139099121, + "step": 4538, + "token_acc": 0.29270691661828296 + }, + { + "epoch": 2.6608032834945763, + "grad_norm": 0.35361566203100014, + "learning_rate": 0.0001373778023459494, + "loss": 3.1228065490722656, + "step": 4539, + "token_acc": 0.2840617129903761 + }, + { + "epoch": 2.6613896218117854, + "grad_norm": 0.3326145793451327, + "learning_rate": 0.00013737596249371065, + "loss": 3.110495090484619, + "step": 4540, + "token_acc": 0.2858684887294944 + }, + { + "epoch": 2.6619759601289945, + "grad_norm": 0.28741494314364296, + "learning_rate": 0.0001373741220085655, + "loss": 3.070624351501465, + "step": 4541, + "token_acc": 0.2913099903302967 + }, + { + "epoch": 2.6625622984462036, + "grad_norm": 0.3109902208545587, + "learning_rate": 0.00013737228089053127, + "loss": 3.058377742767334, + "step": 4542, + "token_acc": 0.2934938262682716 + }, + { + "epoch": 2.6631486367634123, + "grad_norm": 0.27994703002618154, + "learning_rate": 0.00013737043913962524, + "loss": 3.0249228477478027, + "step": 4543, + "token_acc": 0.2974201448703846 + }, + { + "epoch": 2.6637349750806214, + "grad_norm": 0.30004857084991077, + "learning_rate": 0.0001373685967558647, + "loss": 3.077030658721924, + "step": 4544, + "token_acc": 0.28988123986080666 + }, + { + "epoch": 2.6643213133978305, + "grad_norm": 0.2900047286148718, + "learning_rate": 0.00013736675373926703, + "loss": 3.0895135402679443, + "step": 4545, + "token_acc": 0.2880103228552805 + }, + { + "epoch": 2.6649076517150396, + "grad_norm": 0.3282769241765511, + "learning_rate": 0.00013736491008984944, + "loss": 3.0604395866394043, + "step": 4546, + "token_acc": 0.2948131349830582 + }, + { + "epoch": 2.6654939900322487, + "grad_norm": 0.28861671693521695, + "learning_rate": 0.00013736306580762933, + "loss": 3.0964484214782715, + "step": 4547, + "token_acc": 0.28817332454774797 + }, + { + "epoch": 2.666080328349458, + "grad_norm": 0.3044410405818131, + "learning_rate": 0.00013736122089262395, + "loss": 3.0955069065093994, + "step": 4548, + "token_acc": 0.2877980648137111 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.3083182021792271, + "learning_rate": 0.0001373593753448507, + "loss": 3.0277023315429688, + "step": 4549, + "token_acc": 0.29706689708586453 + }, + { + "epoch": 2.6672530049838756, + "grad_norm": 0.28363840179118355, + "learning_rate": 0.00013735752916432687, + "loss": 3.0810670852661133, + "step": 4550, + "token_acc": 0.288914767801457 + }, + { + "epoch": 2.6678393433010847, + "grad_norm": 0.3000481584331715, + "learning_rate": 0.00013735568235106983, + "loss": 3.072765827178955, + "step": 4551, + "token_acc": 0.2904116373292308 + }, + { + "epoch": 2.668425681618294, + "grad_norm": 0.30055561835411904, + "learning_rate": 0.0001373538349050969, + "loss": 3.0768237113952637, + "step": 4552, + "token_acc": 0.289230257885186 + }, + { + "epoch": 2.6690120199355025, + "grad_norm": 0.2954632151915868, + "learning_rate": 0.00013735198682642547, + "loss": 3.0627124309539795, + "step": 4553, + "token_acc": 0.2910717054159962 + }, + { + "epoch": 2.6695983582527116, + "grad_norm": 0.2968581250922359, + "learning_rate": 0.00013735013811507288, + "loss": 3.0866482257843018, + "step": 4554, + "token_acc": 0.2888657711960346 + }, + { + "epoch": 2.6701846965699207, + "grad_norm": 0.2858234387371888, + "learning_rate": 0.0001373482887710565, + "loss": 3.062004804611206, + "step": 4555, + "token_acc": 0.2901282701571896 + }, + { + "epoch": 2.67077103488713, + "grad_norm": 0.31495469774461377, + "learning_rate": 0.0001373464387943937, + "loss": 3.0563271045684814, + "step": 4556, + "token_acc": 0.2919139894840176 + }, + { + "epoch": 2.671357373204339, + "grad_norm": 0.28392186348033, + "learning_rate": 0.00013734458818510185, + "loss": 3.1017322540283203, + "step": 4557, + "token_acc": 0.28585819871240414 + }, + { + "epoch": 2.671943711521548, + "grad_norm": 0.3006992878182451, + "learning_rate": 0.00013734273694319835, + "loss": 3.052558183670044, + "step": 4558, + "token_acc": 0.2944821777664449 + }, + { + "epoch": 2.672530049838757, + "grad_norm": 0.28622296687827337, + "learning_rate": 0.00013734088506870055, + "loss": 3.093113899230957, + "step": 4559, + "token_acc": 0.2875392275935338 + }, + { + "epoch": 2.673116388155966, + "grad_norm": 0.2880224484646094, + "learning_rate": 0.0001373390325616259, + "loss": 3.05501651763916, + "step": 4560, + "token_acc": 0.2918912053869371 + }, + { + "epoch": 2.673702726473175, + "grad_norm": 0.38004753707376665, + "learning_rate": 0.0001373371794219918, + "loss": 3.091177225112915, + "step": 4561, + "token_acc": 0.28923965172044513 + }, + { + "epoch": 2.674289064790384, + "grad_norm": 0.44895660179823377, + "learning_rate": 0.0001373353256498156, + "loss": 3.10707426071167, + "step": 4562, + "token_acc": 0.2853962338695469 + }, + { + "epoch": 2.674875403107593, + "grad_norm": 0.34871977897799034, + "learning_rate": 0.0001373334712451148, + "loss": 3.1100454330444336, + "step": 4563, + "token_acc": 0.2850130411700892 + }, + { + "epoch": 2.675461741424802, + "grad_norm": 0.3378394758294805, + "learning_rate": 0.00013733161620790673, + "loss": 3.0971598625183105, + "step": 4564, + "token_acc": 0.28840658264053765 + }, + { + "epoch": 2.676048079742011, + "grad_norm": 0.3507666743998226, + "learning_rate": 0.00013732976053820885, + "loss": 3.1092798709869385, + "step": 4565, + "token_acc": 0.28592710168007845 + }, + { + "epoch": 2.67663441805922, + "grad_norm": 0.31230355473321864, + "learning_rate": 0.00013732790423603863, + "loss": 3.0767576694488525, + "step": 4566, + "token_acc": 0.2891803099908666 + }, + { + "epoch": 2.677220756376429, + "grad_norm": 0.3027973413143864, + "learning_rate": 0.00013732604730141347, + "loss": 3.068896770477295, + "step": 4567, + "token_acc": 0.29142429906542056 + }, + { + "epoch": 2.6778070946936383, + "grad_norm": 0.3346302994019165, + "learning_rate": 0.0001373241897343508, + "loss": 3.12660551071167, + "step": 4568, + "token_acc": 0.2820050167136875 + }, + { + "epoch": 2.6783934330108474, + "grad_norm": 0.2685555032115937, + "learning_rate": 0.00013732233153486808, + "loss": 3.0713300704956055, + "step": 4569, + "token_acc": 0.29165210753508974 + }, + { + "epoch": 2.6789797713280565, + "grad_norm": 0.291504687941692, + "learning_rate": 0.00013732047270298282, + "loss": 3.0642271041870117, + "step": 4570, + "token_acc": 0.2908073611502897 + }, + { + "epoch": 2.679566109645265, + "grad_norm": 0.32981816400438313, + "learning_rate": 0.00013731861323871238, + "loss": 3.0596375465393066, + "step": 4571, + "token_acc": 0.2913828373242508 + }, + { + "epoch": 2.6801524479624743, + "grad_norm": 0.3597449519855727, + "learning_rate": 0.00013731675314207432, + "loss": 3.0819168090820312, + "step": 4572, + "token_acc": 0.28928950048578717 + }, + { + "epoch": 2.6807387862796834, + "grad_norm": 0.3329706862045801, + "learning_rate": 0.00013731489241308606, + "loss": 3.0376362800598145, + "step": 4573, + "token_acc": 0.2963131666422649 + }, + { + "epoch": 2.6813251245968925, + "grad_norm": 0.3345787609661259, + "learning_rate": 0.0001373130310517651, + "loss": 3.025294780731201, + "step": 4574, + "token_acc": 0.29589207862903766 + }, + { + "epoch": 2.681911462914101, + "grad_norm": 0.2889130824878145, + "learning_rate": 0.0001373111690581289, + "loss": 3.023744583129883, + "step": 4575, + "token_acc": 0.2972737819025522 + }, + { + "epoch": 2.6824978012313103, + "grad_norm": 0.30946782320387745, + "learning_rate": 0.00013730930643219498, + "loss": 3.0749611854553223, + "step": 4576, + "token_acc": 0.29202005127218456 + }, + { + "epoch": 2.6830841395485194, + "grad_norm": 0.33347309829399296, + "learning_rate": 0.00013730744317398086, + "loss": 3.071925640106201, + "step": 4577, + "token_acc": 0.2905604532243669 + }, + { + "epoch": 2.6836704778657285, + "grad_norm": 0.2725724513314644, + "learning_rate": 0.00013730557928350395, + "loss": 3.040964365005493, + "step": 4578, + "token_acc": 0.29513947901790133 + }, + { + "epoch": 2.6842568161829377, + "grad_norm": 0.324208926488268, + "learning_rate": 0.00013730371476078186, + "loss": 3.0867202281951904, + "step": 4579, + "token_acc": 0.28794161422565734 + }, + { + "epoch": 2.6848431545001468, + "grad_norm": 0.3132618742465307, + "learning_rate": 0.00013730184960583205, + "loss": 3.095482349395752, + "step": 4580, + "token_acc": 0.2862830323543633 + }, + { + "epoch": 2.685429492817356, + "grad_norm": 0.2879602181704379, + "learning_rate": 0.00013729998381867205, + "loss": 3.0938520431518555, + "step": 4581, + "token_acc": 0.28762722736441315 + }, + { + "epoch": 2.6860158311345645, + "grad_norm": 0.40888468437613634, + "learning_rate": 0.0001372981173993194, + "loss": 3.143092632293701, + "step": 4582, + "token_acc": 0.28189501538429795 + }, + { + "epoch": 2.6866021694517737, + "grad_norm": 0.37559990334447524, + "learning_rate": 0.00013729625034779162, + "loss": 3.072305679321289, + "step": 4583, + "token_acc": 0.2902383439369481 + }, + { + "epoch": 2.6871885077689828, + "grad_norm": 0.3140639331353125, + "learning_rate": 0.00013729438266410623, + "loss": 3.0520448684692383, + "step": 4584, + "token_acc": 0.29372024192510615 + }, + { + "epoch": 2.687774846086192, + "grad_norm": 0.42503434050472433, + "learning_rate": 0.00013729251434828083, + "loss": 3.0619285106658936, + "step": 4585, + "token_acc": 0.2918940513762934 + }, + { + "epoch": 2.6883611844034006, + "grad_norm": 0.33292024848165375, + "learning_rate": 0.0001372906454003329, + "loss": 3.0980563163757324, + "step": 4586, + "token_acc": 0.287998634571997 + }, + { + "epoch": 2.6889475227206097, + "grad_norm": 0.3281519261921524, + "learning_rate": 0.00013728877582028004, + "loss": 3.085754871368408, + "step": 4587, + "token_acc": 0.2880705214006192 + }, + { + "epoch": 2.6895338610378188, + "grad_norm": 0.25373747842140015, + "learning_rate": 0.00013728690560813983, + "loss": 3.0732369422912598, + "step": 4588, + "token_acc": 0.2910603827632016 + }, + { + "epoch": 2.690120199355028, + "grad_norm": 0.35994000399555687, + "learning_rate": 0.0001372850347639298, + "loss": 3.037281036376953, + "step": 4589, + "token_acc": 0.29614052893129633 + }, + { + "epoch": 2.690706537672237, + "grad_norm": 0.27046230531460835, + "learning_rate": 0.00013728316328766752, + "loss": 3.0872697830200195, + "step": 4590, + "token_acc": 0.288361827264557 + }, + { + "epoch": 2.691292875989446, + "grad_norm": 0.360549886814419, + "learning_rate": 0.0001372812911793706, + "loss": 3.051240921020508, + "step": 4591, + "token_acc": 0.29471915668046483 + }, + { + "epoch": 2.6918792143066548, + "grad_norm": 0.3013384453048884, + "learning_rate": 0.00013727941843905662, + "loss": 3.0594334602355957, + "step": 4592, + "token_acc": 0.293474875282752 + }, + { + "epoch": 2.692465552623864, + "grad_norm": 0.31123662333392366, + "learning_rate": 0.00013727754506674314, + "loss": 3.083174705505371, + "step": 4593, + "token_acc": 0.2888921773054416 + }, + { + "epoch": 2.693051890941073, + "grad_norm": 0.3450814393652759, + "learning_rate": 0.0001372756710624478, + "loss": 3.011199951171875, + "step": 4594, + "token_acc": 0.29820594387086496 + }, + { + "epoch": 2.693638229258282, + "grad_norm": 0.2991844292590364, + "learning_rate": 0.0001372737964261882, + "loss": 3.0423927307128906, + "step": 4595, + "token_acc": 0.29420687478684904 + }, + { + "epoch": 2.6942245675754912, + "grad_norm": 0.3083194728818996, + "learning_rate": 0.00013727192115798188, + "loss": 3.1141276359558105, + "step": 4596, + "token_acc": 0.28490790104076413 + }, + { + "epoch": 2.6948109058927, + "grad_norm": 0.3340472489379645, + "learning_rate": 0.00013727004525784656, + "loss": 3.0891127586364746, + "step": 4597, + "token_acc": 0.28709002820591056 + }, + { + "epoch": 2.695397244209909, + "grad_norm": 0.282345460141468, + "learning_rate": 0.00013726816872579979, + "loss": 3.080362558364868, + "step": 4598, + "token_acc": 0.2896671327167133 + }, + { + "epoch": 2.695983582527118, + "grad_norm": 0.34572595262021083, + "learning_rate": 0.00013726629156185922, + "loss": 3.083634853363037, + "step": 4599, + "token_acc": 0.28858676569166763 + }, + { + "epoch": 2.6965699208443272, + "grad_norm": 0.33409714084629816, + "learning_rate": 0.0001372644137660425, + "loss": 3.03096342086792, + "step": 4600, + "token_acc": 0.29632946098863655 + }, + { + "epoch": 2.6971562591615363, + "grad_norm": 0.284284990655958, + "learning_rate": 0.00013726253533836725, + "loss": 3.0745086669921875, + "step": 4601, + "token_acc": 0.2888514532837172 + }, + { + "epoch": 2.6977425974787455, + "grad_norm": 0.3545023195050065, + "learning_rate": 0.0001372606562788511, + "loss": 3.093515634536743, + "step": 4602, + "token_acc": 0.28666612227399624 + }, + { + "epoch": 2.698328935795954, + "grad_norm": 0.359696574238944, + "learning_rate": 0.00013725877658751174, + "loss": 3.115234375, + "step": 4603, + "token_acc": 0.28445914959438484 + }, + { + "epoch": 2.6989152741131632, + "grad_norm": 0.31530924166126517, + "learning_rate": 0.0001372568962643668, + "loss": 3.046837568283081, + "step": 4604, + "token_acc": 0.29512913831414156 + }, + { + "epoch": 2.6995016124303723, + "grad_norm": 0.3018615579284375, + "learning_rate": 0.00013725501530943395, + "loss": 3.002796173095703, + "step": 4605, + "token_acc": 0.3021430059368816 + }, + { + "epoch": 2.7000879507475815, + "grad_norm": 0.3133603057599074, + "learning_rate": 0.00013725313372273085, + "loss": 3.0402657985687256, + "step": 4606, + "token_acc": 0.2940854750108718 + }, + { + "epoch": 2.70067428906479, + "grad_norm": 0.28798801816123304, + "learning_rate": 0.00013725125150427521, + "loss": 3.0450315475463867, + "step": 4607, + "token_acc": 0.29462965656779383 + }, + { + "epoch": 2.7012606273819992, + "grad_norm": 0.29052676188547427, + "learning_rate": 0.00013724936865408465, + "loss": 3.0572915077209473, + "step": 4608, + "token_acc": 0.2932958411950096 + }, + { + "epoch": 2.7018469656992083, + "grad_norm": 0.30348748587655283, + "learning_rate": 0.00013724748517217688, + "loss": 3.0782089233398438, + "step": 4609, + "token_acc": 0.2909379369689147 + }, + { + "epoch": 2.7024333040164175, + "grad_norm": 0.3239671233552501, + "learning_rate": 0.00013724560105856965, + "loss": 3.027193307876587, + "step": 4610, + "token_acc": 0.29525377449405715 + }, + { + "epoch": 2.7030196423336266, + "grad_norm": 0.3000059637709648, + "learning_rate": 0.00013724371631328058, + "loss": 3.054741382598877, + "step": 4611, + "token_acc": 0.2928138952996532 + }, + { + "epoch": 2.7036059806508357, + "grad_norm": 0.3216486896668394, + "learning_rate": 0.00013724183093632742, + "loss": 3.0811262130737305, + "step": 4612, + "token_acc": 0.2890756432818811 + }, + { + "epoch": 2.704192318968045, + "grad_norm": 0.4335428447454894, + "learning_rate": 0.00013723994492772788, + "loss": 3.0160369873046875, + "step": 4613, + "token_acc": 0.2986327512286939 + }, + { + "epoch": 2.7047786572852535, + "grad_norm": 0.4681177517672436, + "learning_rate": 0.00013723805828749964, + "loss": 3.0675861835479736, + "step": 4614, + "token_acc": 0.2904853276025277 + }, + { + "epoch": 2.7053649956024626, + "grad_norm": 0.34053942585608354, + "learning_rate": 0.00013723617101566048, + "loss": 3.069357395172119, + "step": 4615, + "token_acc": 0.29021048659301035 + }, + { + "epoch": 2.7059513339196717, + "grad_norm": 0.32679711959989066, + "learning_rate": 0.00013723428311222805, + "loss": 3.074840545654297, + "step": 4616, + "token_acc": 0.29085700569806516 + }, + { + "epoch": 2.706537672236881, + "grad_norm": 0.3631607233836758, + "learning_rate": 0.00013723239457722014, + "loss": 3.1168670654296875, + "step": 4617, + "token_acc": 0.28500542278293595 + }, + { + "epoch": 2.7071240105540895, + "grad_norm": 0.30944801798831184, + "learning_rate": 0.0001372305054106545, + "loss": 3.1093590259552, + "step": 4618, + "token_acc": 0.28613807687250736 + }, + { + "epoch": 2.7077103488712986, + "grad_norm": 0.3091127128134372, + "learning_rate": 0.00013722861561254885, + "loss": 3.1028690338134766, + "step": 4619, + "token_acc": 0.2852865263601132 + }, + { + "epoch": 2.7082966871885077, + "grad_norm": 0.3125408254625326, + "learning_rate": 0.00013722672518292096, + "loss": 3.0512280464172363, + "step": 4620, + "token_acc": 0.29162727370885466 + }, + { + "epoch": 2.708883025505717, + "grad_norm": 0.3366863659368551, + "learning_rate": 0.00013722483412178857, + "loss": 3.052410840988159, + "step": 4621, + "token_acc": 0.2929979092407393 + }, + { + "epoch": 2.709469363822926, + "grad_norm": 0.29294366976051567, + "learning_rate": 0.0001372229424291694, + "loss": 3.072221279144287, + "step": 4622, + "token_acc": 0.29019422943433787 + }, + { + "epoch": 2.710055702140135, + "grad_norm": 0.2981835171320265, + "learning_rate": 0.00013722105010508133, + "loss": 3.0546960830688477, + "step": 4623, + "token_acc": 0.2925256309907228 + }, + { + "epoch": 2.710642040457344, + "grad_norm": 0.33784872283514544, + "learning_rate": 0.00013721915714954205, + "loss": 3.0673251152038574, + "step": 4624, + "token_acc": 0.2911468892864242 + }, + { + "epoch": 2.711228378774553, + "grad_norm": 0.3250408321371394, + "learning_rate": 0.0001372172635625694, + "loss": 3.0613882541656494, + "step": 4625, + "token_acc": 0.2907878127602516 + }, + { + "epoch": 2.711814717091762, + "grad_norm": 0.3551579961697743, + "learning_rate": 0.0001372153693441811, + "loss": 3.105496644973755, + "step": 4626, + "token_acc": 0.2864916365637308 + }, + { + "epoch": 2.712401055408971, + "grad_norm": 0.3292667703478197, + "learning_rate": 0.000137213474494395, + "loss": 3.1070785522460938, + "step": 4627, + "token_acc": 0.28521062014822157 + }, + { + "epoch": 2.71298739372618, + "grad_norm": 0.3249840887173426, + "learning_rate": 0.00013721157901322884, + "loss": 3.0705509185791016, + "step": 4628, + "token_acc": 0.29172157206918964 + }, + { + "epoch": 2.713573732043389, + "grad_norm": 0.3534857655964296, + "learning_rate": 0.00013720968290070051, + "loss": 3.059196710586548, + "step": 4629, + "token_acc": 0.29233970105376694 + }, + { + "epoch": 2.714160070360598, + "grad_norm": 0.30552468839914665, + "learning_rate": 0.00013720778615682774, + "loss": 3.0478081703186035, + "step": 4630, + "token_acc": 0.2924005032021958 + }, + { + "epoch": 2.714746408677807, + "grad_norm": 0.2932979069073854, + "learning_rate": 0.0001372058887816284, + "loss": 3.060421943664551, + "step": 4631, + "token_acc": 0.29230218651767614 + }, + { + "epoch": 2.715332746995016, + "grad_norm": 0.3198345435771246, + "learning_rate": 0.0001372039907751203, + "loss": 3.0909838676452637, + "step": 4632, + "token_acc": 0.288681456338142 + }, + { + "epoch": 2.7159190853122253, + "grad_norm": 0.3030597503644988, + "learning_rate": 0.00013720209213732124, + "loss": 3.087268352508545, + "step": 4633, + "token_acc": 0.2887917063387557 + }, + { + "epoch": 2.7165054236294344, + "grad_norm": 0.2949125865011794, + "learning_rate": 0.00013720019286824912, + "loss": 3.0773468017578125, + "step": 4634, + "token_acc": 0.28858600313312116 + }, + { + "epoch": 2.7170917619466435, + "grad_norm": 0.25734936655133034, + "learning_rate": 0.00013719829296792173, + "loss": 3.1264283657073975, + "step": 4635, + "token_acc": 0.2829478059224129 + }, + { + "epoch": 2.717678100263852, + "grad_norm": 0.31215351922702717, + "learning_rate": 0.00013719639243635692, + "loss": 3.0454394817352295, + "step": 4636, + "token_acc": 0.29405746197521654 + }, + { + "epoch": 2.7182644385810613, + "grad_norm": 0.303838470528605, + "learning_rate": 0.00013719449127357254, + "loss": 3.0922412872314453, + "step": 4637, + "token_acc": 0.2884730919880683 + }, + { + "epoch": 2.7188507768982704, + "grad_norm": 0.30572499640591616, + "learning_rate": 0.00013719258947958646, + "loss": 3.0443553924560547, + "step": 4638, + "token_acc": 0.2930546878847814 + }, + { + "epoch": 2.7194371152154795, + "grad_norm": 0.3125310572118373, + "learning_rate": 0.00013719068705441657, + "loss": 3.092749834060669, + "step": 4639, + "token_acc": 0.288975850388257 + }, + { + "epoch": 2.720023453532688, + "grad_norm": 0.3511025429775157, + "learning_rate": 0.0001371887839980807, + "loss": 3.0956149101257324, + "step": 4640, + "token_acc": 0.2858726763116649 + }, + { + "epoch": 2.7206097918498973, + "grad_norm": 0.3441957897563102, + "learning_rate": 0.00013718688031059674, + "loss": 3.0626916885375977, + "step": 4641, + "token_acc": 0.2915112795744941 + }, + { + "epoch": 2.7211961301671064, + "grad_norm": 0.3668622720711865, + "learning_rate": 0.0001371849759919826, + "loss": 3.081163167953491, + "step": 4642, + "token_acc": 0.28903033673022016 + }, + { + "epoch": 2.7217824684843155, + "grad_norm": 0.3864319472591011, + "learning_rate": 0.00013718307104225612, + "loss": 3.0808796882629395, + "step": 4643, + "token_acc": 0.28866641325676246 + }, + { + "epoch": 2.7223688068015246, + "grad_norm": 0.4052525282319413, + "learning_rate": 0.00013718116546143522, + "loss": 3.067661762237549, + "step": 4644, + "token_acc": 0.2921307622762245 + }, + { + "epoch": 2.7229551451187337, + "grad_norm": 0.3062027632945052, + "learning_rate": 0.0001371792592495378, + "loss": 3.08237624168396, + "step": 4645, + "token_acc": 0.2893445603683615 + }, + { + "epoch": 2.7235414834359424, + "grad_norm": 0.31900900898352386, + "learning_rate": 0.00013717735240658174, + "loss": 3.076756000518799, + "step": 4646, + "token_acc": 0.289236042966859 + }, + { + "epoch": 2.7241278217531515, + "grad_norm": 0.385087227463911, + "learning_rate": 0.000137175444932585, + "loss": 3.0756025314331055, + "step": 4647, + "token_acc": 0.29054068148395373 + }, + { + "epoch": 2.7247141600703606, + "grad_norm": 0.26957241858256603, + "learning_rate": 0.0001371735368275655, + "loss": 3.0599937438964844, + "step": 4648, + "token_acc": 0.2932118245969828 + }, + { + "epoch": 2.7253004983875697, + "grad_norm": 0.3796463293172065, + "learning_rate": 0.0001371716280915411, + "loss": 3.027343273162842, + "step": 4649, + "token_acc": 0.2979689018708606 + }, + { + "epoch": 2.7258868367047784, + "grad_norm": 0.36920687937172897, + "learning_rate": 0.0001371697187245298, + "loss": 3.0219480991363525, + "step": 4650, + "token_acc": 0.29866804488297843 + }, + { + "epoch": 2.7264731750219875, + "grad_norm": 0.2651930395952895, + "learning_rate": 0.00013716780872654948, + "loss": 3.023284673690796, + "step": 4651, + "token_acc": 0.29661877193029196 + }, + { + "epoch": 2.7270595133391966, + "grad_norm": 0.4031078134454338, + "learning_rate": 0.0001371658980976181, + "loss": 3.0579094886779785, + "step": 4652, + "token_acc": 0.2920567424228522 + }, + { + "epoch": 2.7276458516564057, + "grad_norm": 0.26127779771668336, + "learning_rate": 0.00013716398683775365, + "loss": 3.0470476150512695, + "step": 4653, + "token_acc": 0.29355656403503383 + }, + { + "epoch": 2.728232189973615, + "grad_norm": 0.303747678696168, + "learning_rate": 0.00013716207494697403, + "loss": 3.0965912342071533, + "step": 4654, + "token_acc": 0.2873409619808418 + }, + { + "epoch": 2.728818528290824, + "grad_norm": 0.29855832942073834, + "learning_rate": 0.00013716016242529722, + "loss": 3.099052906036377, + "step": 4655, + "token_acc": 0.28842650546005966 + }, + { + "epoch": 2.729404866608033, + "grad_norm": 0.3342339819084472, + "learning_rate": 0.00013715824927274116, + "loss": 3.014275550842285, + "step": 4656, + "token_acc": 0.29808178565981597 + }, + { + "epoch": 2.7299912049252417, + "grad_norm": 0.3043326481389358, + "learning_rate": 0.00013715633548932386, + "loss": 3.0775539875030518, + "step": 4657, + "token_acc": 0.2892878810066735 + }, + { + "epoch": 2.730577543242451, + "grad_norm": 0.2817202425683157, + "learning_rate": 0.00013715442107506328, + "loss": 3.0511786937713623, + "step": 4658, + "token_acc": 0.29335321687260335 + }, + { + "epoch": 2.73116388155966, + "grad_norm": 0.28392520773135965, + "learning_rate": 0.00013715250602997745, + "loss": 3.036102533340454, + "step": 4659, + "token_acc": 0.2954363658226723 + }, + { + "epoch": 2.731750219876869, + "grad_norm": 0.29323828946359787, + "learning_rate": 0.00013715059035408425, + "loss": 3.0630078315734863, + "step": 4660, + "token_acc": 0.2921022757440224 + }, + { + "epoch": 2.7323365581940777, + "grad_norm": 0.3278566182677005, + "learning_rate": 0.00013714867404740177, + "loss": 3.062596082687378, + "step": 4661, + "token_acc": 0.2921800965947674 + }, + { + "epoch": 2.732922896511287, + "grad_norm": 0.30987588372629177, + "learning_rate": 0.000137146757109948, + "loss": 3.0138630867004395, + "step": 4662, + "token_acc": 0.2994056424204828 + }, + { + "epoch": 2.733509234828496, + "grad_norm": 0.3056280754597164, + "learning_rate": 0.0001371448395417409, + "loss": 3.075791597366333, + "step": 4663, + "token_acc": 0.2885157277338287 + }, + { + "epoch": 2.734095573145705, + "grad_norm": 0.25058130906685194, + "learning_rate": 0.0001371429213427985, + "loss": 3.0427842140197754, + "step": 4664, + "token_acc": 0.294549490382951 + }, + { + "epoch": 2.734681911462914, + "grad_norm": 0.2921262056888084, + "learning_rate": 0.00013714100251313886, + "loss": 3.0665647983551025, + "step": 4665, + "token_acc": 0.2929076825920532 + }, + { + "epoch": 2.7352682497801233, + "grad_norm": 0.33380965684513686, + "learning_rate": 0.00013713908305277995, + "loss": 3.0743348598480225, + "step": 4666, + "token_acc": 0.29085381435748553 + }, + { + "epoch": 2.7358545880973324, + "grad_norm": 0.275615936048703, + "learning_rate": 0.00013713716296173984, + "loss": 3.074038505554199, + "step": 4667, + "token_acc": 0.29078432451843056 + }, + { + "epoch": 2.736440926414541, + "grad_norm": 0.28679042983555186, + "learning_rate": 0.00013713524224003655, + "loss": 3.008634090423584, + "step": 4668, + "token_acc": 0.2977739502596623 + }, + { + "epoch": 2.73702726473175, + "grad_norm": 0.29273435585158947, + "learning_rate": 0.00013713332088768814, + "loss": 3.1060876846313477, + "step": 4669, + "token_acc": 0.2859662969506267 + }, + { + "epoch": 2.7376136030489593, + "grad_norm": 0.2712365948103812, + "learning_rate": 0.00013713139890471265, + "loss": 3.068361520767212, + "step": 4670, + "token_acc": 0.29063215164730777 + }, + { + "epoch": 2.7381999413661684, + "grad_norm": 0.2821305526209546, + "learning_rate": 0.0001371294762911281, + "loss": 3.028829574584961, + "step": 4671, + "token_acc": 0.29626467888374913 + }, + { + "epoch": 2.738786279683377, + "grad_norm": 0.2817901581959042, + "learning_rate": 0.0001371275530469526, + "loss": 3.076486587524414, + "step": 4672, + "token_acc": 0.28893197327631187 + }, + { + "epoch": 2.739372618000586, + "grad_norm": 0.32906803019899245, + "learning_rate": 0.00013712562917220415, + "loss": 3.0902698040008545, + "step": 4673, + "token_acc": 0.28736995466164544 + }, + { + "epoch": 2.7399589563177953, + "grad_norm": 0.33837683482148806, + "learning_rate": 0.0001371237046669009, + "loss": 3.051807403564453, + "step": 4674, + "token_acc": 0.2962123534061396 + }, + { + "epoch": 2.7405452946350044, + "grad_norm": 0.3638746615834578, + "learning_rate": 0.00013712177953106088, + "loss": 3.0917587280273438, + "step": 4675, + "token_acc": 0.28699874265560626 + }, + { + "epoch": 2.7411316329522135, + "grad_norm": 0.3834274493168949, + "learning_rate": 0.00013711985376470222, + "loss": 3.0815956592559814, + "step": 4676, + "token_acc": 0.2897532678221256 + }, + { + "epoch": 2.7417179712694226, + "grad_norm": 0.2953184745806514, + "learning_rate": 0.00013711792736784297, + "loss": 3.06042218208313, + "step": 4677, + "token_acc": 0.2919671267407744 + }, + { + "epoch": 2.7423043095866317, + "grad_norm": 0.3141705413655958, + "learning_rate": 0.0001371160003405012, + "loss": 3.1010549068450928, + "step": 4678, + "token_acc": 0.28532810070035175 + }, + { + "epoch": 2.7428906479038404, + "grad_norm": 0.3089294994230565, + "learning_rate": 0.0001371140726826951, + "loss": 3.0901567935943604, + "step": 4679, + "token_acc": 0.2874934315574634 + }, + { + "epoch": 2.7434769862210495, + "grad_norm": 0.28533671772180463, + "learning_rate": 0.0001371121443944427, + "loss": 3.039623260498047, + "step": 4680, + "token_acc": 0.29444060437697606 + }, + { + "epoch": 2.7440633245382586, + "grad_norm": 0.30368584965023476, + "learning_rate": 0.00013711021547576212, + "loss": 3.1114554405212402, + "step": 4681, + "token_acc": 0.2857752536873687 + }, + { + "epoch": 2.7446496628554677, + "grad_norm": 0.35458459299404604, + "learning_rate": 0.00013710828592667152, + "loss": 3.0802512168884277, + "step": 4682, + "token_acc": 0.2884036477069367 + }, + { + "epoch": 2.7452360011726764, + "grad_norm": 0.3112718888385984, + "learning_rate": 0.00013710635574718902, + "loss": 3.021833896636963, + "step": 4683, + "token_acc": 0.29513349461409843 + }, + { + "epoch": 2.7458223394898855, + "grad_norm": 0.3562722640745905, + "learning_rate": 0.00013710442493733267, + "loss": 3.0417211055755615, + "step": 4684, + "token_acc": 0.2959423493383204 + }, + { + "epoch": 2.7464086778070946, + "grad_norm": 0.33513570712228946, + "learning_rate": 0.00013710249349712075, + "loss": 3.079073190689087, + "step": 4685, + "token_acc": 0.2885143173801939 + }, + { + "epoch": 2.7469950161243037, + "grad_norm": 0.32798886218575923, + "learning_rate": 0.00013710056142657127, + "loss": 3.06345796585083, + "step": 4686, + "token_acc": 0.29214969962656273 + }, + { + "epoch": 2.747581354441513, + "grad_norm": 0.3365384549801903, + "learning_rate": 0.00013709862872570244, + "loss": 3.029172897338867, + "step": 4687, + "token_acc": 0.29769283171167465 + }, + { + "epoch": 2.748167692758722, + "grad_norm": 0.26849010899258874, + "learning_rate": 0.00013709669539453242, + "loss": 3.05407452583313, + "step": 4688, + "token_acc": 0.2943078316815033 + }, + { + "epoch": 2.748754031075931, + "grad_norm": 0.3921573648411156, + "learning_rate": 0.0001370947614330794, + "loss": 3.086358070373535, + "step": 4689, + "token_acc": 0.2892593099671413 + }, + { + "epoch": 2.7493403693931397, + "grad_norm": 0.28813338622658896, + "learning_rate": 0.00013709282684136145, + "loss": 3.053274154663086, + "step": 4690, + "token_acc": 0.29485944523853913 + }, + { + "epoch": 2.749926707710349, + "grad_norm": 0.3579032688645566, + "learning_rate": 0.0001370908916193968, + "loss": 3.0473785400390625, + "step": 4691, + "token_acc": 0.2943172799536413 + }, + { + "epoch": 2.750513046027558, + "grad_norm": 0.3534504534079015, + "learning_rate": 0.0001370889557672037, + "loss": 3.0648417472839355, + "step": 4692, + "token_acc": 0.2914199837234759 + }, + { + "epoch": 2.751099384344767, + "grad_norm": 0.2740320608447911, + "learning_rate": 0.0001370870192848002, + "loss": 3.0985889434814453, + "step": 4693, + "token_acc": 0.2854227019689745 + }, + { + "epoch": 2.7516857226619758, + "grad_norm": 0.34343294461616114, + "learning_rate": 0.00013708508217220457, + "loss": 3.114053726196289, + "step": 4694, + "token_acc": 0.28550011651692087 + }, + { + "epoch": 2.752272060979185, + "grad_norm": 0.2939976027840481, + "learning_rate": 0.00013708314442943497, + "loss": 3.062159776687622, + "step": 4695, + "token_acc": 0.2916887021266064 + }, + { + "epoch": 2.752858399296394, + "grad_norm": 0.3406318833595693, + "learning_rate": 0.00013708120605650963, + "loss": 3.0345864295959473, + "step": 4696, + "token_acc": 0.2951055989843479 + }, + { + "epoch": 2.753444737613603, + "grad_norm": 0.2961960943850363, + "learning_rate": 0.0001370792670534468, + "loss": 3.0678000450134277, + "step": 4697, + "token_acc": 0.2892025213907601 + }, + { + "epoch": 2.754031075930812, + "grad_norm": 0.3089197689768157, + "learning_rate": 0.0001370773274202646, + "loss": 3.1106622219085693, + "step": 4698, + "token_acc": 0.28458876156757157 + }, + { + "epoch": 2.7546174142480213, + "grad_norm": 0.3101572549155367, + "learning_rate": 0.00013707538715698132, + "loss": 3.0618839263916016, + "step": 4699, + "token_acc": 0.29171420480943966 + }, + { + "epoch": 2.75520375256523, + "grad_norm": 0.27500096270362484, + "learning_rate": 0.00013707344626361515, + "loss": 3.071516513824463, + "step": 4700, + "token_acc": 0.290153928890332 + }, + { + "epoch": 2.755790090882439, + "grad_norm": 0.26219293122341814, + "learning_rate": 0.00013707150474018433, + "loss": 3.0642361640930176, + "step": 4701, + "token_acc": 0.29142087714798665 + }, + { + "epoch": 2.756376429199648, + "grad_norm": 0.28405478903105286, + "learning_rate": 0.00013706956258670712, + "loss": 3.076244354248047, + "step": 4702, + "token_acc": 0.29035838883602916 + }, + { + "epoch": 2.7569627675168573, + "grad_norm": 0.31324206999762183, + "learning_rate": 0.00013706761980320173, + "loss": 3.0728230476379395, + "step": 4703, + "token_acc": 0.2904097509251044 + }, + { + "epoch": 2.757549105834066, + "grad_norm": 0.25028313187446083, + "learning_rate": 0.00013706567638968644, + "loss": 3.0894627571105957, + "step": 4704, + "token_acc": 0.2875475857345793 + }, + { + "epoch": 2.758135444151275, + "grad_norm": 0.3101241656121886, + "learning_rate": 0.00013706373234617948, + "loss": 3.0572173595428467, + "step": 4705, + "token_acc": 0.2940423514538559 + }, + { + "epoch": 2.758721782468484, + "grad_norm": 0.3085137301156558, + "learning_rate": 0.00013706178767269913, + "loss": 3.0969886779785156, + "step": 4706, + "token_acc": 0.2885033629762671 + }, + { + "epoch": 2.7593081207856933, + "grad_norm": 0.3162419988991531, + "learning_rate": 0.00013705984236926367, + "loss": 3.051300048828125, + "step": 4707, + "token_acc": 0.2953059884492208 + }, + { + "epoch": 2.7598944591029024, + "grad_norm": 0.34356460213921486, + "learning_rate": 0.00013705789643589134, + "loss": 3.0466763973236084, + "step": 4708, + "token_acc": 0.29424901836879086 + }, + { + "epoch": 2.7604807974201115, + "grad_norm": 0.3572469750640098, + "learning_rate": 0.00013705594987260044, + "loss": 3.0647950172424316, + "step": 4709, + "token_acc": 0.2927943059900481 + }, + { + "epoch": 2.7610671357373207, + "grad_norm": 0.34518077114134166, + "learning_rate": 0.00013705400267940925, + "loss": 3.074469804763794, + "step": 4710, + "token_acc": 0.28942934603648496 + }, + { + "epoch": 2.7616534740545293, + "grad_norm": 0.3866458586441254, + "learning_rate": 0.00013705205485633603, + "loss": 3.1213059425354004, + "step": 4711, + "token_acc": 0.2843682004891753 + }, + { + "epoch": 2.7622398123717384, + "grad_norm": 0.3630731899911009, + "learning_rate": 0.00013705010640339914, + "loss": 3.0784640312194824, + "step": 4712, + "token_acc": 0.288812678879881 + }, + { + "epoch": 2.7628261506889475, + "grad_norm": 0.305861782829285, + "learning_rate": 0.00013704815732061684, + "loss": 3.07456636428833, + "step": 4713, + "token_acc": 0.29005863726072445 + }, + { + "epoch": 2.7634124890061567, + "grad_norm": 0.33521756626415367, + "learning_rate": 0.00013704620760800743, + "loss": 3.08918833732605, + "step": 4714, + "token_acc": 0.2880615956485588 + }, + { + "epoch": 2.7639988273233653, + "grad_norm": 0.3427837982771273, + "learning_rate": 0.0001370442572655893, + "loss": 3.075580596923828, + "step": 4715, + "token_acc": 0.2889680260386751 + }, + { + "epoch": 2.7645851656405744, + "grad_norm": 0.3191998850909233, + "learning_rate": 0.0001370423062933807, + "loss": 3.051447868347168, + "step": 4716, + "token_acc": 0.2932676596776404 + }, + { + "epoch": 2.7651715039577835, + "grad_norm": 0.33656892160255814, + "learning_rate": 0.00013704035469139992, + "loss": 3.082031726837158, + "step": 4717, + "token_acc": 0.28997611797562406 + }, + { + "epoch": 2.7657578422749927, + "grad_norm": 0.32789469239635893, + "learning_rate": 0.00013703840245966542, + "loss": 3.0598161220550537, + "step": 4718, + "token_acc": 0.2942095977053003 + }, + { + "epoch": 2.7663441805922018, + "grad_norm": 0.29972958430977487, + "learning_rate": 0.00013703644959819542, + "loss": 3.059307098388672, + "step": 4719, + "token_acc": 0.29243407766810886 + }, + { + "epoch": 2.766930518909411, + "grad_norm": 0.27789885280737897, + "learning_rate": 0.0001370344961070083, + "loss": 3.053579330444336, + "step": 4720, + "token_acc": 0.29429505310434206 + }, + { + "epoch": 2.76751685722662, + "grad_norm": 0.32262742642120495, + "learning_rate": 0.00013703254198612244, + "loss": 3.0015058517456055, + "step": 4721, + "token_acc": 0.30103998977897023 + }, + { + "epoch": 2.7681031955438287, + "grad_norm": 0.3475917455122131, + "learning_rate": 0.0001370305872355562, + "loss": 3.0520219802856445, + "step": 4722, + "token_acc": 0.2935668502856675 + }, + { + "epoch": 2.7686895338610378, + "grad_norm": 0.3958104136592667, + "learning_rate": 0.00013702863185532788, + "loss": 3.037843942642212, + "step": 4723, + "token_acc": 0.29660543626660046 + }, + { + "epoch": 2.769275872178247, + "grad_norm": 0.38220230203313876, + "learning_rate": 0.0001370266758454559, + "loss": 3.0557992458343506, + "step": 4724, + "token_acc": 0.29322974180995953 + }, + { + "epoch": 2.769862210495456, + "grad_norm": 0.31961681829721456, + "learning_rate": 0.0001370247192059586, + "loss": 3.0636849403381348, + "step": 4725, + "token_acc": 0.2914145267828588 + }, + { + "epoch": 2.7704485488126647, + "grad_norm": 0.35240628567111343, + "learning_rate": 0.0001370227619368544, + "loss": 3.0894243717193604, + "step": 4726, + "token_acc": 0.2887275646478394 + }, + { + "epoch": 2.771034887129874, + "grad_norm": 0.32621550091359836, + "learning_rate": 0.00013702080403816164, + "loss": 3.0906317234039307, + "step": 4727, + "token_acc": 0.288836742754979 + }, + { + "epoch": 2.771621225447083, + "grad_norm": 0.30997750700599325, + "learning_rate": 0.00013701884550989878, + "loss": 3.054356098175049, + "step": 4728, + "token_acc": 0.2930217778268492 + }, + { + "epoch": 2.772207563764292, + "grad_norm": 0.3320253424704229, + "learning_rate": 0.00013701688635208415, + "loss": 3.126520872116089, + "step": 4729, + "token_acc": 0.28327438125474846 + }, + { + "epoch": 2.772793902081501, + "grad_norm": 0.38854927055120986, + "learning_rate": 0.00013701492656473618, + "loss": 3.0934243202209473, + "step": 4730, + "token_acc": 0.2862656729371103 + }, + { + "epoch": 2.7733802403987102, + "grad_norm": 0.3285079400934574, + "learning_rate": 0.0001370129661478733, + "loss": 3.027575731277466, + "step": 4731, + "token_acc": 0.2970539716135458 + }, + { + "epoch": 2.7739665787159193, + "grad_norm": 0.29171581241204786, + "learning_rate": 0.0001370110051015139, + "loss": 3.0499985218048096, + "step": 4732, + "token_acc": 0.29461574819550257 + }, + { + "epoch": 2.774552917033128, + "grad_norm": 0.3742543628374655, + "learning_rate": 0.00013700904342567636, + "loss": 3.0911457538604736, + "step": 4733, + "token_acc": 0.2874119127949567 + }, + { + "epoch": 2.775139255350337, + "grad_norm": 0.3434006243030413, + "learning_rate": 0.00013700708112037918, + "loss": 3.0453970432281494, + "step": 4734, + "token_acc": 0.29446659241077955 + }, + { + "epoch": 2.7757255936675462, + "grad_norm": 0.3114028043875064, + "learning_rate": 0.0001370051181856408, + "loss": 3.06387996673584, + "step": 4735, + "token_acc": 0.29158163976647916 + }, + { + "epoch": 2.7763119319847553, + "grad_norm": 0.3533483550458655, + "learning_rate": 0.0001370031546214796, + "loss": 3.063957452774048, + "step": 4736, + "token_acc": 0.29160827791240523 + }, + { + "epoch": 2.776898270301964, + "grad_norm": 0.37500628721929075, + "learning_rate": 0.00013700119042791404, + "loss": 3.0680840015411377, + "step": 4737, + "token_acc": 0.2912414290587785 + }, + { + "epoch": 2.777484608619173, + "grad_norm": 0.2934992988918814, + "learning_rate": 0.0001369992256049626, + "loss": 3.10247802734375, + "step": 4738, + "token_acc": 0.2858267779804424 + }, + { + "epoch": 2.7780709469363822, + "grad_norm": 0.28676405215913686, + "learning_rate": 0.0001369972601526437, + "loss": 3.0179991722106934, + "step": 4739, + "token_acc": 0.2991765299144848 + }, + { + "epoch": 2.7786572852535913, + "grad_norm": 0.2921200128809918, + "learning_rate": 0.00013699529407097582, + "loss": 3.035616159439087, + "step": 4740, + "token_acc": 0.29539959713907105 + }, + { + "epoch": 2.7792436235708005, + "grad_norm": 0.2910749418337646, + "learning_rate": 0.00013699332735997742, + "loss": 3.101915121078491, + "step": 4741, + "token_acc": 0.28593646730521083 + }, + { + "epoch": 2.7798299618880096, + "grad_norm": 0.32873953854336596, + "learning_rate": 0.000136991360019667, + "loss": 3.0812268257141113, + "step": 4742, + "token_acc": 0.28828423831697986 + }, + { + "epoch": 2.7804163002052187, + "grad_norm": 0.3149760976726917, + "learning_rate": 0.00013698939205006305, + "loss": 3.0557236671447754, + "step": 4743, + "token_acc": 0.2927200628677193 + }, + { + "epoch": 2.7810026385224274, + "grad_norm": 0.2963351197516334, + "learning_rate": 0.000136987423451184, + "loss": 3.0593719482421875, + "step": 4744, + "token_acc": 0.29033246275825186 + }, + { + "epoch": 2.7815889768396365, + "grad_norm": 0.32678462102514294, + "learning_rate": 0.00013698545422304837, + "loss": 3.0411734580993652, + "step": 4745, + "token_acc": 0.29531782088465086 + }, + { + "epoch": 2.7821753151568456, + "grad_norm": 0.2819593216147927, + "learning_rate": 0.00013698348436567468, + "loss": 3.093870162963867, + "step": 4746, + "token_acc": 0.2879781076339647 + }, + { + "epoch": 2.7827616534740547, + "grad_norm": 0.3611208088463207, + "learning_rate": 0.0001369815138790814, + "loss": 3.072493076324463, + "step": 4747, + "token_acc": 0.28977192744812125 + }, + { + "epoch": 2.7833479917912634, + "grad_norm": 0.38582350438606794, + "learning_rate": 0.00013697954276328708, + "loss": 3.0771806240081787, + "step": 4748, + "token_acc": 0.28853124358537974 + }, + { + "epoch": 2.7839343301084725, + "grad_norm": 0.3232222985530676, + "learning_rate": 0.00013697757101831018, + "loss": 3.0764048099517822, + "step": 4749, + "token_acc": 0.288356383414513 + }, + { + "epoch": 2.7845206684256816, + "grad_norm": 0.3178239880814871, + "learning_rate": 0.00013697559864416927, + "loss": 3.041761875152588, + "step": 4750, + "token_acc": 0.29676185633088076 + }, + { + "epoch": 2.7851070067428907, + "grad_norm": 0.3123355856322726, + "learning_rate": 0.00013697362564088288, + "loss": 3.103762149810791, + "step": 4751, + "token_acc": 0.28761483352149 + }, + { + "epoch": 2.7856933450601, + "grad_norm": 0.38316026906652556, + "learning_rate": 0.00013697165200846949, + "loss": 3.0562524795532227, + "step": 4752, + "token_acc": 0.29391848564906253 + }, + { + "epoch": 2.786279683377309, + "grad_norm": 0.2970264436833723, + "learning_rate": 0.0001369696777469477, + "loss": 3.0503549575805664, + "step": 4753, + "token_acc": 0.29155181414445064 + }, + { + "epoch": 2.7868660216945176, + "grad_norm": 0.32899263769294973, + "learning_rate": 0.000136967702856336, + "loss": 3.045626163482666, + "step": 4754, + "token_acc": 0.2926578099406911 + }, + { + "epoch": 2.7874523600117267, + "grad_norm": 0.318241058702945, + "learning_rate": 0.000136965727336653, + "loss": 3.0914764404296875, + "step": 4755, + "token_acc": 0.2873910179209256 + }, + { + "epoch": 2.788038698328936, + "grad_norm": 0.29274760618533635, + "learning_rate": 0.00013696375118791722, + "loss": 3.1226277351379395, + "step": 4756, + "token_acc": 0.2827084504908418 + }, + { + "epoch": 2.788625036646145, + "grad_norm": 0.2786488709164468, + "learning_rate": 0.00013696177441014723, + "loss": 3.078016519546509, + "step": 4757, + "token_acc": 0.2892528642891722 + }, + { + "epoch": 2.7892113749633536, + "grad_norm": 0.2826372736297748, + "learning_rate": 0.0001369597970033616, + "loss": 3.054342269897461, + "step": 4758, + "token_acc": 0.2914980596693791 + }, + { + "epoch": 2.7897977132805627, + "grad_norm": 0.3082364879523241, + "learning_rate": 0.00013695781896757892, + "loss": 3.074385166168213, + "step": 4759, + "token_acc": 0.2888890055268918 + }, + { + "epoch": 2.790384051597772, + "grad_norm": 0.33923244658162227, + "learning_rate": 0.00013695584030281774, + "loss": 3.048893690109253, + "step": 4760, + "token_acc": 0.29424171391766185 + }, + { + "epoch": 2.790970389914981, + "grad_norm": 0.3490246916468502, + "learning_rate": 0.0001369538610090967, + "loss": 3.0290298461914062, + "step": 4761, + "token_acc": 0.2954485902945184 + }, + { + "epoch": 2.79155672823219, + "grad_norm": 0.3281602696443412, + "learning_rate": 0.00013695188108643432, + "loss": 3.055243492126465, + "step": 4762, + "token_acc": 0.2942199929464321 + }, + { + "epoch": 2.792143066549399, + "grad_norm": 0.27840673885928363, + "learning_rate": 0.00013694990053484923, + "loss": 3.0571136474609375, + "step": 4763, + "token_acc": 0.2923231822710476 + }, + { + "epoch": 2.7927294048666083, + "grad_norm": 0.3539115109604466, + "learning_rate": 0.00013694791935436008, + "loss": 3.0462684631347656, + "step": 4764, + "token_acc": 0.2942701933316949 + }, + { + "epoch": 2.793315743183817, + "grad_norm": 0.3918344182511708, + "learning_rate": 0.0001369459375449854, + "loss": 3.1047658920288086, + "step": 4765, + "token_acc": 0.28669970372929093 + }, + { + "epoch": 2.793902081501026, + "grad_norm": 0.3037531971428858, + "learning_rate": 0.00013694395510674388, + "loss": 3.0559329986572266, + "step": 4766, + "token_acc": 0.29373066842229484 + }, + { + "epoch": 2.794488419818235, + "grad_norm": 0.2528598112190445, + "learning_rate": 0.0001369419720396541, + "loss": 3.0698189735412598, + "step": 4767, + "token_acc": 0.29163451370430704 + }, + { + "epoch": 2.7950747581354443, + "grad_norm": 0.31890910380488735, + "learning_rate": 0.0001369399883437347, + "loss": 3.083059549331665, + "step": 4768, + "token_acc": 0.2885939555248953 + }, + { + "epoch": 2.795661096452653, + "grad_norm": 0.3922064491103291, + "learning_rate": 0.00013693800401900428, + "loss": 3.067549705505371, + "step": 4769, + "token_acc": 0.29041119612102895 + }, + { + "epoch": 2.796247434769862, + "grad_norm": 0.3803570926882513, + "learning_rate": 0.00013693601906548155, + "loss": 3.0943784713745117, + "step": 4770, + "token_acc": 0.28819645926623383 + }, + { + "epoch": 2.796833773087071, + "grad_norm": 0.3189023354880703, + "learning_rate": 0.0001369340334831851, + "loss": 3.1087989807128906, + "step": 4771, + "token_acc": 0.28510372558371166 + }, + { + "epoch": 2.7974201114042803, + "grad_norm": 0.3268408381541908, + "learning_rate": 0.00013693204727213362, + "loss": 3.0855469703674316, + "step": 4772, + "token_acc": 0.2884823142628417 + }, + { + "epoch": 2.7980064497214894, + "grad_norm": 0.33730087300884776, + "learning_rate": 0.00013693006043234574, + "loss": 3.0664196014404297, + "step": 4773, + "token_acc": 0.29234207871104767 + }, + { + "epoch": 2.7985927880386985, + "grad_norm": 0.29093193732917083, + "learning_rate": 0.0001369280729638401, + "loss": 3.084001302719116, + "step": 4774, + "token_acc": 0.2878298762053106 + }, + { + "epoch": 2.7991791263559076, + "grad_norm": 0.3350478652774217, + "learning_rate": 0.00013692608486663544, + "loss": 3.081118583679199, + "step": 4775, + "token_acc": 0.28883791063626874 + }, + { + "epoch": 2.7997654646731163, + "grad_norm": 0.314888650428089, + "learning_rate": 0.00013692409614075038, + "loss": 3.0622735023498535, + "step": 4776, + "token_acc": 0.2917783508598458 + }, + { + "epoch": 2.8003518029903254, + "grad_norm": 0.28371550439869797, + "learning_rate": 0.00013692210678620362, + "loss": 3.080512046813965, + "step": 4777, + "token_acc": 0.28835288712888313 + }, + { + "epoch": 2.8009381413075345, + "grad_norm": 0.34206857932904094, + "learning_rate": 0.00013692011680301386, + "loss": 3.092446804046631, + "step": 4778, + "token_acc": 0.28570601618813674 + }, + { + "epoch": 2.8015244796247436, + "grad_norm": 0.2993711497932688, + "learning_rate": 0.00013691812619119978, + "loss": 3.0532729625701904, + "step": 4779, + "token_acc": 0.29376728639620686 + }, + { + "epoch": 2.8021108179419523, + "grad_norm": 0.3081203422916144, + "learning_rate": 0.00013691613495078004, + "loss": 3.099503517150879, + "step": 4780, + "token_acc": 0.2882668725803604 + }, + { + "epoch": 2.8026971562591614, + "grad_norm": 0.32324740218244474, + "learning_rate": 0.00013691414308177342, + "loss": 3.070456027984619, + "step": 4781, + "token_acc": 0.2907130998642137 + }, + { + "epoch": 2.8032834945763705, + "grad_norm": 0.26938460344931686, + "learning_rate": 0.00013691215058419856, + "loss": 3.0482892990112305, + "step": 4782, + "token_acc": 0.2915497195864094 + }, + { + "epoch": 2.8038698328935796, + "grad_norm": 0.31309943407560015, + "learning_rate": 0.00013691015745807426, + "loss": 3.0742347240448, + "step": 4783, + "token_acc": 0.28983569375214285 + }, + { + "epoch": 2.8044561712107887, + "grad_norm": 0.3236097487548583, + "learning_rate": 0.00013690816370341916, + "loss": 3.070676803588867, + "step": 4784, + "token_acc": 0.2899154878846546 + }, + { + "epoch": 2.805042509527998, + "grad_norm": 0.3005695172124386, + "learning_rate": 0.00013690616932025203, + "loss": 3.065809726715088, + "step": 4785, + "token_acc": 0.29104199703561945 + }, + { + "epoch": 2.805628847845207, + "grad_norm": 0.3164094430453737, + "learning_rate": 0.0001369041743085916, + "loss": 3.039031505584717, + "step": 4786, + "token_acc": 0.2948242068489255 + }, + { + "epoch": 2.8062151861624156, + "grad_norm": 0.30348470507080993, + "learning_rate": 0.0001369021786684566, + "loss": 3.0371341705322266, + "step": 4787, + "token_acc": 0.2938407967748096 + }, + { + "epoch": 2.8068015244796247, + "grad_norm": 0.3522932833056971, + "learning_rate": 0.00013690018239986577, + "loss": 3.0759778022766113, + "step": 4788, + "token_acc": 0.2890363706895648 + }, + { + "epoch": 2.807387862796834, + "grad_norm": 0.3098380097115769, + "learning_rate": 0.00013689818550283788, + "loss": 3.07541561126709, + "step": 4789, + "token_acc": 0.28975839950123805 + }, + { + "epoch": 2.807974201114043, + "grad_norm": 0.2581259928498899, + "learning_rate": 0.00013689618797739172, + "loss": 3.065826892852783, + "step": 4790, + "token_acc": 0.2910118737510011 + }, + { + "epoch": 2.8085605394312516, + "grad_norm": 0.2866558141754139, + "learning_rate": 0.00013689418982354597, + "loss": 3.0583341121673584, + "step": 4791, + "token_acc": 0.2925796568310867 + }, + { + "epoch": 2.8091468777484607, + "grad_norm": 0.29968475396954025, + "learning_rate": 0.00013689219104131946, + "loss": 3.068889856338501, + "step": 4792, + "token_acc": 0.28877639800680216 + }, + { + "epoch": 2.80973321606567, + "grad_norm": 0.27055266527258837, + "learning_rate": 0.00013689019163073098, + "loss": 3.0700814723968506, + "step": 4793, + "token_acc": 0.2911559246078819 + }, + { + "epoch": 2.810319554382879, + "grad_norm": 0.26627702526807207, + "learning_rate": 0.00013688819159179925, + "loss": 3.068039655685425, + "step": 4794, + "token_acc": 0.29043226099697067 + }, + { + "epoch": 2.810905892700088, + "grad_norm": 0.31406922780365176, + "learning_rate": 0.00013688619092454312, + "loss": 3.093127727508545, + "step": 4795, + "token_acc": 0.2861608541795642 + }, + { + "epoch": 2.811492231017297, + "grad_norm": 0.32985388667681365, + "learning_rate": 0.00013688418962898134, + "loss": 3.0470056533813477, + "step": 4796, + "token_acc": 0.293848686846234 + }, + { + "epoch": 2.8120785693345063, + "grad_norm": 0.33833355794818615, + "learning_rate": 0.00013688218770513275, + "loss": 3.0054283142089844, + "step": 4797, + "token_acc": 0.30026423058952423 + }, + { + "epoch": 2.812664907651715, + "grad_norm": 0.43280938093342003, + "learning_rate": 0.0001368801851530161, + "loss": 3.0971732139587402, + "step": 4798, + "token_acc": 0.2863970080762149 + }, + { + "epoch": 2.813251245968924, + "grad_norm": 0.37319613678111735, + "learning_rate": 0.00013687818197265025, + "loss": 3.0789685249328613, + "step": 4799, + "token_acc": 0.2891054159617353 + }, + { + "epoch": 2.813837584286133, + "grad_norm": 0.2826209236016134, + "learning_rate": 0.00013687617816405398, + "loss": 3.068892240524292, + "step": 4800, + "token_acc": 0.2920483832542828 + }, + { + "epoch": 2.8144239226033423, + "grad_norm": 0.38002410934182995, + "learning_rate": 0.00013687417372724618, + "loss": 3.0782599449157715, + "step": 4801, + "token_acc": 0.2892367039616482 + }, + { + "epoch": 2.815010260920551, + "grad_norm": 0.31998029321316696, + "learning_rate": 0.0001368721686622456, + "loss": 3.0505943298339844, + "step": 4802, + "token_acc": 0.29189764665902046 + }, + { + "epoch": 2.81559659923776, + "grad_norm": 0.3391896399130213, + "learning_rate": 0.00013687016296907108, + "loss": 3.0685372352600098, + "step": 4803, + "token_acc": 0.29149517735398534 + }, + { + "epoch": 2.816182937554969, + "grad_norm": 0.30201905973665605, + "learning_rate": 0.00013686815664774152, + "loss": 3.0228066444396973, + "step": 4804, + "token_acc": 0.29571094940626347 + }, + { + "epoch": 2.8167692758721783, + "grad_norm": 0.3284814038437341, + "learning_rate": 0.00013686614969827575, + "loss": 3.0386509895324707, + "step": 4805, + "token_acc": 0.2949472613263569 + }, + { + "epoch": 2.8173556141893874, + "grad_norm": 0.29273330648567913, + "learning_rate": 0.00013686414212069257, + "loss": 3.054908275604248, + "step": 4806, + "token_acc": 0.29197934453247504 + }, + { + "epoch": 2.8179419525065965, + "grad_norm": 0.33616106448900057, + "learning_rate": 0.00013686213391501088, + "loss": 3.057896852493286, + "step": 4807, + "token_acc": 0.2934615757291406 + }, + { + "epoch": 2.818528290823805, + "grad_norm": 0.27502343073472063, + "learning_rate": 0.00013686012508124957, + "loss": 3.110095739364624, + "step": 4808, + "token_acc": 0.2847718423251902 + }, + { + "epoch": 2.8191146291410143, + "grad_norm": 0.3023793785635317, + "learning_rate": 0.00013685811561942745, + "loss": 3.03559947013855, + "step": 4809, + "token_acc": 0.2954912308136777 + }, + { + "epoch": 2.8197009674582234, + "grad_norm": 0.2761874824929722, + "learning_rate": 0.00013685610552956342, + "loss": 3.0418167114257812, + "step": 4810, + "token_acc": 0.2957558256841068 + }, + { + "epoch": 2.8202873057754325, + "grad_norm": 0.29874201407219236, + "learning_rate": 0.00013685409481167641, + "loss": 3.0940213203430176, + "step": 4811, + "token_acc": 0.28725322847766227 + }, + { + "epoch": 2.820873644092641, + "grad_norm": 0.29095529846725116, + "learning_rate": 0.00013685208346578522, + "loss": 3.084400177001953, + "step": 4812, + "token_acc": 0.28848042062143014 + }, + { + "epoch": 2.8214599824098503, + "grad_norm": 0.2871357032288949, + "learning_rate": 0.00013685007149190885, + "loss": 3.0701496601104736, + "step": 4813, + "token_acc": 0.2904061270231468 + }, + { + "epoch": 2.8220463207270594, + "grad_norm": 0.32193496555587703, + "learning_rate": 0.0001368480588900661, + "loss": 3.0730557441711426, + "step": 4814, + "token_acc": 0.2909905784663084 + }, + { + "epoch": 2.8226326590442685, + "grad_norm": 0.26608695957878253, + "learning_rate": 0.00013684604566027592, + "loss": 3.0878686904907227, + "step": 4815, + "token_acc": 0.2904139087091064 + }, + { + "epoch": 2.8232189973614776, + "grad_norm": 0.24726333637178188, + "learning_rate": 0.0001368440318025572, + "loss": 3.069044589996338, + "step": 4816, + "token_acc": 0.2899743770096463 + }, + { + "epoch": 2.8238053356786867, + "grad_norm": 0.29347042842533366, + "learning_rate": 0.0001368420173169289, + "loss": 3.0654702186584473, + "step": 4817, + "token_acc": 0.29164684980183136 + }, + { + "epoch": 2.824391673995896, + "grad_norm": 0.27017825135396795, + "learning_rate": 0.0001368400022034099, + "loss": 3.0691819190979004, + "step": 4818, + "token_acc": 0.29113239692176235 + }, + { + "epoch": 2.8249780123131045, + "grad_norm": 0.2818885973854282, + "learning_rate": 0.00013683798646201914, + "loss": 3.080728530883789, + "step": 4819, + "token_acc": 0.2890754983519071 + }, + { + "epoch": 2.8255643506303136, + "grad_norm": 0.291875643258975, + "learning_rate": 0.0001368359700927756, + "loss": 3.0732407569885254, + "step": 4820, + "token_acc": 0.2907353627547581 + }, + { + "epoch": 2.8261506889475227, + "grad_norm": 0.24077896518560649, + "learning_rate": 0.00013683395309569814, + "loss": 3.044919967651367, + "step": 4821, + "token_acc": 0.29268044003219745 + }, + { + "epoch": 2.826737027264732, + "grad_norm": 0.33859762739920635, + "learning_rate": 0.0001368319354708058, + "loss": 3.0237138271331787, + "step": 4822, + "token_acc": 0.29725946113952373 + }, + { + "epoch": 2.8273233655819405, + "grad_norm": 0.3459640503824454, + "learning_rate": 0.00013682991721811744, + "loss": 3.0350289344787598, + "step": 4823, + "token_acc": 0.2947752440249152 + }, + { + "epoch": 2.8279097038991496, + "grad_norm": 0.3104087793704534, + "learning_rate": 0.00013682789833765208, + "loss": 3.067493200302124, + "step": 4824, + "token_acc": 0.29205522743757123 + }, + { + "epoch": 2.8284960422163588, + "grad_norm": 0.3487878820327055, + "learning_rate": 0.00013682587882942864, + "loss": 3.092806100845337, + "step": 4825, + "token_acc": 0.2871261078162629 + }, + { + "epoch": 2.829082380533568, + "grad_norm": 0.35960622493384414, + "learning_rate": 0.00013682385869346616, + "loss": 3.0770695209503174, + "step": 4826, + "token_acc": 0.28979139452954916 + }, + { + "epoch": 2.829668718850777, + "grad_norm": 0.297385156261709, + "learning_rate": 0.00013682183792978355, + "loss": 3.0324149131774902, + "step": 4827, + "token_acc": 0.2965683672491785 + }, + { + "epoch": 2.830255057167986, + "grad_norm": 0.33746750407006637, + "learning_rate": 0.00013681981653839982, + "loss": 3.0523533821105957, + "step": 4828, + "token_acc": 0.29146112600536195 + }, + { + "epoch": 2.830841395485195, + "grad_norm": 0.31423922734077525, + "learning_rate": 0.00013681779451933397, + "loss": 3.107745409011841, + "step": 4829, + "token_acc": 0.28478826676907987 + }, + { + "epoch": 2.831427733802404, + "grad_norm": 0.31190200623331843, + "learning_rate": 0.00013681577187260496, + "loss": 3.0194432735443115, + "step": 4830, + "token_acc": 0.2982536042550585 + }, + { + "epoch": 2.832014072119613, + "grad_norm": 0.36441946283135274, + "learning_rate": 0.0001368137485982318, + "loss": 3.0998167991638184, + "step": 4831, + "token_acc": 0.2873424716617471 + }, + { + "epoch": 2.832600410436822, + "grad_norm": 0.3407404971718265, + "learning_rate": 0.00013681172469623353, + "loss": 3.035048484802246, + "step": 4832, + "token_acc": 0.2969198471615159 + }, + { + "epoch": 2.833186748754031, + "grad_norm": 0.3662600972578241, + "learning_rate": 0.00013680970016662913, + "loss": 3.0480313301086426, + "step": 4833, + "token_acc": 0.2942160598636145 + }, + { + "epoch": 2.83377308707124, + "grad_norm": 0.3742693149677501, + "learning_rate": 0.0001368076750094376, + "loss": 3.071227550506592, + "step": 4834, + "token_acc": 0.29027533051552856 + }, + { + "epoch": 2.834359425388449, + "grad_norm": 0.3021615753510456, + "learning_rate": 0.00013680564922467802, + "loss": 3.0871729850769043, + "step": 4835, + "token_acc": 0.2890503492985797 + }, + { + "epoch": 2.834945763705658, + "grad_norm": 0.3593718784822757, + "learning_rate": 0.00013680362281236937, + "loss": 3.021775722503662, + "step": 4836, + "token_acc": 0.29833020711727903 + }, + { + "epoch": 2.835532102022867, + "grad_norm": 0.31635056119294747, + "learning_rate": 0.0001368015957725307, + "loss": 3.0891332626342773, + "step": 4837, + "token_acc": 0.2888171187570303 + }, + { + "epoch": 2.8361184403400763, + "grad_norm": 0.3044507510909209, + "learning_rate": 0.00013679956810518106, + "loss": 3.085489273071289, + "step": 4838, + "token_acc": 0.28895991304572866 + }, + { + "epoch": 2.8367047786572854, + "grad_norm": 0.3356597988160041, + "learning_rate": 0.0001367975398103395, + "loss": 3.0551071166992188, + "step": 4839, + "token_acc": 0.29262468409923154 + }, + { + "epoch": 2.8372911169744945, + "grad_norm": 0.33622878549585383, + "learning_rate": 0.00013679551088802505, + "loss": 3.07496976852417, + "step": 4840, + "token_acc": 0.2883614402199578 + }, + { + "epoch": 2.837877455291703, + "grad_norm": 0.36570114198749076, + "learning_rate": 0.00013679348133825679, + "loss": 3.0654499530792236, + "step": 4841, + "token_acc": 0.29034832066221217 + }, + { + "epoch": 2.8384637936089123, + "grad_norm": 0.3979051483140825, + "learning_rate": 0.0001367914511610538, + "loss": 3.06706166267395, + "step": 4842, + "token_acc": 0.29139474650242686 + }, + { + "epoch": 2.8390501319261214, + "grad_norm": 0.40268832825795103, + "learning_rate": 0.00013678942035643508, + "loss": 3.0585060119628906, + "step": 4843, + "token_acc": 0.2907058073012075 + }, + { + "epoch": 2.8396364702433305, + "grad_norm": 0.35173387747869767, + "learning_rate": 0.00013678738892441977, + "loss": 3.001384735107422, + "step": 4844, + "token_acc": 0.3005223052981791 + }, + { + "epoch": 2.840222808560539, + "grad_norm": 0.3094901798578128, + "learning_rate": 0.00013678535686502698, + "loss": 3.0669188499450684, + "step": 4845, + "token_acc": 0.28984726470543176 + }, + { + "epoch": 2.8408091468777483, + "grad_norm": 0.34643289458033594, + "learning_rate": 0.00013678332417827572, + "loss": 3.055863380432129, + "step": 4846, + "token_acc": 0.29233547457779707 + }, + { + "epoch": 2.8413954851949574, + "grad_norm": 0.3653440793059422, + "learning_rate": 0.00013678129086418513, + "loss": 3.0282843112945557, + "step": 4847, + "token_acc": 0.2980623228287288 + }, + { + "epoch": 2.8419818235121665, + "grad_norm": 0.35183779430964246, + "learning_rate": 0.00013677925692277427, + "loss": 3.0988895893096924, + "step": 4848, + "token_acc": 0.28619740750672334 + }, + { + "epoch": 2.8425681618293757, + "grad_norm": 0.31646442279955267, + "learning_rate": 0.00013677722235406234, + "loss": 3.100616693496704, + "step": 4849, + "token_acc": 0.28517103242253267 + }, + { + "epoch": 2.8431545001465848, + "grad_norm": 0.42407306794829913, + "learning_rate": 0.00013677518715806834, + "loss": 3.107865333557129, + "step": 4850, + "token_acc": 0.2847780859916782 + }, + { + "epoch": 2.843740838463794, + "grad_norm": 0.4028516181127952, + "learning_rate": 0.00013677315133481146, + "loss": 3.058948040008545, + "step": 4851, + "token_acc": 0.2926448634340227 + }, + { + "epoch": 2.8443271767810026, + "grad_norm": 0.32515598567762216, + "learning_rate": 0.0001367711148843108, + "loss": 3.0849685668945312, + "step": 4852, + "token_acc": 0.28916697557296794 + }, + { + "epoch": 2.8449135150982117, + "grad_norm": 0.35070076013248097, + "learning_rate": 0.00013676907780658547, + "loss": 3.0353379249572754, + "step": 4853, + "token_acc": 0.29581565660639103 + }, + { + "epoch": 2.8454998534154208, + "grad_norm": 0.3065234840073978, + "learning_rate": 0.00013676704010165465, + "loss": 3.0687308311462402, + "step": 4854, + "token_acc": 0.2897950887355971 + }, + { + "epoch": 2.84608619173263, + "grad_norm": 0.3514951568828879, + "learning_rate": 0.00013676500176953743, + "loss": 3.0938425064086914, + "step": 4855, + "token_acc": 0.28926369241929634 + }, + { + "epoch": 2.8466725300498386, + "grad_norm": 0.32461838431951184, + "learning_rate": 0.000136762962810253, + "loss": 3.059913396835327, + "step": 4856, + "token_acc": 0.29186982264056816 + }, + { + "epoch": 2.8472588683670477, + "grad_norm": 0.2933209698375005, + "learning_rate": 0.0001367609232238205, + "loss": 3.0574121475219727, + "step": 4857, + "token_acc": 0.29203549131914314 + }, + { + "epoch": 2.847845206684257, + "grad_norm": 0.3211857137889328, + "learning_rate": 0.00013675888301025913, + "loss": 3.1005334854125977, + "step": 4858, + "token_acc": 0.28561218598428684 + }, + { + "epoch": 2.848431545001466, + "grad_norm": 0.32804790095073016, + "learning_rate": 0.00013675684216958795, + "loss": 3.0724306106567383, + "step": 4859, + "token_acc": 0.2908846086614517 + }, + { + "epoch": 2.849017883318675, + "grad_norm": 0.30134229366354554, + "learning_rate": 0.00013675480070182624, + "loss": 3.080113410949707, + "step": 4860, + "token_acc": 0.2914634430030319 + }, + { + "epoch": 2.849604221635884, + "grad_norm": 0.2973486162355321, + "learning_rate": 0.00013675275860699308, + "loss": 3.0549845695495605, + "step": 4861, + "token_acc": 0.29094257712293625 + }, + { + "epoch": 2.850190559953093, + "grad_norm": 0.28798675063581863, + "learning_rate": 0.00013675071588510775, + "loss": 3.0893678665161133, + "step": 4862, + "token_acc": 0.2872112296695328 + }, + { + "epoch": 2.850776898270302, + "grad_norm": 0.3886744300605585, + "learning_rate": 0.00013674867253618938, + "loss": 3.073453426361084, + "step": 4863, + "token_acc": 0.29067181663791025 + }, + { + "epoch": 2.851363236587511, + "grad_norm": 0.30142321601121114, + "learning_rate": 0.00013674662856025716, + "loss": 3.022930860519409, + "step": 4864, + "token_acc": 0.2967893400366845 + }, + { + "epoch": 2.85194957490472, + "grad_norm": 0.29869302219091765, + "learning_rate": 0.00013674458395733033, + "loss": 3.054203510284424, + "step": 4865, + "token_acc": 0.2913743630858819 + }, + { + "epoch": 2.852535913221929, + "grad_norm": 0.31447334342393185, + "learning_rate": 0.00013674253872742804, + "loss": 3.069089889526367, + "step": 4866, + "token_acc": 0.292258108397993 + }, + { + "epoch": 2.853122251539138, + "grad_norm": 0.2754222280578462, + "learning_rate": 0.00013674049287056957, + "loss": 3.0077414512634277, + "step": 4867, + "token_acc": 0.29857659987420737 + }, + { + "epoch": 2.853708589856347, + "grad_norm": 0.3473956149775441, + "learning_rate": 0.00013673844638677408, + "loss": 3.095607280731201, + "step": 4868, + "token_acc": 0.28695696590473757 + }, + { + "epoch": 2.854294928173556, + "grad_norm": 0.3047428721590186, + "learning_rate": 0.00013673639927606085, + "loss": 3.0794405937194824, + "step": 4869, + "token_acc": 0.28940673288474816 + }, + { + "epoch": 2.8548812664907652, + "grad_norm": 0.29192429555898564, + "learning_rate": 0.00013673435153844902, + "loss": 3.1294727325439453, + "step": 4870, + "token_acc": 0.28206275374951467 + }, + { + "epoch": 2.8554676048079743, + "grad_norm": 0.2902944213606873, + "learning_rate": 0.00013673230317395792, + "loss": 3.1009063720703125, + "step": 4871, + "token_acc": 0.28620251928522605 + }, + { + "epoch": 2.8560539431251835, + "grad_norm": 0.32487802043584385, + "learning_rate": 0.00013673025418260674, + "loss": 3.044849395751953, + "step": 4872, + "token_acc": 0.2927915496737702 + }, + { + "epoch": 2.856640281442392, + "grad_norm": 0.2959498091724525, + "learning_rate": 0.00013672820456441477, + "loss": 3.068972587585449, + "step": 4873, + "token_acc": 0.29110256057414446 + }, + { + "epoch": 2.8572266197596012, + "grad_norm": 0.31659334286708124, + "learning_rate": 0.00013672615431940122, + "loss": 3.0513830184936523, + "step": 4874, + "token_acc": 0.292033804131898 + }, + { + "epoch": 2.8578129580768104, + "grad_norm": 0.29738089862197364, + "learning_rate": 0.00013672410344758536, + "loss": 3.0619168281555176, + "step": 4875, + "token_acc": 0.2921223311082706 + }, + { + "epoch": 2.8583992963940195, + "grad_norm": 0.3053260952536838, + "learning_rate": 0.00013672205194898646, + "loss": 3.0587351322174072, + "step": 4876, + "token_acc": 0.2917526926558384 + }, + { + "epoch": 2.858985634711228, + "grad_norm": 0.29495805110893697, + "learning_rate": 0.00013671999982362379, + "loss": 3.085895538330078, + "step": 4877, + "token_acc": 0.2874030488082529 + }, + { + "epoch": 2.8595719730284372, + "grad_norm": 0.3392844793731414, + "learning_rate": 0.00013671794707151665, + "loss": 3.0929605960845947, + "step": 4878, + "token_acc": 0.28745235962670435 + }, + { + "epoch": 2.8601583113456464, + "grad_norm": 0.2878131750494053, + "learning_rate": 0.00013671589369268426, + "loss": 3.0608572959899902, + "step": 4879, + "token_acc": 0.29080055980679964 + }, + { + "epoch": 2.8607446496628555, + "grad_norm": 0.3465144308092056, + "learning_rate": 0.000136713839687146, + "loss": 3.0645763874053955, + "step": 4880, + "token_acc": 0.2909594420818415 + }, + { + "epoch": 2.8613309879800646, + "grad_norm": 0.35238255020966314, + "learning_rate": 0.00013671178505492108, + "loss": 3.118668556213379, + "step": 4881, + "token_acc": 0.28325499536295423 + }, + { + "epoch": 2.8619173262972737, + "grad_norm": 0.33777528758421227, + "learning_rate": 0.00013670972979602883, + "loss": 3.0725622177124023, + "step": 4882, + "token_acc": 0.29071840031781765 + }, + { + "epoch": 2.862503664614483, + "grad_norm": 0.32051723051437714, + "learning_rate": 0.0001367076739104886, + "loss": 3.0584611892700195, + "step": 4883, + "token_acc": 0.29346060475107083 + }, + { + "epoch": 2.8630900029316915, + "grad_norm": 0.3177751718458312, + "learning_rate": 0.0001367056173983196, + "loss": 3.0840795040130615, + "step": 4884, + "token_acc": 0.2903938500748538 + }, + { + "epoch": 2.8636763412489006, + "grad_norm": 0.23776306473376105, + "learning_rate": 0.00013670356025954127, + "loss": 3.051553249359131, + "step": 4885, + "token_acc": 0.29301048440135014 + }, + { + "epoch": 2.8642626795661097, + "grad_norm": 0.28116221369243977, + "learning_rate": 0.00013670150249417285, + "loss": 3.075465679168701, + "step": 4886, + "token_acc": 0.2901275130825283 + }, + { + "epoch": 2.864849017883319, + "grad_norm": 0.27369227271637464, + "learning_rate": 0.0001366994441022337, + "loss": 3.039811611175537, + "step": 4887, + "token_acc": 0.29445651846228477 + }, + { + "epoch": 2.8654353562005275, + "grad_norm": 0.2976444854923505, + "learning_rate": 0.00013669738508374315, + "loss": 3.049858570098877, + "step": 4888, + "token_acc": 0.29420746305182854 + }, + { + "epoch": 2.8660216945177366, + "grad_norm": 0.3987067204305948, + "learning_rate": 0.00013669532543872053, + "loss": 3.0503129959106445, + "step": 4889, + "token_acc": 0.2947053987101183 + }, + { + "epoch": 2.8666080328349457, + "grad_norm": 0.3551301770206625, + "learning_rate": 0.00013669326516718523, + "loss": 3.1033287048339844, + "step": 4890, + "token_acc": 0.28611700683009267 + }, + { + "epoch": 2.867194371152155, + "grad_norm": 0.2821887441620424, + "learning_rate": 0.00013669120426915656, + "loss": 3.041935920715332, + "step": 4891, + "token_acc": 0.2951843642957888 + }, + { + "epoch": 2.867780709469364, + "grad_norm": 0.3235498199949132, + "learning_rate": 0.00013668914274465388, + "loss": 3.029634714126587, + "step": 4892, + "token_acc": 0.2962288141147479 + }, + { + "epoch": 2.868367047786573, + "grad_norm": 0.339912022339504, + "learning_rate": 0.0001366870805936966, + "loss": 3.0939648151397705, + "step": 4893, + "token_acc": 0.28822060263355004 + }, + { + "epoch": 2.868953386103782, + "grad_norm": 0.29674087057349996, + "learning_rate": 0.000136685017816304, + "loss": 3.0584259033203125, + "step": 4894, + "token_acc": 0.29103092155987503 + }, + { + "epoch": 2.869539724420991, + "grad_norm": 0.3214814444078491, + "learning_rate": 0.0001366829544124956, + "loss": 3.0968194007873535, + "step": 4895, + "token_acc": 0.2880253810607628 + }, + { + "epoch": 2.8701260627382, + "grad_norm": 0.30636589044520485, + "learning_rate": 0.00013668089038229063, + "loss": 3.0129079818725586, + "step": 4896, + "token_acc": 0.2972527775661454 + }, + { + "epoch": 2.870712401055409, + "grad_norm": 0.33679609271871797, + "learning_rate": 0.0001366788257257086, + "loss": 3.045170307159424, + "step": 4897, + "token_acc": 0.2953995346440339 + }, + { + "epoch": 2.871298739372618, + "grad_norm": 0.3210662201065323, + "learning_rate": 0.0001366767604427688, + "loss": 3.0557684898376465, + "step": 4898, + "token_acc": 0.2918737509547676 + }, + { + "epoch": 2.871885077689827, + "grad_norm": 0.31999383330853426, + "learning_rate": 0.0001366746945334907, + "loss": 3.066033124923706, + "step": 4899, + "token_acc": 0.2908794796523924 + }, + { + "epoch": 2.872471416007036, + "grad_norm": 0.32629072017380617, + "learning_rate": 0.0001366726279978937, + "loss": 3.058563470840454, + "step": 4900, + "token_acc": 0.29125229719086376 + }, + { + "epoch": 2.873057754324245, + "grad_norm": 0.29318065187281434, + "learning_rate": 0.00013667056083599722, + "loss": 3.0452301502227783, + "step": 4901, + "token_acc": 0.2942682269096295 + }, + { + "epoch": 2.873644092641454, + "grad_norm": 0.30219091282295585, + "learning_rate": 0.00013666849304782064, + "loss": 3.0923855304718018, + "step": 4902, + "token_acc": 0.28690012508516977 + }, + { + "epoch": 2.8742304309586633, + "grad_norm": 0.31948389210277356, + "learning_rate": 0.0001366664246333834, + "loss": 3.035770893096924, + "step": 4903, + "token_acc": 0.2943962811862545 + }, + { + "epoch": 2.8748167692758724, + "grad_norm": 0.2970642419305478, + "learning_rate": 0.00013666435559270496, + "loss": 3.090167999267578, + "step": 4904, + "token_acc": 0.2862461673236969 + }, + { + "epoch": 2.875403107593081, + "grad_norm": 0.28649489373498555, + "learning_rate": 0.00013666228592580472, + "loss": 3.0761327743530273, + "step": 4905, + "token_acc": 0.2888026268561477 + }, + { + "epoch": 2.87598944591029, + "grad_norm": 0.26724319208957553, + "learning_rate": 0.00013666021563270213, + "loss": 3.089818000793457, + "step": 4906, + "token_acc": 0.28790439593730566 + }, + { + "epoch": 2.8765757842274993, + "grad_norm": 0.3006492314693666, + "learning_rate": 0.00013665814471341663, + "loss": 3.067349910736084, + "step": 4907, + "token_acc": 0.2917036247057446 + }, + { + "epoch": 2.8771621225447084, + "grad_norm": 0.3354999411612963, + "learning_rate": 0.0001366560731679677, + "loss": 3.043978214263916, + "step": 4908, + "token_acc": 0.295146418388846 + }, + { + "epoch": 2.8777484608619175, + "grad_norm": 0.299872786539488, + "learning_rate": 0.00013665400099637477, + "loss": 3.0540919303894043, + "step": 4909, + "token_acc": 0.2917362374461086 + }, + { + "epoch": 2.878334799179126, + "grad_norm": 0.28126319251624243, + "learning_rate": 0.00013665192819865732, + "loss": 3.0354275703430176, + "step": 4910, + "token_acc": 0.2957064240733482 + }, + { + "epoch": 2.8789211374963353, + "grad_norm": 0.28991948010681506, + "learning_rate": 0.00013664985477483482, + "loss": 3.0851798057556152, + "step": 4911, + "token_acc": 0.28707460440248594 + }, + { + "epoch": 2.8795074758135444, + "grad_norm": 0.2905649602883529, + "learning_rate": 0.00013664778072492673, + "loss": 3.0583860874176025, + "step": 4912, + "token_acc": 0.29255924385310883 + }, + { + "epoch": 2.8800938141307535, + "grad_norm": 0.3587076888473605, + "learning_rate": 0.00013664570604895258, + "loss": 3.069845676422119, + "step": 4913, + "token_acc": 0.290209003838846 + }, + { + "epoch": 2.8806801524479626, + "grad_norm": 0.3228630111132636, + "learning_rate": 0.00013664363074693183, + "loss": 3.0557589530944824, + "step": 4914, + "token_acc": 0.290475705927245 + }, + { + "epoch": 2.8812664907651717, + "grad_norm": 0.27057614203002434, + "learning_rate": 0.00013664155481888393, + "loss": 3.023590087890625, + "step": 4915, + "token_acc": 0.29832397327863686 + }, + { + "epoch": 2.8818528290823804, + "grad_norm": 0.29470217416901895, + "learning_rate": 0.00013663947826482846, + "loss": 3.067108154296875, + "step": 4916, + "token_acc": 0.2915893573051738 + }, + { + "epoch": 2.8824391673995895, + "grad_norm": 0.32574490094099856, + "learning_rate": 0.00013663740108478488, + "loss": 3.0864930152893066, + "step": 4917, + "token_acc": 0.28842733034711554 + }, + { + "epoch": 2.8830255057167986, + "grad_norm": 0.2591103620837644, + "learning_rate": 0.0001366353232787727, + "loss": 3.0933420658111572, + "step": 4918, + "token_acc": 0.2889670838022435 + }, + { + "epoch": 2.8836118440340077, + "grad_norm": 0.290714480522424, + "learning_rate": 0.00013663324484681146, + "loss": 3.0583395957946777, + "step": 4919, + "token_acc": 0.293172069511594 + }, + { + "epoch": 2.8841981823512164, + "grad_norm": 0.2887105568243705, + "learning_rate": 0.00013663116578892066, + "loss": 3.0417308807373047, + "step": 4920, + "token_acc": 0.2936160786961546 + }, + { + "epoch": 2.8847845206684255, + "grad_norm": 0.27926865635544074, + "learning_rate": 0.00013662908610511987, + "loss": 3.0844526290893555, + "step": 4921, + "token_acc": 0.29137075043844174 + }, + { + "epoch": 2.8853708589856346, + "grad_norm": 0.27509805975777296, + "learning_rate": 0.0001366270057954286, + "loss": 3.027043342590332, + "step": 4922, + "token_acc": 0.29774420004676777 + }, + { + "epoch": 2.8859571973028437, + "grad_norm": 0.28623631160017565, + "learning_rate": 0.00013662492485986638, + "loss": 3.061530113220215, + "step": 4923, + "token_acc": 0.29166378618111427 + }, + { + "epoch": 2.886543535620053, + "grad_norm": 0.2524699999124869, + "learning_rate": 0.00013662284329845275, + "loss": 3.049622058868408, + "step": 4924, + "token_acc": 0.2933500525022683 + }, + { + "epoch": 2.887129873937262, + "grad_norm": 0.3062928589230968, + "learning_rate": 0.00013662076111120732, + "loss": 3.0496773719787598, + "step": 4925, + "token_acc": 0.29277194163689085 + }, + { + "epoch": 2.887716212254471, + "grad_norm": 0.3372451273507253, + "learning_rate": 0.00013661867829814958, + "loss": 3.069695472717285, + "step": 4926, + "token_acc": 0.29136025995678005 + }, + { + "epoch": 2.8883025505716797, + "grad_norm": 0.25958844368925366, + "learning_rate": 0.00013661659485929913, + "loss": 3.067404270172119, + "step": 4927, + "token_acc": 0.29189625155630494 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 0.30297281398669845, + "learning_rate": 0.00013661451079467556, + "loss": 3.052337646484375, + "step": 4928, + "token_acc": 0.2939375636715236 + }, + { + "epoch": 2.889475227206098, + "grad_norm": 0.43104540607303826, + "learning_rate": 0.00013661242610429842, + "loss": 3.0546793937683105, + "step": 4929, + "token_acc": 0.2921234251580915 + }, + { + "epoch": 2.890061565523307, + "grad_norm": 0.3374516330558775, + "learning_rate": 0.00013661034078818728, + "loss": 3.0432088375091553, + "step": 4930, + "token_acc": 0.2947013234333822 + }, + { + "epoch": 2.8906479038405157, + "grad_norm": 0.3974203183202355, + "learning_rate": 0.00013660825484636176, + "loss": 3.078315258026123, + "step": 4931, + "token_acc": 0.2899718287293555 + }, + { + "epoch": 2.891234242157725, + "grad_norm": 0.3371193606717164, + "learning_rate": 0.00013660616827884146, + "loss": 3.076338768005371, + "step": 4932, + "token_acc": 0.28867767377845943 + }, + { + "epoch": 2.891820580474934, + "grad_norm": 0.2919640194703603, + "learning_rate": 0.00013660408108564592, + "loss": 3.037797212600708, + "step": 4933, + "token_acc": 0.2957846662642064 + }, + { + "epoch": 2.892406918792143, + "grad_norm": 0.3843348945411666, + "learning_rate": 0.00013660199326679482, + "loss": 3.093290328979492, + "step": 4934, + "token_acc": 0.28917575983668564 + }, + { + "epoch": 2.892993257109352, + "grad_norm": 0.2896491666684457, + "learning_rate": 0.00013659990482230773, + "loss": 3.0653223991394043, + "step": 4935, + "token_acc": 0.29117822052840625 + }, + { + "epoch": 2.8935795954265613, + "grad_norm": 0.3491344872250609, + "learning_rate": 0.0001365978157522043, + "loss": 3.0582046508789062, + "step": 4936, + "token_acc": 0.2936305850619878 + }, + { + "epoch": 2.8941659337437704, + "grad_norm": 0.3452016073242082, + "learning_rate": 0.00013659572605650408, + "loss": 3.097259998321533, + "step": 4937, + "token_acc": 0.28782538172403577 + }, + { + "epoch": 2.894752272060979, + "grad_norm": 0.30267139094015116, + "learning_rate": 0.00013659363573522682, + "loss": 3.0524775981903076, + "step": 4938, + "token_acc": 0.29223276083055627 + }, + { + "epoch": 2.895338610378188, + "grad_norm": 0.34179637618910697, + "learning_rate": 0.00013659154478839203, + "loss": 3.082977056503296, + "step": 4939, + "token_acc": 0.2890645446975004 + }, + { + "epoch": 2.8959249486953973, + "grad_norm": 0.33721198576623446, + "learning_rate": 0.00013658945321601943, + "loss": 3.032006025314331, + "step": 4940, + "token_acc": 0.2984387953184456 + }, + { + "epoch": 2.8965112870126064, + "grad_norm": 0.32688468461162323, + "learning_rate": 0.00013658736101812867, + "loss": 3.044804096221924, + "step": 4941, + "token_acc": 0.29463513276825665 + }, + { + "epoch": 2.897097625329815, + "grad_norm": 0.35081210327030254, + "learning_rate": 0.00013658526819473936, + "loss": 3.0757012367248535, + "step": 4942, + "token_acc": 0.2906472048738915 + }, + { + "epoch": 2.897683963647024, + "grad_norm": 0.3444259396545603, + "learning_rate": 0.00013658317474587116, + "loss": 3.0137124061584473, + "step": 4943, + "token_acc": 0.2986298941464974 + }, + { + "epoch": 2.8982703019642333, + "grad_norm": 0.29109573841778436, + "learning_rate": 0.00013658108067154378, + "loss": 3.073519706726074, + "step": 4944, + "token_acc": 0.2929093126344929 + }, + { + "epoch": 2.8988566402814424, + "grad_norm": 0.3204679143190589, + "learning_rate": 0.00013657898597177687, + "loss": 3.0761518478393555, + "step": 4945, + "token_acc": 0.29018853425407387 + }, + { + "epoch": 2.8994429785986515, + "grad_norm": 0.29703454911732785, + "learning_rate": 0.0001365768906465901, + "loss": 3.067534923553467, + "step": 4946, + "token_acc": 0.29181703217050675 + }, + { + "epoch": 2.9000293169158606, + "grad_norm": 0.30361983711839424, + "learning_rate": 0.00013657479469600316, + "loss": 3.060234546661377, + "step": 4947, + "token_acc": 0.29071343143468587 + }, + { + "epoch": 2.9006156552330697, + "grad_norm": 0.29778000566172885, + "learning_rate": 0.00013657269812003572, + "loss": 3.0399765968322754, + "step": 4948, + "token_acc": 0.2951117630970437 + }, + { + "epoch": 2.9012019935502784, + "grad_norm": 0.35153781734569994, + "learning_rate": 0.0001365706009187075, + "loss": 3.113093376159668, + "step": 4949, + "token_acc": 0.28482262007002707 + }, + { + "epoch": 2.9017883318674875, + "grad_norm": 0.36123649498069826, + "learning_rate": 0.0001365685030920382, + "loss": 3.114863634109497, + "step": 4950, + "token_acc": 0.2857342711053567 + }, + { + "epoch": 2.9023746701846966, + "grad_norm": 0.30646521125901743, + "learning_rate": 0.00013656640464004748, + "loss": 3.091856002807617, + "step": 4951, + "token_acc": 0.2882375903526077 + }, + { + "epoch": 2.9029610085019057, + "grad_norm": 0.28533678018414266, + "learning_rate": 0.0001365643055627551, + "loss": 3.056678295135498, + "step": 4952, + "token_acc": 0.29238449593012955 + }, + { + "epoch": 2.9035473468191144, + "grad_norm": 0.2689144862927939, + "learning_rate": 0.0001365622058601808, + "loss": 3.095412492752075, + "step": 4953, + "token_acc": 0.28751764448347267 + }, + { + "epoch": 2.9041336851363235, + "grad_norm": 0.3458216999362823, + "learning_rate": 0.00013656010553234424, + "loss": 3.029428005218506, + "step": 4954, + "token_acc": 0.29690173815128423 + }, + { + "epoch": 2.9047200234535326, + "grad_norm": 0.30200490007804476, + "learning_rate": 0.0001365580045792652, + "loss": 3.0824837684631348, + "step": 4955, + "token_acc": 0.28756752565328647 + }, + { + "epoch": 2.9053063617707418, + "grad_norm": 0.31702497963085324, + "learning_rate": 0.00013655590300096335, + "loss": 3.088308334350586, + "step": 4956, + "token_acc": 0.28978314365209634 + }, + { + "epoch": 2.905892700087951, + "grad_norm": 0.3233925598151169, + "learning_rate": 0.00013655380079745851, + "loss": 3.075321674346924, + "step": 4957, + "token_acc": 0.28840160050542274 + }, + { + "epoch": 2.90647903840516, + "grad_norm": 0.31153609092363393, + "learning_rate": 0.0001365516979687704, + "loss": 3.082887649536133, + "step": 4958, + "token_acc": 0.2884111481657932 + }, + { + "epoch": 2.9070653767223686, + "grad_norm": 0.2999460371432194, + "learning_rate": 0.00013654959451491874, + "loss": 3.0600123405456543, + "step": 4959, + "token_acc": 0.2920953090333645 + }, + { + "epoch": 2.9076517150395778, + "grad_norm": 0.28269850363610893, + "learning_rate": 0.00013654749043592334, + "loss": 3.0931074619293213, + "step": 4960, + "token_acc": 0.28744616164175324 + }, + { + "epoch": 2.908238053356787, + "grad_norm": 0.3097826456803799, + "learning_rate": 0.00013654538573180393, + "loss": 3.076824426651001, + "step": 4961, + "token_acc": 0.2896529039299678 + }, + { + "epoch": 2.908824391673996, + "grad_norm": 0.3006400490309168, + "learning_rate": 0.00013654328040258032, + "loss": 3.0987045764923096, + "step": 4962, + "token_acc": 0.28655580379944107 + }, + { + "epoch": 2.909410729991205, + "grad_norm": 0.35238033973078, + "learning_rate": 0.0001365411744482722, + "loss": 3.0468530654907227, + "step": 4963, + "token_acc": 0.29413446908699176 + }, + { + "epoch": 2.9099970683084138, + "grad_norm": 0.33259029517116534, + "learning_rate": 0.00013653906786889947, + "loss": 3.056656837463379, + "step": 4964, + "token_acc": 0.2929587859155906 + }, + { + "epoch": 2.910583406625623, + "grad_norm": 0.30081725641777596, + "learning_rate": 0.0001365369606644818, + "loss": 3.06235408782959, + "step": 4965, + "token_acc": 0.2926857627136161 + }, + { + "epoch": 2.911169744942832, + "grad_norm": 0.3031980699462104, + "learning_rate": 0.0001365348528350391, + "loss": 3.0619630813598633, + "step": 4966, + "token_acc": 0.29106779669878646 + }, + { + "epoch": 2.911756083260041, + "grad_norm": 0.27279503839745356, + "learning_rate": 0.00013653274438059108, + "loss": 3.0360772609710693, + "step": 4967, + "token_acc": 0.29285142145306337 + }, + { + "epoch": 2.91234242157725, + "grad_norm": 0.29152359546345913, + "learning_rate": 0.0001365306353011576, + "loss": 3.0718564987182617, + "step": 4968, + "token_acc": 0.289821742560575 + }, + { + "epoch": 2.9129287598944593, + "grad_norm": 0.26080717533141556, + "learning_rate": 0.00013652852559675846, + "loss": 3.084115505218506, + "step": 4969, + "token_acc": 0.28900555570323505 + }, + { + "epoch": 2.913515098211668, + "grad_norm": 0.2595279454232396, + "learning_rate": 0.00013652641526741346, + "loss": 3.028672933578491, + "step": 4970, + "token_acc": 0.2960706272456734 + }, + { + "epoch": 2.914101436528877, + "grad_norm": 0.3197042289023389, + "learning_rate": 0.00013652430431314243, + "loss": 3.0395450592041016, + "step": 4971, + "token_acc": 0.29597407296697764 + }, + { + "epoch": 2.914687774846086, + "grad_norm": 0.3550817028947399, + "learning_rate": 0.0001365221927339652, + "loss": 3.0052781105041504, + "step": 4972, + "token_acc": 0.30047953261726573 + }, + { + "epoch": 2.9152741131632953, + "grad_norm": 0.2979948974986824, + "learning_rate": 0.00013652008052990162, + "loss": 3.0458290576934814, + "step": 4973, + "token_acc": 0.294677615009131 + }, + { + "epoch": 2.915860451480504, + "grad_norm": 0.2959829023131944, + "learning_rate": 0.00013651796770097153, + "loss": 3.0879921913146973, + "step": 4974, + "token_acc": 0.2892638061152421 + }, + { + "epoch": 2.916446789797713, + "grad_norm": 0.33111155613564697, + "learning_rate": 0.00013651585424719474, + "loss": 3.0816097259521484, + "step": 4975, + "token_acc": 0.2889419149018947 + }, + { + "epoch": 2.917033128114922, + "grad_norm": 0.29407124018271347, + "learning_rate": 0.00013651374016859113, + "loss": 3.0749902725219727, + "step": 4976, + "token_acc": 0.2887900985097247 + }, + { + "epoch": 2.9176194664321313, + "grad_norm": 0.3175756157058336, + "learning_rate": 0.00013651162546518057, + "loss": 3.0616040229797363, + "step": 4977, + "token_acc": 0.2918483890674387 + }, + { + "epoch": 2.9182058047493404, + "grad_norm": 0.3031476481144282, + "learning_rate": 0.0001365095101369829, + "loss": 3.000711679458618, + "step": 4978, + "token_acc": 0.3012770632123404 + }, + { + "epoch": 2.9187921430665495, + "grad_norm": 0.27422554873944843, + "learning_rate": 0.00013650739418401804, + "loss": 3.043954610824585, + "step": 4979, + "token_acc": 0.29291409416817044 + }, + { + "epoch": 2.9193784813837587, + "grad_norm": 0.31289846560174794, + "learning_rate": 0.00013650527760630582, + "loss": 3.033825159072876, + "step": 4980, + "token_acc": 0.2956345008915349 + }, + { + "epoch": 2.9199648197009673, + "grad_norm": 0.29490525963999986, + "learning_rate": 0.00013650316040386614, + "loss": 3.0716962814331055, + "step": 4981, + "token_acc": 0.2919611036008357 + }, + { + "epoch": 2.9205511580181764, + "grad_norm": 0.30447082446066964, + "learning_rate": 0.00013650104257671887, + "loss": 3.053532838821411, + "step": 4982, + "token_acc": 0.29195695845443664 + }, + { + "epoch": 2.9211374963353856, + "grad_norm": 0.3487472529583517, + "learning_rate": 0.0001364989241248839, + "loss": 3.0250887870788574, + "step": 4983, + "token_acc": 0.296724289846936 + }, + { + "epoch": 2.9217238346525947, + "grad_norm": 0.32301375801029886, + "learning_rate": 0.00013649680504838118, + "loss": 3.0632290840148926, + "step": 4984, + "token_acc": 0.29146726771950976 + }, + { + "epoch": 2.9223101729698033, + "grad_norm": 0.30617853219810576, + "learning_rate": 0.00013649468534723054, + "loss": 3.06593656539917, + "step": 4985, + "token_acc": 0.29225025787095454 + }, + { + "epoch": 2.9228965112870124, + "grad_norm": 0.27267094658589364, + "learning_rate": 0.00013649256502145198, + "loss": 3.078068971633911, + "step": 4986, + "token_acc": 0.28980316817802093 + }, + { + "epoch": 2.9234828496042216, + "grad_norm": 0.31227488685094434, + "learning_rate": 0.00013649044407106534, + "loss": 3.0980372428894043, + "step": 4987, + "token_acc": 0.2860151905038599 + }, + { + "epoch": 2.9240691879214307, + "grad_norm": 0.32659256671658804, + "learning_rate": 0.00013648832249609058, + "loss": 3.0618433952331543, + "step": 4988, + "token_acc": 0.2912424005570035 + }, + { + "epoch": 2.92465552623864, + "grad_norm": 0.3047222930474552, + "learning_rate": 0.00013648620029654764, + "loss": 3.0806467533111572, + "step": 4989, + "token_acc": 0.2891033299474219 + }, + { + "epoch": 2.925241864555849, + "grad_norm": 0.2525980342334652, + "learning_rate": 0.00013648407747245643, + "loss": 3.07081937789917, + "step": 4990, + "token_acc": 0.28869326121479266 + }, + { + "epoch": 2.925828202873058, + "grad_norm": 0.27457147916556734, + "learning_rate": 0.00013648195402383688, + "loss": 3.0556640625, + "step": 4991, + "token_acc": 0.2948093588434121 + }, + { + "epoch": 2.9264145411902667, + "grad_norm": 0.27625396222602666, + "learning_rate": 0.000136479829950709, + "loss": 3.088117837905884, + "step": 4992, + "token_acc": 0.28773491620033925 + }, + { + "epoch": 2.927000879507476, + "grad_norm": 0.2676467427143431, + "learning_rate": 0.00013647770525309266, + "loss": 3.0141892433166504, + "step": 4993, + "token_acc": 0.2977599080987938 + }, + { + "epoch": 2.927587217824685, + "grad_norm": 0.30903482405417276, + "learning_rate": 0.00013647557993100786, + "loss": 3.0457448959350586, + "step": 4994, + "token_acc": 0.2927699411056864 + }, + { + "epoch": 2.928173556141894, + "grad_norm": 0.3985688906389621, + "learning_rate": 0.0001364734539844746, + "loss": 3.074188709259033, + "step": 4995, + "token_acc": 0.2897756101993987 + }, + { + "epoch": 2.9287598944591027, + "grad_norm": 0.42465521830582476, + "learning_rate": 0.00013647132741351277, + "loss": 3.061387777328491, + "step": 4996, + "token_acc": 0.2928401556920112 + }, + { + "epoch": 2.929346232776312, + "grad_norm": 0.32635835412107805, + "learning_rate": 0.00013646920021814242, + "loss": 3.0074830055236816, + "step": 4997, + "token_acc": 0.29969890187743536 + }, + { + "epoch": 2.929932571093521, + "grad_norm": 0.3503013173099579, + "learning_rate": 0.0001364670723983835, + "loss": 3.0533993244171143, + "step": 4998, + "token_acc": 0.2937325353436734 + }, + { + "epoch": 2.93051890941073, + "grad_norm": 0.36676561424188475, + "learning_rate": 0.00013646494395425597, + "loss": 3.068729877471924, + "step": 4999, + "token_acc": 0.2899127128476501 + }, + { + "epoch": 2.931105247727939, + "grad_norm": 0.29829901745341364, + "learning_rate": 0.00013646281488577993, + "loss": 3.048556327819824, + "step": 5000, + "token_acc": 0.2922724285919629 + }, + { + "epoch": 2.9316915860451482, + "grad_norm": 0.4084962256189079, + "learning_rate": 0.00013646068519297523, + "loss": 3.0652198791503906, + "step": 5001, + "token_acc": 0.29210373984971355 + }, + { + "epoch": 2.9322779243623573, + "grad_norm": 0.2683981941247538, + "learning_rate": 0.00013645855487586197, + "loss": 3.0624914169311523, + "step": 5002, + "token_acc": 0.2918852875216858 + }, + { + "epoch": 2.932864262679566, + "grad_norm": 0.35479440872004936, + "learning_rate": 0.00013645642393446015, + "loss": 3.0860254764556885, + "step": 5003, + "token_acc": 0.2869742572074307 + }, + { + "epoch": 2.933450600996775, + "grad_norm": 0.2812546149525364, + "learning_rate": 0.00013645429236878976, + "loss": 3.062725782394409, + "step": 5004, + "token_acc": 0.2911306561499056 + }, + { + "epoch": 2.9340369393139842, + "grad_norm": 0.32737679595888164, + "learning_rate": 0.00013645216017887086, + "loss": 3.062804698944092, + "step": 5005, + "token_acc": 0.29160920814205554 + }, + { + "epoch": 2.9346232776311933, + "grad_norm": 0.2765999510941062, + "learning_rate": 0.00013645002736472348, + "loss": 3.048184394836426, + "step": 5006, + "token_acc": 0.2951470122873116 + }, + { + "epoch": 2.935209615948402, + "grad_norm": 0.3236615863564798, + "learning_rate": 0.0001364478939263676, + "loss": 3.106018543243408, + "step": 5007, + "token_acc": 0.2862875321613709 + }, + { + "epoch": 2.935795954265611, + "grad_norm": 0.28419752540242177, + "learning_rate": 0.0001364457598638233, + "loss": 3.0738160610198975, + "step": 5008, + "token_acc": 0.2903925437104565 + }, + { + "epoch": 2.9363822925828202, + "grad_norm": 0.27126176401977553, + "learning_rate": 0.00013644362517711064, + "loss": 3.1003026962280273, + "step": 5009, + "token_acc": 0.2854216246390863 + }, + { + "epoch": 2.9369686309000294, + "grad_norm": 0.286879285616511, + "learning_rate": 0.00013644148986624965, + "loss": 3.063784599304199, + "step": 5010, + "token_acc": 0.2907265043481954 + }, + { + "epoch": 2.9375549692172385, + "grad_norm": 0.2753165379548571, + "learning_rate": 0.00013643935393126036, + "loss": 3.0766971111297607, + "step": 5011, + "token_acc": 0.2903539789969973 + }, + { + "epoch": 2.9381413075344476, + "grad_norm": 0.28601517825402545, + "learning_rate": 0.0001364372173721629, + "loss": 3.0568909645080566, + "step": 5012, + "token_acc": 0.2920344557068563 + }, + { + "epoch": 2.9387276458516562, + "grad_norm": 0.2716559850233657, + "learning_rate": 0.0001364350801889773, + "loss": 3.065601348876953, + "step": 5013, + "token_acc": 0.291685332388761 + }, + { + "epoch": 2.9393139841688654, + "grad_norm": 0.27357823747528187, + "learning_rate": 0.00013643294238172365, + "loss": 3.070812702178955, + "step": 5014, + "token_acc": 0.2908424085733901 + }, + { + "epoch": 2.9399003224860745, + "grad_norm": 0.2818159504292812, + "learning_rate": 0.00013643080395042204, + "loss": 3.071913242340088, + "step": 5015, + "token_acc": 0.2898937223579114 + }, + { + "epoch": 2.9404866608032836, + "grad_norm": 0.26146195808747574, + "learning_rate": 0.0001364286648950925, + "loss": 3.098595142364502, + "step": 5016, + "token_acc": 0.28553850386720064 + }, + { + "epoch": 2.9410729991204922, + "grad_norm": 0.2946662753083827, + "learning_rate": 0.0001364265252157552, + "loss": 3.0257067680358887, + "step": 5017, + "token_acc": 0.29813498705276065 + }, + { + "epoch": 2.9416593374377014, + "grad_norm": 0.27885021576765145, + "learning_rate": 0.0001364243849124302, + "loss": 3.04622483253479, + "step": 5018, + "token_acc": 0.29280674282245567 + }, + { + "epoch": 2.9422456757549105, + "grad_norm": 0.29761512837386006, + "learning_rate": 0.00013642224398513762, + "loss": 3.053049087524414, + "step": 5019, + "token_acc": 0.2926123010177535 + }, + { + "epoch": 2.9428320140721196, + "grad_norm": 0.3021773274975337, + "learning_rate": 0.00013642010243389754, + "loss": 3.0786664485931396, + "step": 5020, + "token_acc": 0.28989505891194073 + }, + { + "epoch": 2.9434183523893287, + "grad_norm": 0.2705874057321925, + "learning_rate": 0.00013641796025873012, + "loss": 3.0685856342315674, + "step": 5021, + "token_acc": 0.29262287688168903 + }, + { + "epoch": 2.944004690706538, + "grad_norm": 0.3070103263924474, + "learning_rate": 0.00013641581745965547, + "loss": 3.0478734970092773, + "step": 5022, + "token_acc": 0.29365975446854525 + }, + { + "epoch": 2.944591029023747, + "grad_norm": 0.37875277964159665, + "learning_rate": 0.0001364136740366937, + "loss": 3.0837907791137695, + "step": 5023, + "token_acc": 0.2899619508462835 + }, + { + "epoch": 2.9451773673409556, + "grad_norm": 0.36665526894275835, + "learning_rate": 0.00013641152998986498, + "loss": 3.0615291595458984, + "step": 5024, + "token_acc": 0.2937001998667555 + }, + { + "epoch": 2.9457637056581647, + "grad_norm": 0.3017481957160068, + "learning_rate": 0.00013640938531918938, + "loss": 3.064027786254883, + "step": 5025, + "token_acc": 0.2900226635972661 + }, + { + "epoch": 2.946350043975374, + "grad_norm": 0.30776079721288263, + "learning_rate": 0.00013640724002468712, + "loss": 3.084296226501465, + "step": 5026, + "token_acc": 0.28989610589298825 + }, + { + "epoch": 2.946936382292583, + "grad_norm": 0.3426569558191926, + "learning_rate": 0.00013640509410637832, + "loss": 3.14373779296875, + "step": 5027, + "token_acc": 0.2802171615657183 + }, + { + "epoch": 2.9475227206097916, + "grad_norm": 0.3497024310859073, + "learning_rate": 0.00013640294756428315, + "loss": 3.0802712440490723, + "step": 5028, + "token_acc": 0.28970475603497853 + }, + { + "epoch": 2.9481090589270007, + "grad_norm": 0.2544327623191903, + "learning_rate": 0.00013640080039842173, + "loss": 3.036038875579834, + "step": 5029, + "token_acc": 0.2946394998145829 + }, + { + "epoch": 2.94869539724421, + "grad_norm": 0.2908310395349893, + "learning_rate": 0.00013639865260881432, + "loss": 3.0567779541015625, + "step": 5030, + "token_acc": 0.2916891470161964 + }, + { + "epoch": 2.949281735561419, + "grad_norm": 0.2815486021729765, + "learning_rate": 0.00013639650419548102, + "loss": 3.0425474643707275, + "step": 5031, + "token_acc": 0.2939626458094532 + }, + { + "epoch": 2.949868073878628, + "grad_norm": 0.2721678776391117, + "learning_rate": 0.000136394355158442, + "loss": 3.0598974227905273, + "step": 5032, + "token_acc": 0.2910339072242214 + }, + { + "epoch": 2.950454412195837, + "grad_norm": 0.3416587668451502, + "learning_rate": 0.00013639220549771752, + "loss": 3.1210145950317383, + "step": 5033, + "token_acc": 0.2849699528506206 + }, + { + "epoch": 2.9510407505130463, + "grad_norm": 0.2936797079149699, + "learning_rate": 0.00013639005521332774, + "loss": 3.038461446762085, + "step": 5034, + "token_acc": 0.2947490679211061 + }, + { + "epoch": 2.951627088830255, + "grad_norm": 0.30741288119880117, + "learning_rate": 0.00013638790430529283, + "loss": 3.07912278175354, + "step": 5035, + "token_acc": 0.29012272992458504 + }, + { + "epoch": 2.952213427147464, + "grad_norm": 0.3109164145975344, + "learning_rate": 0.00013638575277363302, + "loss": 3.063770294189453, + "step": 5036, + "token_acc": 0.2908101915167029 + }, + { + "epoch": 2.952799765464673, + "grad_norm": 0.28501500180686534, + "learning_rate": 0.00013638360061836853, + "loss": 3.073878526687622, + "step": 5037, + "token_acc": 0.28878080879641055 + }, + { + "epoch": 2.9533861037818823, + "grad_norm": 0.2901779524179545, + "learning_rate": 0.00013638144783951957, + "loss": 3.036140203475952, + "step": 5038, + "token_acc": 0.29560738035028566 + }, + { + "epoch": 2.953972442099091, + "grad_norm": 0.3060192026031789, + "learning_rate": 0.00013637929443710635, + "loss": 3.062993049621582, + "step": 5039, + "token_acc": 0.2928577222264377 + }, + { + "epoch": 2.9545587804163, + "grad_norm": 0.357701489648701, + "learning_rate": 0.0001363771404111491, + "loss": 3.075578212738037, + "step": 5040, + "token_acc": 0.29103592188044775 + }, + { + "epoch": 2.955145118733509, + "grad_norm": 0.3167473843322589, + "learning_rate": 0.00013637498576166805, + "loss": 3.1182470321655273, + "step": 5041, + "token_acc": 0.2842987764663673 + }, + { + "epoch": 2.9557314570507183, + "grad_norm": 0.3354989515501741, + "learning_rate": 0.00013637283048868347, + "loss": 3.088789939880371, + "step": 5042, + "token_acc": 0.28890104894057816 + }, + { + "epoch": 2.9563177953679274, + "grad_norm": 0.3013429000464083, + "learning_rate": 0.00013637067459221558, + "loss": 3.06396222114563, + "step": 5043, + "token_acc": 0.2912091597696642 + }, + { + "epoch": 2.9569041336851365, + "grad_norm": 0.2816527102572718, + "learning_rate": 0.00013636851807228466, + "loss": 3.0484938621520996, + "step": 5044, + "token_acc": 0.2933572271378747 + }, + { + "epoch": 2.9574904720023456, + "grad_norm": 0.2814280883159842, + "learning_rate": 0.0001363663609289109, + "loss": 3.0736541748046875, + "step": 5045, + "token_acc": 0.2888682942329079 + }, + { + "epoch": 2.9580768103195543, + "grad_norm": 0.32474160527248996, + "learning_rate": 0.00013636420316211464, + "loss": 3.0716938972473145, + "step": 5046, + "token_acc": 0.29186426776306795 + }, + { + "epoch": 2.9586631486367634, + "grad_norm": 0.31426110042074556, + "learning_rate": 0.0001363620447719161, + "loss": 3.050072193145752, + "step": 5047, + "token_acc": 0.29182473719266744 + }, + { + "epoch": 2.9592494869539725, + "grad_norm": 0.3249605605291814, + "learning_rate": 0.0001363598857583356, + "loss": 3.100602149963379, + "step": 5048, + "token_acc": 0.28758258237972356 + }, + { + "epoch": 2.9598358252711816, + "grad_norm": 0.366975913555725, + "learning_rate": 0.00013635772612139338, + "loss": 3.0870258808135986, + "step": 5049, + "token_acc": 0.2885171062527213 + }, + { + "epoch": 2.9604221635883903, + "grad_norm": 0.41727694137482224, + "learning_rate": 0.00013635556586110974, + "loss": 3.0418734550476074, + "step": 5050, + "token_acc": 0.29484417904081467 + }, + { + "epoch": 2.9610085019055994, + "grad_norm": 0.4144930196425431, + "learning_rate": 0.00013635340497750495, + "loss": 3.069295644760132, + "step": 5051, + "token_acc": 0.2916479224190181 + }, + { + "epoch": 2.9615948402228085, + "grad_norm": 0.3271879148712174, + "learning_rate": 0.0001363512434705994, + "loss": 3.080421209335327, + "step": 5052, + "token_acc": 0.2898569297846176 + }, + { + "epoch": 2.9621811785400176, + "grad_norm": 0.29324398731142676, + "learning_rate": 0.00013634908134041326, + "loss": 3.0160815715789795, + "step": 5053, + "token_acc": 0.2982172231123813 + }, + { + "epoch": 2.9627675168572267, + "grad_norm": 0.3258940986720766, + "learning_rate": 0.00013634691858696693, + "loss": 3.056901454925537, + "step": 5054, + "token_acc": 0.29254112998895687 + }, + { + "epoch": 2.963353855174436, + "grad_norm": 0.2907213764140246, + "learning_rate": 0.0001363447552102807, + "loss": 3.0317893028259277, + "step": 5055, + "token_acc": 0.2964130852770742 + }, + { + "epoch": 2.963940193491645, + "grad_norm": 0.3062349911409993, + "learning_rate": 0.0001363425912103749, + "loss": 3.0666451454162598, + "step": 5056, + "token_acc": 0.2907197114429381 + }, + { + "epoch": 2.9645265318088536, + "grad_norm": 0.32636203550387194, + "learning_rate": 0.00013634042658726983, + "loss": 3.042525053024292, + "step": 5057, + "token_acc": 0.29569820860985097 + }, + { + "epoch": 2.9651128701260627, + "grad_norm": 0.24322285236328897, + "learning_rate": 0.0001363382613409859, + "loss": 3.020813226699829, + "step": 5058, + "token_acc": 0.2984867833293795 + }, + { + "epoch": 2.965699208443272, + "grad_norm": 0.36182778387954334, + "learning_rate": 0.00013633609547154335, + "loss": 3.032301425933838, + "step": 5059, + "token_acc": 0.29459446299566133 + }, + { + "epoch": 2.966285546760481, + "grad_norm": 0.2777151310650473, + "learning_rate": 0.00013633392897896261, + "loss": 3.117042064666748, + "step": 5060, + "token_acc": 0.2839379492522548 + }, + { + "epoch": 2.9668718850776896, + "grad_norm": 0.34344179944575226, + "learning_rate": 0.00013633176186326394, + "loss": 3.066194534301758, + "step": 5061, + "token_acc": 0.29041129910067953 + }, + { + "epoch": 2.9674582233948987, + "grad_norm": 0.32159209078980083, + "learning_rate": 0.0001363295941244678, + "loss": 3.0764331817626953, + "step": 5062, + "token_acc": 0.2882871605498486 + }, + { + "epoch": 2.968044561712108, + "grad_norm": 0.3116400205960304, + "learning_rate": 0.0001363274257625945, + "loss": 3.028153896331787, + "step": 5063, + "token_acc": 0.29643538311459333 + }, + { + "epoch": 2.968630900029317, + "grad_norm": 0.34566690995484034, + "learning_rate": 0.0001363252567776644, + "loss": 3.0133235454559326, + "step": 5064, + "token_acc": 0.2977909531796492 + }, + { + "epoch": 2.969217238346526, + "grad_norm": 0.32993049613609116, + "learning_rate": 0.00013632308716969785, + "loss": 3.0487701892852783, + "step": 5065, + "token_acc": 0.29457125885697316 + }, + { + "epoch": 2.969803576663735, + "grad_norm": 0.3331137804136902, + "learning_rate": 0.00013632091693871533, + "loss": 3.0613999366760254, + "step": 5066, + "token_acc": 0.2929213423389885 + }, + { + "epoch": 2.970389914980944, + "grad_norm": 0.32530746947130224, + "learning_rate": 0.00013631874608473711, + "loss": 3.0511653423309326, + "step": 5067, + "token_acc": 0.2927549490483088 + }, + { + "epoch": 2.970976253298153, + "grad_norm": 0.32281330716448636, + "learning_rate": 0.00013631657460778368, + "loss": 3.002755641937256, + "step": 5068, + "token_acc": 0.2995957872581301 + }, + { + "epoch": 2.971562591615362, + "grad_norm": 0.3094091551811668, + "learning_rate": 0.00013631440250787537, + "loss": 3.0356221199035645, + "step": 5069, + "token_acc": 0.2964823986294157 + }, + { + "epoch": 2.972148929932571, + "grad_norm": 0.3695372309072402, + "learning_rate": 0.00013631222978503261, + "loss": 3.095733880996704, + "step": 5070, + "token_acc": 0.28617151665337537 + }, + { + "epoch": 2.97273526824978, + "grad_norm": 0.3602856961512266, + "learning_rate": 0.00013631005643927583, + "loss": 3.0674257278442383, + "step": 5071, + "token_acc": 0.290475487102362 + }, + { + "epoch": 2.973321606566989, + "grad_norm": 0.2851985176519805, + "learning_rate": 0.0001363078824706254, + "loss": 3.0630340576171875, + "step": 5072, + "token_acc": 0.29165830335407433 + }, + { + "epoch": 2.973907944884198, + "grad_norm": 0.3852149031223918, + "learning_rate": 0.00013630570787910177, + "loss": 3.0783982276916504, + "step": 5073, + "token_acc": 0.28978509373571104 + }, + { + "epoch": 2.974494283201407, + "grad_norm": 0.350012734993071, + "learning_rate": 0.00013630353266472537, + "loss": 3.0373401641845703, + "step": 5074, + "token_acc": 0.29495274345058065 + }, + { + "epoch": 2.9750806215186163, + "grad_norm": 0.3762222832975856, + "learning_rate": 0.0001363013568275166, + "loss": 3.0902204513549805, + "step": 5075, + "token_acc": 0.2880220289568883 + }, + { + "epoch": 2.9756669598358254, + "grad_norm": 0.3618663933291691, + "learning_rate": 0.00013629918036749597, + "loss": 3.0512664318084717, + "step": 5076, + "token_acc": 0.29483844468784226 + }, + { + "epoch": 2.9762532981530345, + "grad_norm": 0.3208166237796758, + "learning_rate": 0.00013629700328468384, + "loss": 3.0551252365112305, + "step": 5077, + "token_acc": 0.2912604052022136 + }, + { + "epoch": 2.976839636470243, + "grad_norm": 0.43641435860404487, + "learning_rate": 0.0001362948255791007, + "loss": 3.026806116104126, + "step": 5078, + "token_acc": 0.29833368797959336 + }, + { + "epoch": 2.9774259747874523, + "grad_norm": 0.3033146370799069, + "learning_rate": 0.00013629264725076705, + "loss": 3.018017053604126, + "step": 5079, + "token_acc": 0.2990045077902382 + }, + { + "epoch": 2.9780123131046614, + "grad_norm": 0.3334356384644916, + "learning_rate": 0.00013629046829970328, + "loss": 3.057142734527588, + "step": 5080, + "token_acc": 0.2933613716570025 + }, + { + "epoch": 2.9785986514218705, + "grad_norm": 0.29101575068654045, + "learning_rate": 0.0001362882887259299, + "loss": 3.073469638824463, + "step": 5081, + "token_acc": 0.28909883489688015 + }, + { + "epoch": 2.979184989739079, + "grad_norm": 0.3522886872369336, + "learning_rate": 0.00013628610852946734, + "loss": 3.0624747276306152, + "step": 5082, + "token_acc": 0.2904996813052025 + }, + { + "epoch": 2.9797713280562883, + "grad_norm": 0.2732145934840157, + "learning_rate": 0.00013628392771033616, + "loss": 3.112614393234253, + "step": 5083, + "token_acc": 0.2859487957398735 + }, + { + "epoch": 2.9803576663734974, + "grad_norm": 0.3415988055820798, + "learning_rate": 0.00013628174626855675, + "loss": 3.0622897148132324, + "step": 5084, + "token_acc": 0.2929668926354114 + }, + { + "epoch": 2.9809440046907065, + "grad_norm": 0.2716813689985832, + "learning_rate": 0.00013627956420414968, + "loss": 3.061203956604004, + "step": 5085, + "token_acc": 0.2932223227110709 + }, + { + "epoch": 2.9815303430079156, + "grad_norm": 0.3284529104383866, + "learning_rate": 0.0001362773815171354, + "loss": 3.0636706352233887, + "step": 5086, + "token_acc": 0.2907071972251662 + }, + { + "epoch": 2.9821166813251248, + "grad_norm": 0.29128177793991056, + "learning_rate": 0.00013627519820753444, + "loss": 3.0941479206085205, + "step": 5087, + "token_acc": 0.2850295259881896 + }, + { + "epoch": 2.982703019642334, + "grad_norm": 0.3044754642911704, + "learning_rate": 0.0001362730142753673, + "loss": 3.104616641998291, + "step": 5088, + "token_acc": 0.2848116422584508 + }, + { + "epoch": 2.9832893579595425, + "grad_norm": 0.28446607656173484, + "learning_rate": 0.00013627082972065448, + "loss": 3.0720901489257812, + "step": 5089, + "token_acc": 0.29115595287308804 + }, + { + "epoch": 2.9838756962767516, + "grad_norm": 0.27921581035105186, + "learning_rate": 0.00013626864454341654, + "loss": 3.0732390880584717, + "step": 5090, + "token_acc": 0.29101294423210927 + }, + { + "epoch": 2.9844620345939608, + "grad_norm": 0.3108962166570425, + "learning_rate": 0.000136266458743674, + "loss": 3.0294482707977295, + "step": 5091, + "token_acc": 0.2964264430881779 + }, + { + "epoch": 2.98504837291117, + "grad_norm": 0.288786195872828, + "learning_rate": 0.00013626427232144733, + "loss": 3.0725369453430176, + "step": 5092, + "token_acc": 0.2897948169805191 + }, + { + "epoch": 2.9856347112283785, + "grad_norm": 0.2899785032171189, + "learning_rate": 0.00013626208527675712, + "loss": 3.045457124710083, + "step": 5093, + "token_acc": 0.293720648570496 + }, + { + "epoch": 2.9862210495455876, + "grad_norm": 0.2838222308098357, + "learning_rate": 0.00013625989760962393, + "loss": 3.057738780975342, + "step": 5094, + "token_acc": 0.2919088696727345 + }, + { + "epoch": 2.9868073878627968, + "grad_norm": 0.26958868885718185, + "learning_rate": 0.00013625770932006826, + "loss": 3.005828380584717, + "step": 5095, + "token_acc": 0.2989685741201478 + }, + { + "epoch": 2.987393726180006, + "grad_norm": 0.29840575465118574, + "learning_rate": 0.0001362555204081107, + "loss": 3.038477659225464, + "step": 5096, + "token_acc": 0.29393617439892966 + }, + { + "epoch": 2.987980064497215, + "grad_norm": 0.30503549006573216, + "learning_rate": 0.00013625333087377185, + "loss": 3.0915145874023438, + "step": 5097, + "token_acc": 0.28663287457910286 + }, + { + "epoch": 2.988566402814424, + "grad_norm": 0.2655160888565439, + "learning_rate": 0.00013625114071707218, + "loss": 3.0453274250030518, + "step": 5098, + "token_acc": 0.2936305616858615 + }, + { + "epoch": 2.989152741131633, + "grad_norm": 0.3019952983141854, + "learning_rate": 0.00013624894993803234, + "loss": 3.06643009185791, + "step": 5099, + "token_acc": 0.29185535210343244 + }, + { + "epoch": 2.989739079448842, + "grad_norm": 0.2782010071089016, + "learning_rate": 0.00013624675853667292, + "loss": 3.02945876121521, + "step": 5100, + "token_acc": 0.2955740625469531 + }, + { + "epoch": 2.990325417766051, + "grad_norm": 0.29022997425637564, + "learning_rate": 0.00013624456651301442, + "loss": 3.0640146732330322, + "step": 5101, + "token_acc": 0.2920845729410355 + }, + { + "epoch": 2.99091175608326, + "grad_norm": 0.352831237249997, + "learning_rate": 0.0001362423738670775, + "loss": 3.07149600982666, + "step": 5102, + "token_acc": 0.2899489746035689 + }, + { + "epoch": 2.991498094400469, + "grad_norm": 0.2916538850319864, + "learning_rate": 0.00013624018059888276, + "loss": 3.060878038406372, + "step": 5103, + "token_acc": 0.2937006948075831 + }, + { + "epoch": 2.992084432717678, + "grad_norm": 0.29713505292256487, + "learning_rate": 0.00013623798670845075, + "loss": 3.0834810733795166, + "step": 5104, + "token_acc": 0.28720010307950006 + }, + { + "epoch": 2.992670771034887, + "grad_norm": 0.2957345732471271, + "learning_rate": 0.00013623579219580213, + "loss": 3.081881046295166, + "step": 5105, + "token_acc": 0.2890877467836801 + }, + { + "epoch": 2.993257109352096, + "grad_norm": 0.30576726584363506, + "learning_rate": 0.00013623359706095749, + "loss": 3.0930802822113037, + "step": 5106, + "token_acc": 0.2872140427958852 + }, + { + "epoch": 2.993843447669305, + "grad_norm": 0.3338994892191529, + "learning_rate": 0.00013623140130393746, + "loss": 3.0544955730438232, + "step": 5107, + "token_acc": 0.29395764882517916 + }, + { + "epoch": 2.9944297859865143, + "grad_norm": 0.29969365244058976, + "learning_rate": 0.00013622920492476265, + "loss": 3.064077377319336, + "step": 5108, + "token_acc": 0.29168774554652604 + }, + { + "epoch": 2.9950161243037234, + "grad_norm": 0.4039136757450711, + "learning_rate": 0.00013622700792345372, + "loss": 3.0626978874206543, + "step": 5109, + "token_acc": 0.2921200832216812 + }, + { + "epoch": 2.9956024626209325, + "grad_norm": 0.3041044296242887, + "learning_rate": 0.00013622481030003129, + "loss": 3.0699901580810547, + "step": 5110, + "token_acc": 0.29167546118451604 + }, + { + "epoch": 2.996188800938141, + "grad_norm": 0.3376620845892908, + "learning_rate": 0.000136222612054516, + "loss": 3.0865955352783203, + "step": 5111, + "token_acc": 0.2881660006770968 + }, + { + "epoch": 2.9967751392553503, + "grad_norm": 0.3480892511274235, + "learning_rate": 0.00013622041318692854, + "loss": 3.0646862983703613, + "step": 5112, + "token_acc": 0.29179754035597644 + }, + { + "epoch": 2.9973614775725594, + "grad_norm": 0.35544446660291185, + "learning_rate": 0.00013621821369728948, + "loss": 3.059173822402954, + "step": 5113, + "token_acc": 0.29277986310443804 + }, + { + "epoch": 2.9979478158897686, + "grad_norm": 0.33960109007722494, + "learning_rate": 0.00013621601358561954, + "loss": 3.029160261154175, + "step": 5114, + "token_acc": 0.2970972777578973 + }, + { + "epoch": 2.998534154206977, + "grad_norm": 0.3080845228092365, + "learning_rate": 0.00013621381285193942, + "loss": 3.107654094696045, + "step": 5115, + "token_acc": 0.28583785734375583 + }, + { + "epoch": 2.9991204925241863, + "grad_norm": 0.3503238901929032, + "learning_rate": 0.00013621161149626973, + "loss": 2.9973530769348145, + "step": 5116, + "token_acc": 0.30067839634713667 + }, + { + "epoch": 2.9997068308413954, + "grad_norm": 0.3222596217633866, + "learning_rate": 0.00013620940951863115, + "loss": 3.0677568912506104, + "step": 5117, + "token_acc": 0.28862210858535414 + }, + { + "epoch": 3.0, + "grad_norm": 0.3686385325390872, + "learning_rate": 0.0001362072069190444, + "loss": 3.057602882385254, + "step": 5118, + "token_acc": 0.2944489963129865 + }, + { + "epoch": 3.0, + "eval_loss": 3.0860543251037598, + "eval_runtime": 22.0388, + "eval_samples_per_second": 11.616, + "eval_steps_per_second": 1.452, + "eval_token_acc": 0.2885165749617454, + "step": 5118 + } + ], + "logging_steps": 1, + "max_steps": 34120, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": -34120, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5760517546672128.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}