diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -2,9 +2,9 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.8991458114790949, + "epoch": 0.9990509016434387, "eval_steps": 500, - "global_step": 9000, + "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -81008,6 +81008,9006 @@ "mean_token_accuracy": 0.9103310108184814, "num_tokens": 163039548.0, "step": 9000 + }, + { + "epoch": 0.8992457165692592, + "grad_norm": 0.41466863845944607, + "learning_rate": 2.64645573547766e-07, + "loss": 0.4835, + "mean_token_accuracy": 0.9038451611995697, + "num_tokens": 163121097.0, + "step": 9001 + }, + { + "epoch": 0.8993456216594236, + "grad_norm": 0.4826415958756936, + "learning_rate": 2.6412644342782346e-07, + "loss": 0.4896, + "mean_token_accuracy": 0.9107941389083862, + "num_tokens": 163202593.0, + "step": 9002 + }, + { + "epoch": 0.8994455267495879, + "grad_norm": 0.42703980729288565, + "learning_rate": 2.6360780915847484e-07, + "loss": 0.4831, + "mean_token_accuracy": 0.907634824514389, + "num_tokens": 163284187.0, + "step": 9003 + }, + { + "epoch": 0.8995454318397522, + "grad_norm": 0.5038209830555014, + "learning_rate": 2.630896707940228e-07, + "loss": 0.4957, + "mean_token_accuracy": 0.9072529971599579, + "num_tokens": 163365590.0, + "step": 9004 + }, + { + "epoch": 0.8996453369299166, + "grad_norm": 0.4447489803384566, + "learning_rate": 2.625720283887151e-07, + "loss": 0.4915, + "mean_token_accuracy": 0.9090691804885864, + "num_tokens": 163447047.0, + "step": 9005 + }, + { + "epoch": 0.8997452420200809, + "grad_norm": 0.3917801272236353, + "learning_rate": 2.620548819967511e-07, + "loss": 0.4872, + "mean_token_accuracy": 0.9079409241676331, + "num_tokens": 163528539.0, + "step": 9006 + }, + { + "epoch": 0.8998451471102452, + "grad_norm": 0.3762445150181644, + "learning_rate": 2.615382316722753e-07, + "loss": 0.4809, + "mean_token_accuracy": 0.9120180904865265, + "num_tokens": 163610138.0, + "step": 9007 + }, + { + "epoch": 0.8999450522004097, + "grad_norm": 0.43435227458993475, + "learning_rate": 2.610220774693828e-07, + "loss": 0.4912, + "mean_token_accuracy": 0.9073100984096527, + "num_tokens": 163691599.0, + "step": 9008 + }, + { + "epoch": 0.900044957290574, + "grad_norm": 0.4593370918073325, + "learning_rate": 2.6050641944211417e-07, + "loss": 0.4898, + "mean_token_accuracy": 0.9050981402397156, + "num_tokens": 163773058.0, + "step": 9009 + }, + { + "epoch": 0.9001448623807383, + "grad_norm": 0.5451624418938789, + "learning_rate": 2.5999125764445967e-07, + "loss": 0.4801, + "mean_token_accuracy": 0.9085933864116669, + "num_tokens": 163854725.0, + "step": 9010 + }, + { + "epoch": 0.9002447674709027, + "grad_norm": 0.5053376076426749, + "learning_rate": 2.594765921303577e-07, + "loss": 0.4944, + "mean_token_accuracy": 0.9084261357784271, + "num_tokens": 163936182.0, + "step": 9011 + }, + { + "epoch": 0.900344672561067, + "grad_norm": 0.522591843802279, + "learning_rate": 2.589624229536941e-07, + "loss": 0.4855, + "mean_token_accuracy": 0.90744549036026, + "num_tokens": 164017744.0, + "step": 9012 + }, + { + "epoch": 0.9004445776512313, + "grad_norm": 0.3943744764393112, + "learning_rate": 2.5844875016830196e-07, + "loss": 0.4869, + "mean_token_accuracy": 0.9088090360164642, + "num_tokens": 164099254.0, + "step": 9013 + }, + { + "epoch": 0.9005444827413956, + "grad_norm": 0.41784354199809304, + "learning_rate": 2.579355738279654e-07, + "loss": 0.4864, + "mean_token_accuracy": 0.906558632850647, + "num_tokens": 164180792.0, + "step": 9014 + }, + { + "epoch": 0.90064438783156, + "grad_norm": 0.4551140225353378, + "learning_rate": 2.5742289398641153e-07, + "loss": 0.4873, + "mean_token_accuracy": 0.9080483615398407, + "num_tokens": 164262327.0, + "step": 9015 + }, + { + "epoch": 0.9007442929217243, + "grad_norm": 0.48199154139363215, + "learning_rate": 2.5691071069732175e-07, + "loss": 0.4901, + "mean_token_accuracy": 0.9071743190288544, + "num_tokens": 164343807.0, + "step": 9016 + }, + { + "epoch": 0.9008441980118888, + "grad_norm": 0.5298821830702423, + "learning_rate": 2.5639902401431883e-07, + "loss": 0.4807, + "mean_token_accuracy": 0.9093576967716217, + "num_tokens": 164425422.0, + "step": 9017 + }, + { + "epoch": 0.9009441031020531, + "grad_norm": 0.4674123190472548, + "learning_rate": 2.558878339909804e-07, + "loss": 0.4883, + "mean_token_accuracy": 0.9076699614524841, + "num_tokens": 164506924.0, + "step": 9018 + }, + { + "epoch": 0.9010440081922174, + "grad_norm": 0.4343770469859992, + "learning_rate": 2.553771406808253e-07, + "loss": 0.4857, + "mean_token_accuracy": 0.9091534316539764, + "num_tokens": 164588497.0, + "step": 9019 + }, + { + "epoch": 0.9011439132823817, + "grad_norm": 0.47856302332901296, + "learning_rate": 2.5486694413732525e-07, + "loss": 0.4923, + "mean_token_accuracy": 0.9100964963436127, + "num_tokens": 164669918.0, + "step": 9020 + }, + { + "epoch": 0.9012438183725461, + "grad_norm": 0.5027358378639037, + "learning_rate": 2.5435724441389806e-07, + "loss": 0.4851, + "mean_token_accuracy": 0.9083025455474854, + "num_tokens": 164751465.0, + "step": 9021 + }, + { + "epoch": 0.9013437234627104, + "grad_norm": 0.45098786662831825, + "learning_rate": 2.5384804156390996e-07, + "loss": 0.4846, + "mean_token_accuracy": 0.9065844416618347, + "num_tokens": 164833038.0, + "step": 9022 + }, + { + "epoch": 0.9014436285528747, + "grad_norm": 0.7654663178308174, + "learning_rate": 2.533393356406749e-07, + "loss": 0.4925, + "mean_token_accuracy": 0.9079404175281525, + "num_tokens": 164914530.0, + "step": 9023 + }, + { + "epoch": 0.9015435336430391, + "grad_norm": 0.5130361239735889, + "learning_rate": 2.5283112669745426e-07, + "loss": 0.4889, + "mean_token_accuracy": 0.905907928943634, + "num_tokens": 164996044.0, + "step": 9024 + }, + { + "epoch": 0.9016434387332034, + "grad_norm": 0.475231261074473, + "learning_rate": 2.5232341478745924e-07, + "loss": 0.492, + "mean_token_accuracy": 0.9068919718265533, + "num_tokens": 165077506.0, + "step": 9025 + }, + { + "epoch": 0.9017433438233678, + "grad_norm": 0.47199752957840335, + "learning_rate": 2.5181619996384744e-07, + "loss": 0.487, + "mean_token_accuracy": 0.9084951877593994, + "num_tokens": 165159032.0, + "step": 9026 + }, + { + "epoch": 0.9018432489135322, + "grad_norm": 0.9146092337927981, + "learning_rate": 2.5130948227972296e-07, + "loss": 0.4871, + "mean_token_accuracy": 0.907267302274704, + "num_tokens": 165240567.0, + "step": 9027 + }, + { + "epoch": 0.9019431540036965, + "grad_norm": 0.5026195688240748, + "learning_rate": 2.508032617881423e-07, + "loss": 0.4859, + "mean_token_accuracy": 0.9075287282466888, + "num_tokens": 165322108.0, + "step": 9028 + }, + { + "epoch": 0.9020430590938608, + "grad_norm": 0.6505291335093544, + "learning_rate": 2.5029753854210524e-07, + "loss": 0.4887, + "mean_token_accuracy": 0.908045083284378, + "num_tokens": 165403564.0, + "step": 9029 + }, + { + "epoch": 0.9021429641840252, + "grad_norm": 0.44269361401732277, + "learning_rate": 2.497923125945634e-07, + "loss": 0.4861, + "mean_token_accuracy": 0.908696860074997, + "num_tokens": 165485097.0, + "step": 9030 + }, + { + "epoch": 0.9022428692741895, + "grad_norm": 0.3939967257154172, + "learning_rate": 2.4928758399841213e-07, + "loss": 0.4869, + "mean_token_accuracy": 0.9082299470901489, + "num_tokens": 165566600.0, + "step": 9031 + }, + { + "epoch": 0.9023427743643538, + "grad_norm": 0.44363309587012545, + "learning_rate": 2.487833528064987e-07, + "loss": 0.4928, + "mean_token_accuracy": 0.9086495637893677, + "num_tokens": 165648065.0, + "step": 9032 + }, + { + "epoch": 0.9024426794545182, + "grad_norm": 0.47012602711505724, + "learning_rate": 2.4827961907161636e-07, + "loss": 0.4881, + "mean_token_accuracy": 0.9105177819728851, + "num_tokens": 165729564.0, + "step": 9033 + }, + { + "epoch": 0.9025425845446825, + "grad_norm": 0.44650820221432996, + "learning_rate": 2.4777638284650574e-07, + "loss": 0.4922, + "mean_token_accuracy": 0.9087754487991333, + "num_tokens": 165811020.0, + "step": 9034 + }, + { + "epoch": 0.902642489634847, + "grad_norm": 0.4821764770633757, + "learning_rate": 2.472736441838569e-07, + "loss": 0.4857, + "mean_token_accuracy": 0.9079589545726776, + "num_tokens": 165892568.0, + "step": 9035 + }, + { + "epoch": 0.9027423947250113, + "grad_norm": 0.4630544093157891, + "learning_rate": 2.4677140313630665e-07, + "loss": 0.4862, + "mean_token_accuracy": 0.9087773263454437, + "num_tokens": 165974087.0, + "step": 9036 + }, + { + "epoch": 0.9028422998151756, + "grad_norm": 0.4512754185741349, + "learning_rate": 2.462696597564407e-07, + "loss": 0.4826, + "mean_token_accuracy": 0.9117187261581421, + "num_tokens": 166055695.0, + "step": 9037 + }, + { + "epoch": 0.9029422049053399, + "grad_norm": 0.4138233865492857, + "learning_rate": 2.4576841409679196e-07, + "loss": 0.4904, + "mean_token_accuracy": 0.9078445434570312, + "num_tokens": 166137180.0, + "step": 9038 + }, + { + "epoch": 0.9030421099955043, + "grad_norm": 0.41227226944520595, + "learning_rate": 2.452676662098402e-07, + "loss": 0.4835, + "mean_token_accuracy": 0.9108091592788696, + "num_tokens": 166218729.0, + "step": 9039 + }, + { + "epoch": 0.9031420150856686, + "grad_norm": 0.5938897311174207, + "learning_rate": 2.4476741614801624e-07, + "loss": 0.4896, + "mean_token_accuracy": 0.9081340134143829, + "num_tokens": 166300202.0, + "step": 9040 + }, + { + "epoch": 0.9032419201758329, + "grad_norm": 0.504362711641917, + "learning_rate": 2.4426766396369484e-07, + "loss": 0.4926, + "mean_token_accuracy": 0.9063086211681366, + "num_tokens": 166381689.0, + "step": 9041 + }, + { + "epoch": 0.9033418252659973, + "grad_norm": 0.40361820521773867, + "learning_rate": 2.4376840970920133e-07, + "loss": 0.4855, + "mean_token_accuracy": 0.9062611162662506, + "num_tokens": 166463190.0, + "step": 9042 + }, + { + "epoch": 0.9034417303561616, + "grad_norm": 0.5473631704154601, + "learning_rate": 2.432696534368084e-07, + "loss": 0.4958, + "mean_token_accuracy": 0.9072644412517548, + "num_tokens": 166544615.0, + "step": 9043 + }, + { + "epoch": 0.903541635446326, + "grad_norm": 0.44254525958292545, + "learning_rate": 2.427713951987354e-07, + "loss": 0.4934, + "mean_token_accuracy": 0.9069789946079254, + "num_tokens": 166626070.0, + "step": 9044 + }, + { + "epoch": 0.9036415405364904, + "grad_norm": 0.4840238717976633, + "learning_rate": 2.4227363504715164e-07, + "loss": 0.4921, + "mean_token_accuracy": 0.9096340835094452, + "num_tokens": 166707520.0, + "step": 9045 + }, + { + "epoch": 0.9037414456266547, + "grad_norm": 0.4983280222160579, + "learning_rate": 2.417763730341721e-07, + "loss": 0.4861, + "mean_token_accuracy": 0.9057429730892181, + "num_tokens": 166789062.0, + "step": 9046 + }, + { + "epoch": 0.903841350716819, + "grad_norm": 0.4507963834607827, + "learning_rate": 2.412796092118613e-07, + "loss": 0.4864, + "mean_token_accuracy": 0.9088799059391022, + "num_tokens": 166870622.0, + "step": 9047 + }, + { + "epoch": 0.9039412558069834, + "grad_norm": 0.38711856204346073, + "learning_rate": 2.4078334363223155e-07, + "loss": 0.4891, + "mean_token_accuracy": 0.9079558551311493, + "num_tokens": 166952105.0, + "step": 9048 + }, + { + "epoch": 0.9040411608971477, + "grad_norm": 0.428488560895366, + "learning_rate": 2.4028757634723955e-07, + "loss": 0.4859, + "mean_token_accuracy": 0.9095941185951233, + "num_tokens": 167033650.0, + "step": 9049 + }, + { + "epoch": 0.904141065987312, + "grad_norm": 0.39775669528469826, + "learning_rate": 2.39792307408796e-07, + "loss": 0.4836, + "mean_token_accuracy": 0.9062966108322144, + "num_tokens": 167115201.0, + "step": 9050 + }, + { + "epoch": 0.9042409710774764, + "grad_norm": 0.4481894066529299, + "learning_rate": 2.3929753686875335e-07, + "loss": 0.4872, + "mean_token_accuracy": 0.910254031419754, + "num_tokens": 167196695.0, + "step": 9051 + }, + { + "epoch": 0.9043408761676407, + "grad_norm": 0.6723514488565397, + "learning_rate": 2.388032647789168e-07, + "loss": 0.4939, + "mean_token_accuracy": 0.9074620306491852, + "num_tokens": 167278129.0, + "step": 9052 + }, + { + "epoch": 0.9044407812578051, + "grad_norm": 0.4757731749585499, + "learning_rate": 2.3830949119103608e-07, + "loss": 0.4868, + "mean_token_accuracy": 0.9046443402767181, + "num_tokens": 167359708.0, + "step": 9053 + }, + { + "epoch": 0.9045406863479695, + "grad_norm": 0.4312497069726324, + "learning_rate": 2.3781621615680928e-07, + "loss": 0.4866, + "mean_token_accuracy": 0.9090568423271179, + "num_tokens": 167441255.0, + "step": 9054 + }, + { + "epoch": 0.9046405914381338, + "grad_norm": 0.4397588593023434, + "learning_rate": 2.3732343972788285e-07, + "loss": 0.489, + "mean_token_accuracy": 0.9073401689529419, + "num_tokens": 167522771.0, + "step": 9055 + }, + { + "epoch": 0.9047404965282981, + "grad_norm": 0.48090366404034063, + "learning_rate": 2.3683116195585165e-07, + "loss": 0.4828, + "mean_token_accuracy": 0.9105727076530457, + "num_tokens": 167604341.0, + "step": 9056 + }, + { + "epoch": 0.9048404016184625, + "grad_norm": 0.5297984916767402, + "learning_rate": 2.3633938289225778e-07, + "loss": 0.4917, + "mean_token_accuracy": 0.9098677635192871, + "num_tokens": 167685831.0, + "step": 9057 + }, + { + "epoch": 0.9049403067086268, + "grad_norm": 0.4400656391626884, + "learning_rate": 2.3584810258859002e-07, + "loss": 0.4871, + "mean_token_accuracy": 0.9085605442523956, + "num_tokens": 167767362.0, + "step": 9058 + }, + { + "epoch": 0.9050402117987911, + "grad_norm": 0.46831668978163565, + "learning_rate": 2.3535732109628672e-07, + "loss": 0.4906, + "mean_token_accuracy": 0.9083029925823212, + "num_tokens": 167848852.0, + "step": 9059 + }, + { + "epoch": 0.9051401168889555, + "grad_norm": 0.679905806169319, + "learning_rate": 2.3486703846673343e-07, + "loss": 0.4894, + "mean_token_accuracy": 0.9068062901496887, + "num_tokens": 167930349.0, + "step": 9060 + }, + { + "epoch": 0.9052400219791198, + "grad_norm": 0.4608422179990698, + "learning_rate": 2.3437725475126126e-07, + "loss": 0.4886, + "mean_token_accuracy": 0.9084310531616211, + "num_tokens": 168011871.0, + "step": 9061 + }, + { + "epoch": 0.9053399270692842, + "grad_norm": 0.489709163279492, + "learning_rate": 2.3388797000115427e-07, + "loss": 0.489, + "mean_token_accuracy": 0.9092162549495697, + "num_tokens": 168093340.0, + "step": 9062 + }, + { + "epoch": 0.9054398321594486, + "grad_norm": 0.4847201556065621, + "learning_rate": 2.3339918426763808e-07, + "loss": 0.4955, + "mean_token_accuracy": 0.9051731526851654, + "num_tokens": 168174795.0, + "step": 9063 + }, + { + "epoch": 0.9055397372496129, + "grad_norm": 0.5060534054340503, + "learning_rate": 2.3291089760189066e-07, + "loss": 0.4932, + "mean_token_accuracy": 0.9083052575588226, + "num_tokens": 168256253.0, + "step": 9064 + }, + { + "epoch": 0.9056396423397772, + "grad_norm": 0.458810566981165, + "learning_rate": 2.3242311005503503e-07, + "loss": 0.4859, + "mean_token_accuracy": 0.9102255702018738, + "num_tokens": 168337757.0, + "step": 9065 + }, + { + "epoch": 0.9057395474299416, + "grad_norm": 0.46444091865092557, + "learning_rate": 2.3193582167814422e-07, + "loss": 0.4893, + "mean_token_accuracy": 0.9063871502876282, + "num_tokens": 168419267.0, + "step": 9066 + }, + { + "epoch": 0.9058394525201059, + "grad_norm": 0.39999134837243927, + "learning_rate": 2.3144903252223682e-07, + "loss": 0.4806, + "mean_token_accuracy": 0.9095664918422699, + "num_tokens": 168500885.0, + "step": 9067 + }, + { + "epoch": 0.9059393576102702, + "grad_norm": 0.48290795349286614, + "learning_rate": 2.30962742638281e-07, + "loss": 0.4813, + "mean_token_accuracy": 0.9083252549171448, + "num_tokens": 168582474.0, + "step": 9068 + }, + { + "epoch": 0.9060392627004346, + "grad_norm": 0.41965532444027975, + "learning_rate": 2.30476952077191e-07, + "loss": 0.4812, + "mean_token_accuracy": 0.9126620888710022, + "num_tokens": 168664045.0, + "step": 9069 + }, + { + "epoch": 0.9061391677905989, + "grad_norm": 0.7247202214206272, + "learning_rate": 2.299916608898306e-07, + "loss": 0.4888, + "mean_token_accuracy": 0.9105346500873566, + "num_tokens": 168745603.0, + "step": 9070 + }, + { + "epoch": 0.9062390728807633, + "grad_norm": 0.4828740090384797, + "learning_rate": 2.2950686912700859e-07, + "loss": 0.4838, + "mean_token_accuracy": 0.9111142158508301, + "num_tokens": 168827166.0, + "step": 9071 + }, + { + "epoch": 0.9063389779709277, + "grad_norm": 0.42072382229227806, + "learning_rate": 2.2902257683948493e-07, + "loss": 0.487, + "mean_token_accuracy": 0.9070605933666229, + "num_tokens": 168908705.0, + "step": 9072 + }, + { + "epoch": 0.906438883061092, + "grad_norm": 0.5080048842617604, + "learning_rate": 2.2853878407796403e-07, + "loss": 0.4929, + "mean_token_accuracy": 0.9090977311134338, + "num_tokens": 168990130.0, + "step": 9073 + }, + { + "epoch": 0.9065387881512563, + "grad_norm": 0.593655964839865, + "learning_rate": 2.2805549089310097e-07, + "loss": 0.4829, + "mean_token_accuracy": 0.9078828692436218, + "num_tokens": 169071702.0, + "step": 9074 + }, + { + "epoch": 0.9066386932414207, + "grad_norm": 0.48680731149744816, + "learning_rate": 2.2757269733549525e-07, + "loss": 0.485, + "mean_token_accuracy": 0.9077847898006439, + "num_tokens": 169153225.0, + "step": 9075 + }, + { + "epoch": 0.906738598331585, + "grad_norm": 0.4829142721264415, + "learning_rate": 2.2709040345569699e-07, + "loss": 0.4891, + "mean_token_accuracy": 0.9093983769416809, + "num_tokens": 169234698.0, + "step": 9076 + }, + { + "epoch": 0.9068385034217493, + "grad_norm": 0.41784440109678056, + "learning_rate": 2.2660860930420249e-07, + "loss": 0.4819, + "mean_token_accuracy": 0.9114145934581757, + "num_tokens": 169316278.0, + "step": 9077 + }, + { + "epoch": 0.9069384085119137, + "grad_norm": 0.426888323612275, + "learning_rate": 2.2612731493145635e-07, + "loss": 0.4849, + "mean_token_accuracy": 0.9096763134002686, + "num_tokens": 169397838.0, + "step": 9078 + }, + { + "epoch": 0.907038313602078, + "grad_norm": 0.5175941158254177, + "learning_rate": 2.256465203878505e-07, + "loss": 0.4862, + "mean_token_accuracy": 0.9078860878944397, + "num_tokens": 169479359.0, + "step": 9079 + }, + { + "epoch": 0.9071382186922423, + "grad_norm": 0.46239968465507897, + "learning_rate": 2.2516622572372416e-07, + "loss": 0.4806, + "mean_token_accuracy": 0.9113606214523315, + "num_tokens": 169560935.0, + "step": 9080 + }, + { + "epoch": 0.9072381237824068, + "grad_norm": 0.4376089677711934, + "learning_rate": 2.2468643098936482e-07, + "loss": 0.4869, + "mean_token_accuracy": 0.9056975841522217, + "num_tokens": 169642476.0, + "step": 9081 + }, + { + "epoch": 0.9073380288725711, + "grad_norm": 0.429848387033915, + "learning_rate": 2.2420713623500845e-07, + "loss": 0.49, + "mean_token_accuracy": 0.9098618626594543, + "num_tokens": 169723964.0, + "step": 9082 + }, + { + "epoch": 0.9074379339627354, + "grad_norm": 0.46944378278762894, + "learning_rate": 2.2372834151083546e-07, + "loss": 0.4791, + "mean_token_accuracy": 0.9086093008518219, + "num_tokens": 169805612.0, + "step": 9083 + }, + { + "epoch": 0.9075378390528998, + "grad_norm": 0.5274471004284544, + "learning_rate": 2.2325004686697904e-07, + "loss": 0.4846, + "mean_token_accuracy": 0.907584547996521, + "num_tokens": 169887151.0, + "step": 9084 + }, + { + "epoch": 0.9076377441430641, + "grad_norm": 0.5099439487666011, + "learning_rate": 2.2277225235351475e-07, + "loss": 0.4945, + "mean_token_accuracy": 0.905110627412796, + "num_tokens": 169968612.0, + "step": 9085 + }, + { + "epoch": 0.9077376492332284, + "grad_norm": 0.4615166668259301, + "learning_rate": 2.2229495802046919e-07, + "loss": 0.4891, + "mean_token_accuracy": 0.9088765680789948, + "num_tokens": 170050128.0, + "step": 9086 + }, + { + "epoch": 0.9078375543233927, + "grad_norm": 0.706120259878603, + "learning_rate": 2.218181639178152e-07, + "loss": 0.487, + "mean_token_accuracy": 0.9089201092720032, + "num_tokens": 170131639.0, + "step": 9087 + }, + { + "epoch": 0.9079374594135571, + "grad_norm": 0.43341566575982154, + "learning_rate": 2.2134187009547337e-07, + "loss": 0.4889, + "mean_token_accuracy": 0.905479371547699, + "num_tokens": 170213142.0, + "step": 9088 + }, + { + "epoch": 0.9080373645037214, + "grad_norm": 0.48858818285527994, + "learning_rate": 2.2086607660331271e-07, + "loss": 0.4904, + "mean_token_accuracy": 0.9095132946968079, + "num_tokens": 170294589.0, + "step": 9089 + }, + { + "epoch": 0.9081372695938859, + "grad_norm": 0.4765793608404039, + "learning_rate": 2.2039078349114894e-07, + "loss": 0.4914, + "mean_token_accuracy": 0.9068298041820526, + "num_tokens": 170376051.0, + "step": 9090 + }, + { + "epoch": 0.9082371746840502, + "grad_norm": 0.48148584105954056, + "learning_rate": 2.199159908087456e-07, + "loss": 0.4889, + "mean_token_accuracy": 0.9079758822917938, + "num_tokens": 170457567.0, + "step": 9091 + }, + { + "epoch": 0.9083370797742145, + "grad_norm": 0.41957091900951476, + "learning_rate": 2.194416986058151e-07, + "loss": 0.4881, + "mean_token_accuracy": 0.9085675776004791, + "num_tokens": 170539068.0, + "step": 9092 + }, + { + "epoch": 0.9084369848643788, + "grad_norm": 0.42672810813404727, + "learning_rate": 2.1896790693201387e-07, + "loss": 0.4917, + "mean_token_accuracy": 0.9068649709224701, + "num_tokens": 170620534.0, + "step": 9093 + }, + { + "epoch": 0.9085368899545432, + "grad_norm": 0.45335149279064363, + "learning_rate": 2.184946158369511e-07, + "loss": 0.4835, + "mean_token_accuracy": 0.907209575176239, + "num_tokens": 170702117.0, + "step": 9094 + }, + { + "epoch": 0.9086367950447075, + "grad_norm": 0.4532973232341754, + "learning_rate": 2.1802182537017825e-07, + "loss": 0.4824, + "mean_token_accuracy": 0.9071137607097626, + "num_tokens": 170783721.0, + "step": 9095 + }, + { + "epoch": 0.9087367001348718, + "grad_norm": 0.44309921522184986, + "learning_rate": 2.1754953558119962e-07, + "loss": 0.4915, + "mean_token_accuracy": 0.9074755609035492, + "num_tokens": 170865172.0, + "step": 9096 + }, + { + "epoch": 0.9088366052250362, + "grad_norm": 0.42291889163067664, + "learning_rate": 2.1707774651946234e-07, + "loss": 0.4864, + "mean_token_accuracy": 0.9102655351161957, + "num_tokens": 170946717.0, + "step": 9097 + }, + { + "epoch": 0.9089365103152005, + "grad_norm": 0.4604710233700829, + "learning_rate": 2.166064582343641e-07, + "loss": 0.4826, + "mean_token_accuracy": 0.9083793759346008, + "num_tokens": 171028309.0, + "step": 9098 + }, + { + "epoch": 0.909036415405365, + "grad_norm": 0.5053557053964827, + "learning_rate": 2.1613567077524878e-07, + "loss": 0.4853, + "mean_token_accuracy": 0.9085015952587128, + "num_tokens": 171109860.0, + "step": 9099 + }, + { + "epoch": 0.9091363204955293, + "grad_norm": 0.5468472434093549, + "learning_rate": 2.156653841914086e-07, + "loss": 0.4839, + "mean_token_accuracy": 0.9101752042770386, + "num_tokens": 171191435.0, + "step": 9100 + }, + { + "epoch": 0.9092362255856936, + "grad_norm": 0.4048570135592257, + "learning_rate": 2.1519559853208306e-07, + "loss": 0.4852, + "mean_token_accuracy": 0.90805783867836, + "num_tokens": 171272945.0, + "step": 9101 + }, + { + "epoch": 0.909336130675858, + "grad_norm": 0.4328437262089885, + "learning_rate": 2.1472631384645893e-07, + "loss": 0.4888, + "mean_token_accuracy": 0.9096358418464661, + "num_tokens": 171354461.0, + "step": 9102 + }, + { + "epoch": 0.9094360357660223, + "grad_norm": 0.4603383885432434, + "learning_rate": 2.1425753018367134e-07, + "loss": 0.4865, + "mean_token_accuracy": 0.9066296219825745, + "num_tokens": 171435991.0, + "step": 9103 + }, + { + "epoch": 0.9095359408561866, + "grad_norm": 0.5196896494852739, + "learning_rate": 2.1378924759280218e-07, + "loss": 0.4915, + "mean_token_accuracy": 0.9077319800853729, + "num_tokens": 171517475.0, + "step": 9104 + }, + { + "epoch": 0.9096358459463509, + "grad_norm": 0.4161164226608505, + "learning_rate": 2.1332146612287942e-07, + "loss": 0.4905, + "mean_token_accuracy": 0.9087856113910675, + "num_tokens": 171598962.0, + "step": 9105 + }, + { + "epoch": 0.9097357510365153, + "grad_norm": 0.46341181045395463, + "learning_rate": 2.1285418582288331e-07, + "loss": 0.4868, + "mean_token_accuracy": 0.9109818041324615, + "num_tokens": 171680515.0, + "step": 9106 + }, + { + "epoch": 0.9098356561266796, + "grad_norm": 0.48566566441842396, + "learning_rate": 2.1238740674173584e-07, + "loss": 0.486, + "mean_token_accuracy": 0.9069222509860992, + "num_tokens": 171762061.0, + "step": 9107 + }, + { + "epoch": 0.909935561216844, + "grad_norm": 0.4495330899893933, + "learning_rate": 2.119211289283113e-07, + "loss": 0.4862, + "mean_token_accuracy": 0.9069503545761108, + "num_tokens": 171843619.0, + "step": 9108 + }, + { + "epoch": 0.9100354663070084, + "grad_norm": 0.6470147843463648, + "learning_rate": 2.1145535243142833e-07, + "loss": 0.4934, + "mean_token_accuracy": 0.9064382314682007, + "num_tokens": 171925074.0, + "step": 9109 + }, + { + "epoch": 0.9101353713971727, + "grad_norm": 0.518356427980283, + "learning_rate": 2.109900772998541e-07, + "loss": 0.4882, + "mean_token_accuracy": 0.9081064164638519, + "num_tokens": 172006590.0, + "step": 9110 + }, + { + "epoch": 0.910235276487337, + "grad_norm": 0.4059612907923295, + "learning_rate": 2.105253035823035e-07, + "loss": 0.4847, + "mean_token_accuracy": 0.9093934297561646, + "num_tokens": 172088160.0, + "step": 9111 + }, + { + "epoch": 0.9103351815775014, + "grad_norm": 0.6349906373762112, + "learning_rate": 2.1006103132743871e-07, + "loss": 0.4876, + "mean_token_accuracy": 0.9086527228355408, + "num_tokens": 172169659.0, + "step": 9112 + }, + { + "epoch": 0.9104350866676657, + "grad_norm": 0.43475492913313435, + "learning_rate": 2.095972605838703e-07, + "loss": 0.4851, + "mean_token_accuracy": 0.9103167951107025, + "num_tokens": 172251204.0, + "step": 9113 + }, + { + "epoch": 0.91053499175783, + "grad_norm": 0.4484834805402464, + "learning_rate": 2.09133991400155e-07, + "loss": 0.4864, + "mean_token_accuracy": 0.9087415635585785, + "num_tokens": 172332751.0, + "step": 9114 + }, + { + "epoch": 0.9106348968479944, + "grad_norm": 0.534764979601424, + "learning_rate": 2.086712238247962e-07, + "loss": 0.4942, + "mean_token_accuracy": 0.905162513256073, + "num_tokens": 172414210.0, + "step": 9115 + }, + { + "epoch": 0.9107348019381587, + "grad_norm": 0.4253169817859164, + "learning_rate": 2.0820895790624906e-07, + "loss": 0.4886, + "mean_token_accuracy": 0.9076356887817383, + "num_tokens": 172495737.0, + "step": 9116 + }, + { + "epoch": 0.9108347070283231, + "grad_norm": 0.5125959641552242, + "learning_rate": 2.0774719369290985e-07, + "loss": 0.4837, + "mean_token_accuracy": 0.9081911146640778, + "num_tokens": 172577325.0, + "step": 9117 + }, + { + "epoch": 0.9109346121184875, + "grad_norm": 0.5630779598110263, + "learning_rate": 2.0728593123312934e-07, + "loss": 0.4881, + "mean_token_accuracy": 0.9069545269012451, + "num_tokens": 172658841.0, + "step": 9118 + }, + { + "epoch": 0.9110345172086518, + "grad_norm": 0.5026718153958388, + "learning_rate": 2.0682517057519946e-07, + "loss": 0.489, + "mean_token_accuracy": 0.9078053832054138, + "num_tokens": 172740334.0, + "step": 9119 + }, + { + "epoch": 0.9111344222988161, + "grad_norm": 0.4153882204749758, + "learning_rate": 2.0636491176736273e-07, + "loss": 0.4844, + "mean_token_accuracy": 0.9088005423545837, + "num_tokens": 172821911.0, + "step": 9120 + }, + { + "epoch": 0.9112343273889805, + "grad_norm": 0.4879157994380275, + "learning_rate": 2.0590515485780948e-07, + "loss": 0.4859, + "mean_token_accuracy": 0.9082272946834564, + "num_tokens": 172903448.0, + "step": 9121 + }, + { + "epoch": 0.9113342324791448, + "grad_norm": 0.5776740285107798, + "learning_rate": 2.0544589989467622e-07, + "loss": 0.4893, + "mean_token_accuracy": 0.9060452878475189, + "num_tokens": 172984954.0, + "step": 9122 + }, + { + "epoch": 0.9114341375693091, + "grad_norm": 0.5854688716296002, + "learning_rate": 2.0498714692604726e-07, + "loss": 0.4816, + "mean_token_accuracy": 0.9101127684116364, + "num_tokens": 173066542.0, + "step": 9123 + }, + { + "epoch": 0.9115340426594735, + "grad_norm": 0.43837407026478836, + "learning_rate": 2.0452889599995528e-07, + "loss": 0.4929, + "mean_token_accuracy": 0.9061529040336609, + "num_tokens": 173147981.0, + "step": 9124 + }, + { + "epoch": 0.9116339477496378, + "grad_norm": 0.4314610064698893, + "learning_rate": 2.0407114716437858e-07, + "loss": 0.4783, + "mean_token_accuracy": 0.907769650220871, + "num_tokens": 173229644.0, + "step": 9125 + }, + { + "epoch": 0.9117338528398022, + "grad_norm": 0.4975968140178015, + "learning_rate": 2.0361390046724494e-07, + "loss": 0.4842, + "mean_token_accuracy": 0.9076572954654694, + "num_tokens": 173311207.0, + "step": 9126 + }, + { + "epoch": 0.9118337579299666, + "grad_norm": 0.601977244105777, + "learning_rate": 2.0315715595642715e-07, + "loss": 0.4949, + "mean_token_accuracy": 0.9071713089942932, + "num_tokens": 173392622.0, + "step": 9127 + }, + { + "epoch": 0.9119336630201309, + "grad_norm": 0.4337646557938821, + "learning_rate": 2.0270091367974864e-07, + "loss": 0.4818, + "mean_token_accuracy": 0.9100657105445862, + "num_tokens": 173474194.0, + "step": 9128 + }, + { + "epoch": 0.9120335681102952, + "grad_norm": 0.8778750390403851, + "learning_rate": 2.022451736849762e-07, + "loss": 0.4862, + "mean_token_accuracy": 0.909359335899353, + "num_tokens": 173555740.0, + "step": 9129 + }, + { + "epoch": 0.9121334732004596, + "grad_norm": 0.48109578129793723, + "learning_rate": 2.0178993601982887e-07, + "loss": 0.49, + "mean_token_accuracy": 0.9090196490287781, + "num_tokens": 173637255.0, + "step": 9130 + }, + { + "epoch": 0.9122333782906239, + "grad_norm": 0.4746982237157486, + "learning_rate": 2.01335200731968e-07, + "loss": 0.4921, + "mean_token_accuracy": 0.9086045026779175, + "num_tokens": 173718714.0, + "step": 9131 + }, + { + "epoch": 0.9123332833807882, + "grad_norm": 0.4502015098329941, + "learning_rate": 2.0088096786900657e-07, + "loss": 0.482, + "mean_token_accuracy": 0.9073570072650909, + "num_tokens": 173800298.0, + "step": 9132 + }, + { + "epoch": 0.9124331884709526, + "grad_norm": 0.5543239230680603, + "learning_rate": 2.0042723747850213e-07, + "loss": 0.4903, + "mean_token_accuracy": 0.9116096794605255, + "num_tokens": 173881774.0, + "step": 9133 + }, + { + "epoch": 0.9125330935611169, + "grad_norm": 0.4524575417998671, + "learning_rate": 1.999740096079611e-07, + "loss": 0.489, + "mean_token_accuracy": 0.9076583385467529, + "num_tokens": 173963272.0, + "step": 9134 + }, + { + "epoch": 0.9126329986512813, + "grad_norm": 0.4782280186418097, + "learning_rate": 1.9952128430483718e-07, + "loss": 0.4866, + "mean_token_accuracy": 0.9092390239238739, + "num_tokens": 174044768.0, + "step": 9135 + }, + { + "epoch": 0.9127329037414457, + "grad_norm": 0.43688102384608096, + "learning_rate": 1.9906906161653083e-07, + "loss": 0.4936, + "mean_token_accuracy": 0.9063006043434143, + "num_tokens": 174126180.0, + "step": 9136 + }, + { + "epoch": 0.91283280883161, + "grad_norm": 0.41253028192575897, + "learning_rate": 1.986173415903897e-07, + "loss": 0.484, + "mean_token_accuracy": 0.9108928442001343, + "num_tokens": 174207738.0, + "step": 9137 + }, + { + "epoch": 0.9129327139217743, + "grad_norm": 0.4713238971101682, + "learning_rate": 1.9816612427371041e-07, + "loss": 0.4908, + "mean_token_accuracy": 0.9076566100120544, + "num_tokens": 174289237.0, + "step": 9138 + }, + { + "epoch": 0.9130326190119387, + "grad_norm": 0.4688418252085951, + "learning_rate": 1.977154097137346e-07, + "loss": 0.4847, + "mean_token_accuracy": 0.9076076149940491, + "num_tokens": 174370787.0, + "step": 9139 + }, + { + "epoch": 0.913132524102103, + "grad_norm": 0.44165945324266875, + "learning_rate": 1.9726519795765454e-07, + "loss": 0.4887, + "mean_token_accuracy": 0.9079676866531372, + "num_tokens": 174452289.0, + "step": 9140 + }, + { + "epoch": 0.9132324291922673, + "grad_norm": 0.4959816936260715, + "learning_rate": 1.9681548905260528e-07, + "loss": 0.4868, + "mean_token_accuracy": 0.9082237184047699, + "num_tokens": 174533805.0, + "step": 9141 + }, + { + "epoch": 0.9133323342824317, + "grad_norm": 0.514869603208946, + "learning_rate": 1.9636628304567363e-07, + "loss": 0.4855, + "mean_token_accuracy": 0.9069916009902954, + "num_tokens": 174615324.0, + "step": 9142 + }, + { + "epoch": 0.913432239372596, + "grad_norm": 0.4863977762467076, + "learning_rate": 1.9591757998389082e-07, + "loss": 0.4908, + "mean_token_accuracy": 0.9088314175605774, + "num_tokens": 174696813.0, + "step": 9143 + }, + { + "epoch": 0.9135321444627604, + "grad_norm": 0.49730270951113553, + "learning_rate": 1.9546937991423764e-07, + "loss": 0.4817, + "mean_token_accuracy": 0.9109646081924438, + "num_tokens": 174778421.0, + "step": 9144 + }, + { + "epoch": 0.9136320495529248, + "grad_norm": 0.41444783510922556, + "learning_rate": 1.9502168288363988e-07, + "loss": 0.4888, + "mean_token_accuracy": 0.9064990282058716, + "num_tokens": 174859940.0, + "step": 9145 + }, + { + "epoch": 0.9137319546430891, + "grad_norm": 0.4323862033699564, + "learning_rate": 1.9457448893897224e-07, + "loss": 0.485, + "mean_token_accuracy": 0.9083109200000763, + "num_tokens": 174941483.0, + "step": 9146 + }, + { + "epoch": 0.9138318597332534, + "grad_norm": 0.41431496957466746, + "learning_rate": 1.9412779812705673e-07, + "loss": 0.4864, + "mean_token_accuracy": 0.9093120098114014, + "num_tokens": 175023036.0, + "step": 9147 + }, + { + "epoch": 0.9139317648234178, + "grad_norm": 0.5125146214896784, + "learning_rate": 1.936816104946626e-07, + "loss": 0.4904, + "mean_token_accuracy": 0.9102168381214142, + "num_tokens": 175104524.0, + "step": 9148 + }, + { + "epoch": 0.9140316699135821, + "grad_norm": 0.7136811086861418, + "learning_rate": 1.9323592608850472e-07, + "loss": 0.485, + "mean_token_accuracy": 0.9074530303478241, + "num_tokens": 175186096.0, + "step": 9149 + }, + { + "epoch": 0.9141315750037464, + "grad_norm": 0.4263539396398254, + "learning_rate": 1.9279074495524852e-07, + "loss": 0.4943, + "mean_token_accuracy": 0.9074597656726837, + "num_tokens": 175267513.0, + "step": 9150 + }, + { + "epoch": 0.9142314800939108, + "grad_norm": 0.4926592853941273, + "learning_rate": 1.9234606714150283e-07, + "loss": 0.4873, + "mean_token_accuracy": 0.9089039862155914, + "num_tokens": 175348998.0, + "step": 9151 + }, + { + "epoch": 0.9143313851840751, + "grad_norm": 0.5158477690995978, + "learning_rate": 1.919018926938282e-07, + "loss": 0.4834, + "mean_token_accuracy": 0.9070118963718414, + "num_tokens": 175430658.0, + "step": 9152 + }, + { + "epoch": 0.9144312902742395, + "grad_norm": 0.4632636129770277, + "learning_rate": 1.9145822165872852e-07, + "loss": 0.4874, + "mean_token_accuracy": 0.9062116146087646, + "num_tokens": 175512193.0, + "step": 9153 + }, + { + "epoch": 0.9145311953644039, + "grad_norm": 0.4764912331333103, + "learning_rate": 1.9101505408265663e-07, + "loss": 0.4929, + "mean_token_accuracy": 0.9064796566963196, + "num_tokens": 175593659.0, + "step": 9154 + }, + { + "epoch": 0.9146311004545682, + "grad_norm": 0.5090554170243544, + "learning_rate": 1.905723900120132e-07, + "loss": 0.487, + "mean_token_accuracy": 0.9077791571617126, + "num_tokens": 175675221.0, + "step": 9155 + }, + { + "epoch": 0.9147310055447325, + "grad_norm": 0.5226594743566855, + "learning_rate": 1.9013022949314563e-07, + "loss": 0.4913, + "mean_token_accuracy": 0.9096358418464661, + "num_tokens": 175756714.0, + "step": 9156 + }, + { + "epoch": 0.9148309106348969, + "grad_norm": 0.44270103114008036, + "learning_rate": 1.896885725723474e-07, + "loss": 0.4836, + "mean_token_accuracy": 0.9073331654071808, + "num_tokens": 175838286.0, + "step": 9157 + }, + { + "epoch": 0.9149308157250612, + "grad_norm": 0.4262771181413148, + "learning_rate": 1.892474192958621e-07, + "loss": 0.4878, + "mean_token_accuracy": 0.9048900306224823, + "num_tokens": 175919822.0, + "step": 9158 + }, + { + "epoch": 0.9150307208152255, + "grad_norm": 0.4453003895321088, + "learning_rate": 1.8880676970987776e-07, + "loss": 0.4928, + "mean_token_accuracy": 0.906513899564743, + "num_tokens": 176001314.0, + "step": 9159 + }, + { + "epoch": 0.9151306259053898, + "grad_norm": 0.563140923908084, + "learning_rate": 1.8836662386053194e-07, + "loss": 0.4869, + "mean_token_accuracy": 0.9093511700630188, + "num_tokens": 176082827.0, + "step": 9160 + }, + { + "epoch": 0.9152305309955542, + "grad_norm": 0.546629305651399, + "learning_rate": 1.879269817939061e-07, + "loss": 0.495, + "mean_token_accuracy": 0.9037237763404846, + "num_tokens": 176164293.0, + "step": 9161 + }, + { + "epoch": 0.9153304360857185, + "grad_norm": 0.4735866223986199, + "learning_rate": 1.87487843556034e-07, + "loss": 0.4909, + "mean_token_accuracy": 0.9088438749313354, + "num_tokens": 176245776.0, + "step": 9162 + }, + { + "epoch": 0.915430341175883, + "grad_norm": 0.4239946762785816, + "learning_rate": 1.8704920919289215e-07, + "loss": 0.4835, + "mean_token_accuracy": 0.9085643291473389, + "num_tokens": 176327353.0, + "step": 9163 + }, + { + "epoch": 0.9155302462660473, + "grad_norm": 0.5883559669248463, + "learning_rate": 1.8661107875040607e-07, + "loss": 0.489, + "mean_token_accuracy": 0.9105300307273865, + "num_tokens": 176408868.0, + "step": 9164 + }, + { + "epoch": 0.9156301513562116, + "grad_norm": 0.3977499163462531, + "learning_rate": 1.8617345227444906e-07, + "loss": 0.494, + "mean_token_accuracy": 0.9090353548526764, + "num_tokens": 176490310.0, + "step": 9165 + }, + { + "epoch": 0.915730056446376, + "grad_norm": 0.4294755636587763, + "learning_rate": 1.8573632981084054e-07, + "loss": 0.495, + "mean_token_accuracy": 0.9091488122940063, + "num_tokens": 176571751.0, + "step": 9166 + }, + { + "epoch": 0.9158299615365403, + "grad_norm": 0.45463092801476435, + "learning_rate": 1.8529971140534786e-07, + "loss": 0.485, + "mean_token_accuracy": 0.9106537103652954, + "num_tokens": 176653275.0, + "step": 9167 + }, + { + "epoch": 0.9159298666267046, + "grad_norm": 0.40336404436551204, + "learning_rate": 1.8486359710368496e-07, + "loss": 0.4907, + "mean_token_accuracy": 0.9088101983070374, + "num_tokens": 176734774.0, + "step": 9168 + }, + { + "epoch": 0.916029771716869, + "grad_norm": 0.4218221472439268, + "learning_rate": 1.8442798695151421e-07, + "loss": 0.4918, + "mean_token_accuracy": 0.9084711968898773, + "num_tokens": 176816225.0, + "step": 9169 + }, + { + "epoch": 0.9161296768070333, + "grad_norm": 0.6531980705131006, + "learning_rate": 1.8399288099444467e-07, + "loss": 0.4942, + "mean_token_accuracy": 0.907098799943924, + "num_tokens": 176897658.0, + "step": 9170 + }, + { + "epoch": 0.9162295818971976, + "grad_norm": 0.5721560801406466, + "learning_rate": 1.835582792780305e-07, + "loss": 0.4831, + "mean_token_accuracy": 0.9059367775917053, + "num_tokens": 176979272.0, + "step": 9171 + }, + { + "epoch": 0.916329486987362, + "grad_norm": 0.46241501651689104, + "learning_rate": 1.8312418184777748e-07, + "loss": 0.4904, + "mean_token_accuracy": 0.9095598459243774, + "num_tokens": 177060768.0, + "step": 9172 + }, + { + "epoch": 0.9164293920775264, + "grad_norm": 0.5600936884300122, + "learning_rate": 1.8269058874913314e-07, + "loss": 0.4923, + "mean_token_accuracy": 0.9088342785835266, + "num_tokens": 177142258.0, + "step": 9173 + }, + { + "epoch": 0.9165292971676907, + "grad_norm": 0.3807797163278651, + "learning_rate": 1.8225750002749842e-07, + "loss": 0.4819, + "mean_token_accuracy": 0.9100233316421509, + "num_tokens": 177223843.0, + "step": 9174 + }, + { + "epoch": 0.916629202257855, + "grad_norm": 0.7104828344499806, + "learning_rate": 1.8182491572821536e-07, + "loss": 0.4895, + "mean_token_accuracy": 0.9098189771175385, + "num_tokens": 177305395.0, + "step": 9175 + }, + { + "epoch": 0.9167291073480194, + "grad_norm": 1.3702780192961543, + "learning_rate": 1.813928358965772e-07, + "loss": 0.4819, + "mean_token_accuracy": 0.907956451177597, + "num_tokens": 177387019.0, + "step": 9176 + }, + { + "epoch": 0.9168290124381837, + "grad_norm": 0.5277272989365346, + "learning_rate": 1.8096126057782326e-07, + "loss": 0.4895, + "mean_token_accuracy": 0.9065672159194946, + "num_tokens": 177468527.0, + "step": 9177 + }, + { + "epoch": 0.916928917528348, + "grad_norm": 0.49884441047871764, + "learning_rate": 1.805301898171391e-07, + "loss": 0.488, + "mean_token_accuracy": 0.907881110906601, + "num_tokens": 177550069.0, + "step": 9178 + }, + { + "epoch": 0.9170288226185124, + "grad_norm": 0.41326110768264956, + "learning_rate": 1.800996236596586e-07, + "loss": 0.4792, + "mean_token_accuracy": 0.9120817482471466, + "num_tokens": 177631697.0, + "step": 9179 + }, + { + "epoch": 0.9171287277086767, + "grad_norm": 0.43063373078878064, + "learning_rate": 1.7966956215046293e-07, + "loss": 0.4923, + "mean_token_accuracy": 0.909684032201767, + "num_tokens": 177713160.0, + "step": 9180 + }, + { + "epoch": 0.9172286327988411, + "grad_norm": 1.3843576672696225, + "learning_rate": 1.7924000533457942e-07, + "loss": 0.4871, + "mean_token_accuracy": 0.9090996384620667, + "num_tokens": 177794719.0, + "step": 9181 + }, + { + "epoch": 0.9173285378890055, + "grad_norm": 0.4671020735990311, + "learning_rate": 1.7881095325698372e-07, + "loss": 0.4953, + "mean_token_accuracy": 0.9072317183017731, + "num_tokens": 177876143.0, + "step": 9182 + }, + { + "epoch": 0.9174284429791698, + "grad_norm": 0.4044766011593993, + "learning_rate": 1.7838240596259604e-07, + "loss": 0.4885, + "mean_token_accuracy": 0.9099518656730652, + "num_tokens": 177957630.0, + "step": 9183 + }, + { + "epoch": 0.9175283480693341, + "grad_norm": 0.5640769628238333, + "learning_rate": 1.779543634962888e-07, + "loss": 0.4883, + "mean_token_accuracy": 0.9067934155464172, + "num_tokens": 178039134.0, + "step": 9184 + }, + { + "epoch": 0.9176282531594985, + "grad_norm": 0.4779482382023491, + "learning_rate": 1.7752682590287674e-07, + "loss": 0.4874, + "mean_token_accuracy": 0.9094280302524567, + "num_tokens": 178120695.0, + "step": 9185 + }, + { + "epoch": 0.9177281582496628, + "grad_norm": 0.4548214934510262, + "learning_rate": 1.770997932271229e-07, + "loss": 0.4859, + "mean_token_accuracy": 0.9083956480026245, + "num_tokens": 178202244.0, + "step": 9186 + }, + { + "epoch": 0.9178280633398271, + "grad_norm": 0.40176153291033084, + "learning_rate": 1.7667326551373876e-07, + "loss": 0.4884, + "mean_token_accuracy": 0.9073463082313538, + "num_tokens": 178283751.0, + "step": 9187 + }, + { + "epoch": 0.9179279684299915, + "grad_norm": 0.4946456990173467, + "learning_rate": 1.7624724280738247e-07, + "loss": 0.4927, + "mean_token_accuracy": 0.9085221886634827, + "num_tokens": 178365193.0, + "step": 9188 + }, + { + "epoch": 0.9180278735201558, + "grad_norm": 0.683796246200463, + "learning_rate": 1.7582172515265837e-07, + "loss": 0.4843, + "mean_token_accuracy": 0.9092452228069305, + "num_tokens": 178446744.0, + "step": 9189 + }, + { + "epoch": 0.9181277786103202, + "grad_norm": 0.4408167335960811, + "learning_rate": 1.7539671259411916e-07, + "loss": 0.4882, + "mean_token_accuracy": 0.9096752107143402, + "num_tokens": 178528230.0, + "step": 9190 + }, + { + "epoch": 0.9182276837004846, + "grad_norm": 2.2333039538342216, + "learning_rate": 1.7497220517626368e-07, + "loss": 0.4862, + "mean_token_accuracy": 0.9100279211997986, + "num_tokens": 178609766.0, + "step": 9191 + }, + { + "epoch": 0.9183275887906489, + "grad_norm": 0.41233106544459996, + "learning_rate": 1.745482029435386e-07, + "loss": 0.4908, + "mean_token_accuracy": 0.9089048206806183, + "num_tokens": 178691229.0, + "step": 9192 + }, + { + "epoch": 0.9184274938808132, + "grad_norm": 0.5907434446308414, + "learning_rate": 1.741247059403367e-07, + "loss": 0.4879, + "mean_token_accuracy": 0.9086999893188477, + "num_tokens": 178772741.0, + "step": 9193 + }, + { + "epoch": 0.9185273989709776, + "grad_norm": 0.6917566640313446, + "learning_rate": 1.737017142109998e-07, + "loss": 0.4858, + "mean_token_accuracy": 0.9060759544372559, + "num_tokens": 178854324.0, + "step": 9194 + }, + { + "epoch": 0.9186273040611419, + "grad_norm": 0.6638790912505008, + "learning_rate": 1.7327922779981354e-07, + "loss": 0.4864, + "mean_token_accuracy": 0.909859299659729, + "num_tokens": 178935868.0, + "step": 9195 + }, + { + "epoch": 0.9187272091513062, + "grad_norm": 0.4048870604516236, + "learning_rate": 1.728572467510148e-07, + "loss": 0.4834, + "mean_token_accuracy": 0.9097892045974731, + "num_tokens": 179017441.0, + "step": 9196 + }, + { + "epoch": 0.9188271142414706, + "grad_norm": 0.40583061012666916, + "learning_rate": 1.724357711087843e-07, + "loss": 0.4821, + "mean_token_accuracy": 0.9096035659313202, + "num_tokens": 179099085.0, + "step": 9197 + }, + { + "epoch": 0.9189270193316349, + "grad_norm": 0.417116715201841, + "learning_rate": 1.720148009172512e-07, + "loss": 0.4943, + "mean_token_accuracy": 0.9067785441875458, + "num_tokens": 179180514.0, + "step": 9198 + }, + { + "epoch": 0.9190269244217993, + "grad_norm": 0.5518115626809857, + "learning_rate": 1.7159433622049082e-07, + "loss": 0.483, + "mean_token_accuracy": 0.907796174287796, + "num_tokens": 179262133.0, + "step": 9199 + }, + { + "epoch": 0.9191268295119637, + "grad_norm": 0.4971532299752873, + "learning_rate": 1.7117437706252738e-07, + "loss": 0.4886, + "mean_token_accuracy": 0.9094298779964447, + "num_tokens": 179343617.0, + "step": 9200 + }, + { + "epoch": 0.919226734602128, + "grad_norm": 0.46110586641546086, + "learning_rate": 1.7075492348733014e-07, + "loss": 0.4931, + "mean_token_accuracy": 0.9050517082214355, + "num_tokens": 179425083.0, + "step": 9201 + }, + { + "epoch": 0.9193266396922923, + "grad_norm": 0.49174958392629403, + "learning_rate": 1.7033597553881676e-07, + "loss": 0.4852, + "mean_token_accuracy": 0.9074353575706482, + "num_tokens": 179506607.0, + "step": 9202 + }, + { + "epoch": 0.9194265447824567, + "grad_norm": 0.4115778005168427, + "learning_rate": 1.6991753326085158e-07, + "loss": 0.4874, + "mean_token_accuracy": 0.9076738357543945, + "num_tokens": 179588163.0, + "step": 9203 + }, + { + "epoch": 0.919526449872621, + "grad_norm": 0.41088592244299355, + "learning_rate": 1.694995966972457e-07, + "loss": 0.4813, + "mean_token_accuracy": 0.908581018447876, + "num_tokens": 179669752.0, + "step": 9204 + }, + { + "epoch": 0.9196263549627853, + "grad_norm": 0.47484971642363, + "learning_rate": 1.6908216589175686e-07, + "loss": 0.4918, + "mean_token_accuracy": 0.907447874546051, + "num_tokens": 179751247.0, + "step": 9205 + }, + { + "epoch": 0.9197262600529497, + "grad_norm": 0.4685152372741542, + "learning_rate": 1.6866524088809176e-07, + "loss": 0.4895, + "mean_token_accuracy": 0.9098647236824036, + "num_tokens": 179832727.0, + "step": 9206 + }, + { + "epoch": 0.919826165143114, + "grad_norm": 0.4280343603283052, + "learning_rate": 1.682488217299022e-07, + "loss": 0.492, + "mean_token_accuracy": 0.9047538340091705, + "num_tokens": 179914179.0, + "step": 9207 + }, + { + "epoch": 0.9199260702332784, + "grad_norm": 0.6561649545413166, + "learning_rate": 1.6783290846078714e-07, + "loss": 0.4867, + "mean_token_accuracy": 0.9081147611141205, + "num_tokens": 179995698.0, + "step": 9208 + }, + { + "epoch": 0.9200259753234428, + "grad_norm": 0.4896445310933717, + "learning_rate": 1.67417501124294e-07, + "loss": 0.4857, + "mean_token_accuracy": 0.9092079401016235, + "num_tokens": 180077255.0, + "step": 9209 + }, + { + "epoch": 0.9201258804136071, + "grad_norm": 0.468603387610648, + "learning_rate": 1.6700259976391575e-07, + "loss": 0.4928, + "mean_token_accuracy": 0.9065096080303192, + "num_tokens": 180158703.0, + "step": 9210 + }, + { + "epoch": 0.9202257855037714, + "grad_norm": 0.5281419943163705, + "learning_rate": 1.665882044230932e-07, + "loss": 0.4851, + "mean_token_accuracy": 0.9096082448959351, + "num_tokens": 180240241.0, + "step": 9211 + }, + { + "epoch": 0.9203256905939358, + "grad_norm": 0.46127733457804454, + "learning_rate": 1.6617431514521387e-07, + "loss": 0.4914, + "mean_token_accuracy": 0.9059768915176392, + "num_tokens": 180321718.0, + "step": 9212 + }, + { + "epoch": 0.9204255956841001, + "grad_norm": 0.5125782663177938, + "learning_rate": 1.6576093197361253e-07, + "loss": 0.4898, + "mean_token_accuracy": 0.9065872132778168, + "num_tokens": 180403237.0, + "step": 9213 + }, + { + "epoch": 0.9205255007742644, + "grad_norm": 0.466332849019182, + "learning_rate": 1.6534805495157126e-07, + "loss": 0.4924, + "mean_token_accuracy": 0.9061771631240845, + "num_tokens": 180484735.0, + "step": 9214 + }, + { + "epoch": 0.9206254058644288, + "grad_norm": 0.5198925345909939, + "learning_rate": 1.649356841223171e-07, + "loss": 0.492, + "mean_token_accuracy": 0.9075772762298584, + "num_tokens": 180566206.0, + "step": 9215 + }, + { + "epoch": 0.9207253109545931, + "grad_norm": 0.4375747512388713, + "learning_rate": 1.645238195290272e-07, + "loss": 0.492, + "mean_token_accuracy": 0.905984491109848, + "num_tokens": 180647665.0, + "step": 9216 + }, + { + "epoch": 0.9208252160447575, + "grad_norm": 0.3875023393020887, + "learning_rate": 1.6411246121482316e-07, + "loss": 0.4821, + "mean_token_accuracy": 0.9096975326538086, + "num_tokens": 180729250.0, + "step": 9217 + }, + { + "epoch": 0.9209251211349219, + "grad_norm": 0.44803565119902766, + "learning_rate": 1.6370160922277613e-07, + "loss": 0.4927, + "mean_token_accuracy": 0.9084159135818481, + "num_tokens": 180810671.0, + "step": 9218 + }, + { + "epoch": 0.9210250262250862, + "grad_norm": 0.45769786633049925, + "learning_rate": 1.6329126359590108e-07, + "loss": 0.4887, + "mean_token_accuracy": 0.906061202287674, + "num_tokens": 180892166.0, + "step": 9219 + }, + { + "epoch": 0.9211249313152505, + "grad_norm": 0.40029224570367583, + "learning_rate": 1.6288142437716259e-07, + "loss": 0.4857, + "mean_token_accuracy": 0.9118313491344452, + "num_tokens": 180973681.0, + "step": 9220 + }, + { + "epoch": 0.9212248364054149, + "grad_norm": 0.4060783185544749, + "learning_rate": 1.6247209160947074e-07, + "loss": 0.4794, + "mean_token_accuracy": 0.9112387001514435, + "num_tokens": 181055300.0, + "step": 9221 + }, + { + "epoch": 0.9213247414955792, + "grad_norm": 0.721245281179511, + "learning_rate": 1.6206326533568296e-07, + "loss": 0.4884, + "mean_token_accuracy": 0.9088598489761353, + "num_tokens": 181136805.0, + "step": 9222 + }, + { + "epoch": 0.9214246465857435, + "grad_norm": 0.4233141769176798, + "learning_rate": 1.6165494559860383e-07, + "loss": 0.4834, + "mean_token_accuracy": 0.9077424108982086, + "num_tokens": 181218368.0, + "step": 9223 + }, + { + "epoch": 0.9215245516759079, + "grad_norm": 0.5263234862904078, + "learning_rate": 1.6124713244098533e-07, + "loss": 0.4884, + "mean_token_accuracy": 0.9074606001377106, + "num_tokens": 181299905.0, + "step": 9224 + }, + { + "epoch": 0.9216244567660722, + "grad_norm": 0.6531605494523501, + "learning_rate": 1.608398259055255e-07, + "loss": 0.4923, + "mean_token_accuracy": 0.9061931073665619, + "num_tokens": 181381371.0, + "step": 9225 + }, + { + "epoch": 0.9217243618562366, + "grad_norm": 0.45669919557765104, + "learning_rate": 1.6043302603487076e-07, + "loss": 0.4853, + "mean_token_accuracy": 0.9067661166191101, + "num_tokens": 181462892.0, + "step": 9226 + }, + { + "epoch": 0.921824266946401, + "grad_norm": 0.42641304444638656, + "learning_rate": 1.6002673287161097e-07, + "loss": 0.4857, + "mean_token_accuracy": 0.9052527844905853, + "num_tokens": 181544417.0, + "step": 9227 + }, + { + "epoch": 0.9219241720365653, + "grad_norm": 0.46134560859517054, + "learning_rate": 1.5962094645828764e-07, + "loss": 0.4943, + "mean_token_accuracy": 0.9066403806209564, + "num_tokens": 181625865.0, + "step": 9228 + }, + { + "epoch": 0.9220240771267296, + "grad_norm": 0.47986954727652054, + "learning_rate": 1.5921566683738566e-07, + "loss": 0.4812, + "mean_token_accuracy": 0.9068242907524109, + "num_tokens": 181707477.0, + "step": 9229 + }, + { + "epoch": 0.922123982216894, + "grad_norm": 0.8803500124533871, + "learning_rate": 1.5881089405133998e-07, + "loss": 0.4905, + "mean_token_accuracy": 0.9068374335765839, + "num_tokens": 181788960.0, + "step": 9230 + }, + { + "epoch": 0.9222238873070583, + "grad_norm": 0.4389912077606929, + "learning_rate": 1.5840662814252893e-07, + "loss": 0.4917, + "mean_token_accuracy": 0.905593603849411, + "num_tokens": 181870427.0, + "step": 9231 + }, + { + "epoch": 0.9223237923972226, + "grad_norm": 0.5065398218859778, + "learning_rate": 1.5800286915328034e-07, + "loss": 0.4846, + "mean_token_accuracy": 0.9082672297954559, + "num_tokens": 181952054.0, + "step": 9232 + }, + { + "epoch": 0.922423697487387, + "grad_norm": 0.46583327946518266, + "learning_rate": 1.5759961712586814e-07, + "loss": 0.4889, + "mean_token_accuracy": 0.9076125919818878, + "num_tokens": 182033572.0, + "step": 9233 + }, + { + "epoch": 0.9225236025775513, + "grad_norm": 0.4360679591210777, + "learning_rate": 1.57196872102513e-07, + "loss": 0.4857, + "mean_token_accuracy": 0.9085051119327545, + "num_tokens": 182115144.0, + "step": 9234 + }, + { + "epoch": 0.9226235076677157, + "grad_norm": 0.48544167299250457, + "learning_rate": 1.567946341253823e-07, + "loss": 0.4915, + "mean_token_accuracy": 0.9065068662166595, + "num_tokens": 182196572.0, + "step": 9235 + }, + { + "epoch": 0.9227234127578801, + "grad_norm": 0.4469781557215338, + "learning_rate": 1.563929032365924e-07, + "loss": 0.4871, + "mean_token_accuracy": 0.9103035032749176, + "num_tokens": 182278089.0, + "step": 9236 + }, + { + "epoch": 0.9228233178480444, + "grad_norm": 0.45259737415459567, + "learning_rate": 1.5599167947820236e-07, + "loss": 0.4865, + "mean_token_accuracy": 0.9075913429260254, + "num_tokens": 182359604.0, + "step": 9237 + }, + { + "epoch": 0.9229232229382087, + "grad_norm": 0.4332019216770616, + "learning_rate": 1.5559096289222364e-07, + "loss": 0.4864, + "mean_token_accuracy": 0.9086812138557434, + "num_tokens": 182441143.0, + "step": 9238 + }, + { + "epoch": 0.923023128028373, + "grad_norm": 0.6132344381212252, + "learning_rate": 1.5519075352060874e-07, + "loss": 0.4865, + "mean_token_accuracy": 0.9066788256168365, + "num_tokens": 182522688.0, + "step": 9239 + }, + { + "epoch": 0.9231230331185374, + "grad_norm": 0.5793835268046104, + "learning_rate": 1.5479105140526252e-07, + "loss": 0.4903, + "mean_token_accuracy": 0.9071809649467468, + "num_tokens": 182604163.0, + "step": 9240 + }, + { + "epoch": 0.9232229382087017, + "grad_norm": 0.6095379639729732, + "learning_rate": 1.5439185658803256e-07, + "loss": 0.4905, + "mean_token_accuracy": 0.9082092642784119, + "num_tokens": 182685633.0, + "step": 9241 + }, + { + "epoch": 0.923322843298866, + "grad_norm": 0.5354791545666803, + "learning_rate": 1.5399316911071493e-07, + "loss": 0.4897, + "mean_token_accuracy": 0.9065687954425812, + "num_tokens": 182767158.0, + "step": 9242 + }, + { + "epoch": 0.9234227483890304, + "grad_norm": 0.45786605355919163, + "learning_rate": 1.5359498901505342e-07, + "loss": 0.4912, + "mean_token_accuracy": 0.9084268808364868, + "num_tokens": 182848658.0, + "step": 9243 + }, + { + "epoch": 0.9235226534791947, + "grad_norm": 0.7619090775668435, + "learning_rate": 1.5319731634273804e-07, + "loss": 0.4917, + "mean_token_accuracy": 0.9080530107021332, + "num_tokens": 182930118.0, + "step": 9244 + }, + { + "epoch": 0.9236225585693592, + "grad_norm": 0.43433494960461655, + "learning_rate": 1.5280015113540437e-07, + "loss": 0.4865, + "mean_token_accuracy": 0.90865758061409, + "num_tokens": 183011660.0, + "step": 9245 + }, + { + "epoch": 0.9237224636595235, + "grad_norm": 0.48824470916404755, + "learning_rate": 1.5240349343463688e-07, + "loss": 0.4926, + "mean_token_accuracy": 0.9073526263237, + "num_tokens": 183093101.0, + "step": 9246 + }, + { + "epoch": 0.9238223687496878, + "grad_norm": 0.49142527237591305, + "learning_rate": 1.5200734328196575e-07, + "loss": 0.4941, + "mean_token_accuracy": 0.9079068899154663, + "num_tokens": 183174525.0, + "step": 9247 + }, + { + "epoch": 0.9239222738398521, + "grad_norm": 0.44843296635937197, + "learning_rate": 1.5161170071886889e-07, + "loss": 0.4825, + "mean_token_accuracy": 0.9111439883708954, + "num_tokens": 183256091.0, + "step": 9248 + }, + { + "epoch": 0.9240221789300165, + "grad_norm": 0.5526390086984302, + "learning_rate": 1.5121656578676868e-07, + "loss": 0.4878, + "mean_token_accuracy": 0.9088256359100342, + "num_tokens": 183337613.0, + "step": 9249 + }, + { + "epoch": 0.9241220840201808, + "grad_norm": 0.49833145947070484, + "learning_rate": 1.5082193852703874e-07, + "loss": 0.4868, + "mean_token_accuracy": 0.9092201590538025, + "num_tokens": 183419115.0, + "step": 9250 + }, + { + "epoch": 0.9242219891103451, + "grad_norm": 0.40096283895868023, + "learning_rate": 1.5042781898099433e-07, + "loss": 0.483, + "mean_token_accuracy": 0.9071786105632782, + "num_tokens": 183500671.0, + "step": 9251 + }, + { + "epoch": 0.9243218942005095, + "grad_norm": 0.40199794066122696, + "learning_rate": 1.5003420718990247e-07, + "loss": 0.4876, + "mean_token_accuracy": 0.9094644784927368, + "num_tokens": 183582180.0, + "step": 9252 + }, + { + "epoch": 0.9244217992906738, + "grad_norm": 0.4254518957461577, + "learning_rate": 1.4964110319497294e-07, + "loss": 0.4899, + "mean_token_accuracy": 0.9081852436065674, + "num_tokens": 183663647.0, + "step": 9253 + }, + { + "epoch": 0.9245217043808382, + "grad_norm": 0.5153350102732088, + "learning_rate": 1.4924850703736448e-07, + "loss": 0.4904, + "mean_token_accuracy": 0.9062866568565369, + "num_tokens": 183745191.0, + "step": 9254 + }, + { + "epoch": 0.9246216094710026, + "grad_norm": 0.5033052525034206, + "learning_rate": 1.4885641875818314e-07, + "loss": 0.4887, + "mean_token_accuracy": 0.9056989848613739, + "num_tokens": 183826700.0, + "step": 9255 + }, + { + "epoch": 0.9247215145611669, + "grad_norm": 0.5558087737868349, + "learning_rate": 1.4846483839847992e-07, + "loss": 0.4923, + "mean_token_accuracy": 0.9074465334415436, + "num_tokens": 183908149.0, + "step": 9256 + }, + { + "epoch": 0.9248214196513312, + "grad_norm": 0.47790797510641514, + "learning_rate": 1.4807376599925427e-07, + "loss": 0.4931, + "mean_token_accuracy": 0.9065232276916504, + "num_tokens": 183989600.0, + "step": 9257 + }, + { + "epoch": 0.9249213247414956, + "grad_norm": 0.4597261670366281, + "learning_rate": 1.4768320160145179e-07, + "loss": 0.4813, + "mean_token_accuracy": 0.911343902349472, + "num_tokens": 184071173.0, + "step": 9258 + }, + { + "epoch": 0.9250212298316599, + "grad_norm": 0.399583068190474, + "learning_rate": 1.4729314524596473e-07, + "loss": 0.4828, + "mean_token_accuracy": 0.9075961112976074, + "num_tokens": 184152755.0, + "step": 9259 + }, + { + "epoch": 0.9251211349218242, + "grad_norm": 0.4503802563627616, + "learning_rate": 1.469035969736332e-07, + "loss": 0.4883, + "mean_token_accuracy": 0.9079181849956512, + "num_tokens": 184234257.0, + "step": 9260 + }, + { + "epoch": 0.9252210400119886, + "grad_norm": 0.4337278728140227, + "learning_rate": 1.465145568252413e-07, + "loss": 0.4893, + "mean_token_accuracy": 0.9080163538455963, + "num_tokens": 184315768.0, + "step": 9261 + }, + { + "epoch": 0.9253209451021529, + "grad_norm": 0.39290098580936594, + "learning_rate": 1.4612602484152472e-07, + "loss": 0.4868, + "mean_token_accuracy": 0.9072727262973785, + "num_tokens": 184397282.0, + "step": 9262 + }, + { + "epoch": 0.9254208501923173, + "grad_norm": 0.5821370430045187, + "learning_rate": 1.4573800106316093e-07, + "loss": 0.493, + "mean_token_accuracy": 0.905328094959259, + "num_tokens": 184478771.0, + "step": 9263 + }, + { + "epoch": 0.9255207552824817, + "grad_norm": 0.4103862243605954, + "learning_rate": 1.4535048553077692e-07, + "loss": 0.4897, + "mean_token_accuracy": 0.908450573682785, + "num_tokens": 184560239.0, + "step": 9264 + }, + { + "epoch": 0.925620660372646, + "grad_norm": 0.5312064947594364, + "learning_rate": 1.449634782849463e-07, + "loss": 0.4853, + "mean_token_accuracy": 0.9091659486293793, + "num_tokens": 184641789.0, + "step": 9265 + }, + { + "epoch": 0.9257205654628103, + "grad_norm": 0.3984979528397973, + "learning_rate": 1.4457697936618887e-07, + "loss": 0.4817, + "mean_token_accuracy": 0.9106059968471527, + "num_tokens": 184723352.0, + "step": 9266 + }, + { + "epoch": 0.9258204705529747, + "grad_norm": 0.43773324737778013, + "learning_rate": 1.4419098881497172e-07, + "loss": 0.4805, + "mean_token_accuracy": 0.9094973802566528, + "num_tokens": 184804967.0, + "step": 9267 + }, + { + "epoch": 0.925920375643139, + "grad_norm": 0.42434381921134007, + "learning_rate": 1.4380550667170757e-07, + "loss": 0.4834, + "mean_token_accuracy": 0.913523405790329, + "num_tokens": 184886541.0, + "step": 9268 + }, + { + "epoch": 0.9260202807333033, + "grad_norm": 0.4383905893124883, + "learning_rate": 1.4342053297675794e-07, + "loss": 0.4904, + "mean_token_accuracy": 0.9096437096595764, + "num_tokens": 184968011.0, + "step": 9269 + }, + { + "epoch": 0.9261201858234677, + "grad_norm": 0.42375581301782395, + "learning_rate": 1.430360677704301e-07, + "loss": 0.4882, + "mean_token_accuracy": 0.9055374562740326, + "num_tokens": 185049510.0, + "step": 9270 + }, + { + "epoch": 0.926220090913632, + "grad_norm": 0.5833194254379106, + "learning_rate": 1.4265211109297627e-07, + "loss": 0.4841, + "mean_token_accuracy": 0.909438282251358, + "num_tokens": 185131096.0, + "step": 9271 + }, + { + "epoch": 0.9263199960037964, + "grad_norm": 0.5824713100272156, + "learning_rate": 1.4226866298459873e-07, + "loss": 0.4905, + "mean_token_accuracy": 0.9073399007320404, + "num_tokens": 185212568.0, + "step": 9272 + }, + { + "epoch": 0.9264199010939608, + "grad_norm": 0.47266033123652995, + "learning_rate": 1.4188572348544316e-07, + "loss": 0.4855, + "mean_token_accuracy": 0.9070958197116852, + "num_tokens": 185294139.0, + "step": 9273 + }, + { + "epoch": 0.9265198061841251, + "grad_norm": 0.40198928972349757, + "learning_rate": 1.4150329263560637e-07, + "loss": 0.4842, + "mean_token_accuracy": 0.9085531234741211, + "num_tokens": 185375694.0, + "step": 9274 + }, + { + "epoch": 0.9266197112742894, + "grad_norm": 0.40643205208813177, + "learning_rate": 1.4112137047512686e-07, + "loss": 0.4929, + "mean_token_accuracy": 0.9084010124206543, + "num_tokens": 185457170.0, + "step": 9275 + }, + { + "epoch": 0.9267196163644538, + "grad_norm": 0.47232234405371615, + "learning_rate": 1.4073995704399267e-07, + "loss": 0.4892, + "mean_token_accuracy": 0.9064986109733582, + "num_tokens": 185538701.0, + "step": 9276 + }, + { + "epoch": 0.9268195214546181, + "grad_norm": 1.037432953126705, + "learning_rate": 1.403590523821391e-07, + "loss": 0.4886, + "mean_token_accuracy": 0.9105291664600372, + "num_tokens": 185620216.0, + "step": 9277 + }, + { + "epoch": 0.9269194265447824, + "grad_norm": 0.4123862624144621, + "learning_rate": 1.3997865652944642e-07, + "loss": 0.4936, + "mean_token_accuracy": 0.9052873253822327, + "num_tokens": 185701638.0, + "step": 9278 + }, + { + "epoch": 0.9270193316349468, + "grad_norm": 0.43960229563326464, + "learning_rate": 1.395987695257428e-07, + "loss": 0.4803, + "mean_token_accuracy": 0.9098560512065887, + "num_tokens": 185783281.0, + "step": 9279 + }, + { + "epoch": 0.9271192367251111, + "grad_norm": 0.5163196544941482, + "learning_rate": 1.3921939141080255e-07, + "loss": 0.4946, + "mean_token_accuracy": 0.9088525176048279, + "num_tokens": 185864718.0, + "step": 9280 + }, + { + "epoch": 0.9272191418152755, + "grad_norm": 0.5135376807610532, + "learning_rate": 1.388405222243472e-07, + "loss": 0.4861, + "mean_token_accuracy": 0.9117270708084106, + "num_tokens": 185946258.0, + "step": 9281 + }, + { + "epoch": 0.9273190469054399, + "grad_norm": 0.43985130958154944, + "learning_rate": 1.38462162006045e-07, + "loss": 0.4846, + "mean_token_accuracy": 0.9104128181934357, + "num_tokens": 186027830.0, + "step": 9282 + }, + { + "epoch": 0.9274189519956042, + "grad_norm": 0.4379922377660512, + "learning_rate": 1.3808431079550933e-07, + "loss": 0.4926, + "mean_token_accuracy": 0.9073959290981293, + "num_tokens": 186109241.0, + "step": 9283 + }, + { + "epoch": 0.9275188570857685, + "grad_norm": 0.4747382816315204, + "learning_rate": 1.3770696863230347e-07, + "loss": 0.4917, + "mean_token_accuracy": 0.9076738953590393, + "num_tokens": 186190679.0, + "step": 9284 + }, + { + "epoch": 0.9276187621759329, + "grad_norm": 0.5250817131722813, + "learning_rate": 1.373301355559342e-07, + "loss": 0.4893, + "mean_token_accuracy": 0.9083004891872406, + "num_tokens": 186272176.0, + "step": 9285 + }, + { + "epoch": 0.9277186672660972, + "grad_norm": 0.6396754710709844, + "learning_rate": 1.3695381160585665e-07, + "loss": 0.4848, + "mean_token_accuracy": 0.9079844951629639, + "num_tokens": 186353766.0, + "step": 9286 + }, + { + "epoch": 0.9278185723562615, + "grad_norm": 0.5285364350212415, + "learning_rate": 1.36577996821472e-07, + "loss": 0.4924, + "mean_token_accuracy": 0.9062432646751404, + "num_tokens": 186435210.0, + "step": 9287 + }, + { + "epoch": 0.9279184774464259, + "grad_norm": 1.5909768361321512, + "learning_rate": 1.3620269124212882e-07, + "loss": 0.4887, + "mean_token_accuracy": 0.9085366129875183, + "num_tokens": 186516763.0, + "step": 9288 + }, + { + "epoch": 0.9280183825365902, + "grad_norm": 0.5147495935266098, + "learning_rate": 1.3582789490712179e-07, + "loss": 0.4876, + "mean_token_accuracy": 0.9084225594997406, + "num_tokens": 186598272.0, + "step": 9289 + }, + { + "epoch": 0.9281182876267546, + "grad_norm": 0.42431065599011825, + "learning_rate": 1.354536078556923e-07, + "loss": 0.4932, + "mean_token_accuracy": 0.9065579771995544, + "num_tokens": 186679745.0, + "step": 9290 + }, + { + "epoch": 0.928218192716919, + "grad_norm": 0.44901213113553606, + "learning_rate": 1.350798301270295e-07, + "loss": 0.4884, + "mean_token_accuracy": 0.9092753827571869, + "num_tokens": 186761252.0, + "step": 9291 + }, + { + "epoch": 0.9283180978070833, + "grad_norm": 0.46928453550950905, + "learning_rate": 1.347065617602672e-07, + "loss": 0.4831, + "mean_token_accuracy": 0.9084386825561523, + "num_tokens": 186842796.0, + "step": 9292 + }, + { + "epoch": 0.9284180028972476, + "grad_norm": 0.4672617134115694, + "learning_rate": 1.3433380279448682e-07, + "loss": 0.4907, + "mean_token_accuracy": 0.9101945459842682, + "num_tokens": 186924272.0, + "step": 9293 + }, + { + "epoch": 0.928517907987412, + "grad_norm": 0.499217330555886, + "learning_rate": 1.339615532687183e-07, + "loss": 0.4817, + "mean_token_accuracy": 0.9115666449069977, + "num_tokens": 187005868.0, + "step": 9294 + }, + { + "epoch": 0.9286178130775763, + "grad_norm": 0.4620591139077648, + "learning_rate": 1.335898132219343e-07, + "loss": 0.4907, + "mean_token_accuracy": 0.9113054573535919, + "num_tokens": 187087346.0, + "step": 9295 + }, + { + "epoch": 0.9287177181677406, + "grad_norm": 0.4315368087100429, + "learning_rate": 1.332185826930582e-07, + "loss": 0.4865, + "mean_token_accuracy": 0.9097703993320465, + "num_tokens": 187168862.0, + "step": 9296 + }, + { + "epoch": 0.928817623257905, + "grad_norm": 0.5033562902419131, + "learning_rate": 1.328478617209572e-07, + "loss": 0.4842, + "mean_token_accuracy": 0.9095306098461151, + "num_tokens": 187250465.0, + "step": 9297 + }, + { + "epoch": 0.9289175283480693, + "grad_norm": 0.5436843794667457, + "learning_rate": 1.3247765034444582e-07, + "loss": 0.4959, + "mean_token_accuracy": 0.9086712598800659, + "num_tokens": 187331910.0, + "step": 9298 + }, + { + "epoch": 0.9290174334382337, + "grad_norm": 0.4457430469024507, + "learning_rate": 1.3210794860228694e-07, + "loss": 0.487, + "mean_token_accuracy": 0.9087370038032532, + "num_tokens": 187413415.0, + "step": 9299 + }, + { + "epoch": 0.9291173385283981, + "grad_norm": 0.431442379546773, + "learning_rate": 1.3173875653318736e-07, + "loss": 0.4888, + "mean_token_accuracy": 0.9103135764598846, + "num_tokens": 187494891.0, + "step": 9300 + }, + { + "epoch": 0.9292172436185624, + "grad_norm": 0.4538126505244113, + "learning_rate": 1.3137007417580227e-07, + "loss": 0.4869, + "mean_token_accuracy": 0.9062821865081787, + "num_tokens": 187576403.0, + "step": 9301 + }, + { + "epoch": 0.9293171487087267, + "grad_norm": 0.45625777779120147, + "learning_rate": 1.3100190156873306e-07, + "loss": 0.4905, + "mean_token_accuracy": 0.9072687327861786, + "num_tokens": 187657893.0, + "step": 9302 + }, + { + "epoch": 0.9294170537988911, + "grad_norm": 0.5570492027845192, + "learning_rate": 1.3063423875052772e-07, + "loss": 0.4873, + "mean_token_accuracy": 0.9108878672122955, + "num_tokens": 187739438.0, + "step": 9303 + }, + { + "epoch": 0.9295169588890554, + "grad_norm": 0.48504620712500673, + "learning_rate": 1.3026708575968217e-07, + "loss": 0.4943, + "mean_token_accuracy": 0.9044283628463745, + "num_tokens": 187820886.0, + "step": 9304 + }, + { + "epoch": 0.9296168639792197, + "grad_norm": 0.47521303834810497, + "learning_rate": 1.2990044263463508e-07, + "loss": 0.4826, + "mean_token_accuracy": 0.906231164932251, + "num_tokens": 187902522.0, + "step": 9305 + }, + { + "epoch": 0.929716769069384, + "grad_norm": 0.4278992142612925, + "learning_rate": 1.2953430941377687e-07, + "loss": 0.4871, + "mean_token_accuracy": 0.9068869054317474, + "num_tokens": 187984036.0, + "step": 9306 + }, + { + "epoch": 0.9298166741595484, + "grad_norm": 0.4511984309407676, + "learning_rate": 1.2916868613544076e-07, + "loss": 0.4945, + "mean_token_accuracy": 0.9050352871417999, + "num_tokens": 188065446.0, + "step": 9307 + }, + { + "epoch": 0.9299165792497128, + "grad_norm": 0.7257892261458494, + "learning_rate": 1.2880357283790778e-07, + "loss": 0.4862, + "mean_token_accuracy": 0.9085132479667664, + "num_tokens": 188146988.0, + "step": 9308 + }, + { + "epoch": 0.9300164843398772, + "grad_norm": 0.44239789460724527, + "learning_rate": 1.284389695594057e-07, + "loss": 0.4893, + "mean_token_accuracy": 0.9075508117675781, + "num_tokens": 188228478.0, + "step": 9309 + }, + { + "epoch": 0.9301163894300415, + "grad_norm": 0.4215393897557795, + "learning_rate": 1.2807487633810955e-07, + "loss": 0.4922, + "mean_token_accuracy": 0.9076575040817261, + "num_tokens": 188309933.0, + "step": 9310 + }, + { + "epoch": 0.9302162945202058, + "grad_norm": 0.5018001718032218, + "learning_rate": 1.2771129321213993e-07, + "loss": 0.4881, + "mean_token_accuracy": 0.9088895320892334, + "num_tokens": 188391463.0, + "step": 9311 + }, + { + "epoch": 0.9303161996103702, + "grad_norm": 0.4364071032439802, + "learning_rate": 1.273482202195636e-07, + "loss": 0.4957, + "mean_token_accuracy": 0.9074607491493225, + "num_tokens": 188472871.0, + "step": 9312 + }, + { + "epoch": 0.9304161047005345, + "grad_norm": 0.47055039479195815, + "learning_rate": 1.2698565739839575e-07, + "loss": 0.4859, + "mean_token_accuracy": 0.9083445072174072, + "num_tokens": 188554438.0, + "step": 9313 + }, + { + "epoch": 0.9305160097906988, + "grad_norm": 0.6127122966801419, + "learning_rate": 1.2662360478659707e-07, + "loss": 0.4927, + "mean_token_accuracy": 0.9050380885601044, + "num_tokens": 188635892.0, + "step": 9314 + }, + { + "epoch": 0.9306159148808631, + "grad_norm": 0.46974053845770175, + "learning_rate": 1.262620624220734e-07, + "loss": 0.4871, + "mean_token_accuracy": 0.9091389179229736, + "num_tokens": 188717429.0, + "step": 9315 + }, + { + "epoch": 0.9307158199710275, + "grad_norm": 0.4712012909839568, + "learning_rate": 1.2590103034268053e-07, + "loss": 0.4918, + "mean_token_accuracy": 0.9097053408622742, + "num_tokens": 188798906.0, + "step": 9316 + }, + { + "epoch": 0.9308157250611919, + "grad_norm": 0.46096919691895943, + "learning_rate": 1.255405085862166e-07, + "loss": 0.485, + "mean_token_accuracy": 0.9090189933776855, + "num_tokens": 188880502.0, + "step": 9317 + }, + { + "epoch": 0.9309156301513563, + "grad_norm": 0.48135693517825084, + "learning_rate": 1.2518049719043078e-07, + "loss": 0.4946, + "mean_token_accuracy": 0.9070186913013458, + "num_tokens": 188961957.0, + "step": 9318 + }, + { + "epoch": 0.9310155352415206, + "grad_norm": 0.36463415844096425, + "learning_rate": 1.248209961930158e-07, + "loss": 0.4855, + "mean_token_accuracy": 0.9080676436424255, + "num_tokens": 189043481.0, + "step": 9319 + }, + { + "epoch": 0.9311154403316849, + "grad_norm": 0.6589197325581817, + "learning_rate": 1.2446200563161093e-07, + "loss": 0.4876, + "mean_token_accuracy": 0.9064081609249115, + "num_tokens": 189124983.0, + "step": 9320 + }, + { + "epoch": 0.9312153454218492, + "grad_norm": 0.4699485835052215, + "learning_rate": 1.241035255438039e-07, + "loss": 0.4876, + "mean_token_accuracy": 0.9064609110355377, + "num_tokens": 189206534.0, + "step": 9321 + }, + { + "epoch": 0.9313152505120136, + "grad_norm": 0.4392249087183497, + "learning_rate": 1.23745555967128e-07, + "loss": 0.4837, + "mean_token_accuracy": 0.9095361828804016, + "num_tokens": 189288093.0, + "step": 9322 + }, + { + "epoch": 0.9314151556021779, + "grad_norm": 0.6211935411755555, + "learning_rate": 1.2338809693906162e-07, + "loss": 0.4901, + "mean_token_accuracy": 0.9077247083187103, + "num_tokens": 189369568.0, + "step": 9323 + }, + { + "epoch": 0.9315150606923422, + "grad_norm": 0.5185367353446607, + "learning_rate": 1.2303114849703257e-07, + "loss": 0.4865, + "mean_token_accuracy": 0.9065137803554535, + "num_tokens": 189451134.0, + "step": 9324 + }, + { + "epoch": 0.9316149657825066, + "grad_norm": 0.5050526646736486, + "learning_rate": 1.2267471067841318e-07, + "loss": 0.4967, + "mean_token_accuracy": 0.9097047448158264, + "num_tokens": 189532565.0, + "step": 9325 + }, + { + "epoch": 0.9317148708726709, + "grad_norm": 0.45942328330399884, + "learning_rate": 1.2231878352052362e-07, + "loss": 0.4862, + "mean_token_accuracy": 0.9088730812072754, + "num_tokens": 189614114.0, + "step": 9326 + }, + { + "epoch": 0.9318147759628354, + "grad_norm": 0.571678077133573, + "learning_rate": 1.219633670606274e-07, + "loss": 0.4928, + "mean_token_accuracy": 0.9079931080341339, + "num_tokens": 189695586.0, + "step": 9327 + }, + { + "epoch": 0.9319146810529997, + "grad_norm": 0.5200727216083803, + "learning_rate": 1.2160846133593974e-07, + "loss": 0.4865, + "mean_token_accuracy": 0.9079350531101227, + "num_tokens": 189777120.0, + "step": 9328 + }, + { + "epoch": 0.932014586143164, + "grad_norm": 0.5325712122883418, + "learning_rate": 1.2125406638361813e-07, + "loss": 0.4904, + "mean_token_accuracy": 0.9058091342449188, + "num_tokens": 189858630.0, + "step": 9329 + }, + { + "epoch": 0.9321144912333283, + "grad_norm": 0.5152828954555662, + "learning_rate": 1.2090018224076848e-07, + "loss": 0.4847, + "mean_token_accuracy": 0.9078179597854614, + "num_tokens": 189940197.0, + "step": 9330 + }, + { + "epoch": 0.9322143963234927, + "grad_norm": 0.4449090894520333, + "learning_rate": 1.2054680894444281e-07, + "loss": 0.485, + "mean_token_accuracy": 0.9077323973178864, + "num_tokens": 190021766.0, + "step": 9331 + }, + { + "epoch": 0.932314301413657, + "grad_norm": 0.4465106739074029, + "learning_rate": 1.201939465316393e-07, + "loss": 0.4839, + "mean_token_accuracy": 0.9116365313529968, + "num_tokens": 190103310.0, + "step": 9332 + }, + { + "epoch": 0.9324142065038213, + "grad_norm": 0.4796615042165938, + "learning_rate": 1.1984159503930337e-07, + "loss": 0.4909, + "mean_token_accuracy": 0.9086100161075592, + "num_tokens": 190184759.0, + "step": 9333 + }, + { + "epoch": 0.9325141115939857, + "grad_norm": 0.4246974805283688, + "learning_rate": 1.194897545043261e-07, + "loss": 0.4941, + "mean_token_accuracy": 0.907311350107193, + "num_tokens": 190266154.0, + "step": 9334 + }, + { + "epoch": 0.93261401668415, + "grad_norm": 0.5118612266245097, + "learning_rate": 1.1913842496354633e-07, + "loss": 0.4865, + "mean_token_accuracy": 0.9063694179058075, + "num_tokens": 190347728.0, + "step": 9335 + }, + { + "epoch": 0.9327139217743144, + "grad_norm": 0.48587057443263587, + "learning_rate": 1.1878760645374854e-07, + "loss": 0.4874, + "mean_token_accuracy": 0.905231386423111, + "num_tokens": 190429253.0, + "step": 9336 + }, + { + "epoch": 0.9328138268644788, + "grad_norm": 0.7142541369933061, + "learning_rate": 1.1843729901166223e-07, + "loss": 0.4881, + "mean_token_accuracy": 0.9084234535694122, + "num_tokens": 190510731.0, + "step": 9337 + }, + { + "epoch": 0.9329137319546431, + "grad_norm": 0.47874798979888183, + "learning_rate": 1.1808750267396696e-07, + "loss": 0.488, + "mean_token_accuracy": 0.9081290364265442, + "num_tokens": 190592282.0, + "step": 9338 + }, + { + "epoch": 0.9330136370448074, + "grad_norm": 0.38902797590935356, + "learning_rate": 1.1773821747728509e-07, + "loss": 0.4841, + "mean_token_accuracy": 0.9060905277729034, + "num_tokens": 190673835.0, + "step": 9339 + }, + { + "epoch": 0.9331135421349718, + "grad_norm": 0.463671437019814, + "learning_rate": 1.1738944345818904e-07, + "loss": 0.4881, + "mean_token_accuracy": 0.9090762436389923, + "num_tokens": 190755338.0, + "step": 9340 + }, + { + "epoch": 0.9332134472251361, + "grad_norm": 0.4464703843144342, + "learning_rate": 1.1704118065319404e-07, + "loss": 0.4888, + "mean_token_accuracy": 0.9102691113948822, + "num_tokens": 190836828.0, + "step": 9341 + }, + { + "epoch": 0.9333133523153004, + "grad_norm": 0.993231639978474, + "learning_rate": 1.1669342909876425e-07, + "loss": 0.4904, + "mean_token_accuracy": 0.9104272723197937, + "num_tokens": 190918305.0, + "step": 9342 + }, + { + "epoch": 0.9334132574054648, + "grad_norm": 0.41525307750176776, + "learning_rate": 1.1634618883130943e-07, + "loss": 0.4876, + "mean_token_accuracy": 0.9076140820980072, + "num_tokens": 190999799.0, + "step": 9343 + }, + { + "epoch": 0.9335131624956291, + "grad_norm": 0.48865399805772325, + "learning_rate": 1.1599945988718608e-07, + "loss": 0.486, + "mean_token_accuracy": 0.9078728556632996, + "num_tokens": 191081310.0, + "step": 9344 + }, + { + "epoch": 0.9336130675857935, + "grad_norm": 0.48792182803462614, + "learning_rate": 1.1565324230269681e-07, + "loss": 0.4897, + "mean_token_accuracy": 0.9073997437953949, + "num_tokens": 191162855.0, + "step": 9345 + }, + { + "epoch": 0.9337129726759579, + "grad_norm": 0.5103852292545327, + "learning_rate": 1.1530753611409151e-07, + "loss": 0.481, + "mean_token_accuracy": 0.9116959571838379, + "num_tokens": 191244429.0, + "step": 9346 + }, + { + "epoch": 0.9338128777661222, + "grad_norm": 0.5170027919680381, + "learning_rate": 1.1496234135756568e-07, + "loss": 0.4895, + "mean_token_accuracy": 0.9103578925132751, + "num_tokens": 191325931.0, + "step": 9347 + }, + { + "epoch": 0.9339127828562865, + "grad_norm": 0.4841117917762865, + "learning_rate": 1.1461765806926206e-07, + "loss": 0.4849, + "mean_token_accuracy": 0.91011843085289, + "num_tokens": 191407508.0, + "step": 9348 + }, + { + "epoch": 0.9340126879464509, + "grad_norm": 0.4337579465682964, + "learning_rate": 1.1427348628526735e-07, + "loss": 0.4843, + "mean_token_accuracy": 0.9090793132781982, + "num_tokens": 191489074.0, + "step": 9349 + }, + { + "epoch": 0.9341125930366152, + "grad_norm": 0.4316439271690671, + "learning_rate": 1.1392982604161939e-07, + "loss": 0.4885, + "mean_token_accuracy": 0.9098004698753357, + "num_tokens": 191570563.0, + "step": 9350 + }, + { + "epoch": 0.9342124981267795, + "grad_norm": 0.7772735021409637, + "learning_rate": 1.1358667737429718e-07, + "loss": 0.4904, + "mean_token_accuracy": 0.9076440632343292, + "num_tokens": 191652070.0, + "step": 9351 + }, + { + "epoch": 0.9343124032169439, + "grad_norm": 0.4330649102144167, + "learning_rate": 1.1324404031923142e-07, + "loss": 0.4885, + "mean_token_accuracy": 0.906688928604126, + "num_tokens": 191733574.0, + "step": 9352 + }, + { + "epoch": 0.9344123083071082, + "grad_norm": 0.4678151609598545, + "learning_rate": 1.1290191491229397e-07, + "loss": 0.4922, + "mean_token_accuracy": 0.9077704548835754, + "num_tokens": 191815041.0, + "step": 9353 + }, + { + "epoch": 0.9345122133972726, + "grad_norm": 0.5722651336388346, + "learning_rate": 1.1256030118930727e-07, + "loss": 0.493, + "mean_token_accuracy": 0.9074739515781403, + "num_tokens": 191896525.0, + "step": 9354 + }, + { + "epoch": 0.934612118487437, + "grad_norm": 0.49169812540688634, + "learning_rate": 1.122191991860383e-07, + "loss": 0.4852, + "mean_token_accuracy": 0.9069390594959259, + "num_tokens": 191978128.0, + "step": 9355 + }, + { + "epoch": 0.9347120235776013, + "grad_norm": 0.4472793769302275, + "learning_rate": 1.1187860893820012e-07, + "loss": 0.4808, + "mean_token_accuracy": 0.9096685945987701, + "num_tokens": 192059754.0, + "step": 9356 + }, + { + "epoch": 0.9348119286677656, + "grad_norm": 0.4710765520178498, + "learning_rate": 1.1153853048145369e-07, + "loss": 0.4858, + "mean_token_accuracy": 0.9098422825336456, + "num_tokens": 192141287.0, + "step": 9357 + }, + { + "epoch": 0.93491183375793, + "grad_norm": 0.3924131721506214, + "learning_rate": 1.1119896385140549e-07, + "loss": 0.487, + "mean_token_accuracy": 0.9091237187385559, + "num_tokens": 192222769.0, + "step": 9358 + }, + { + "epoch": 0.9350117388480943, + "grad_norm": 0.4538283195501295, + "learning_rate": 1.1085990908360766e-07, + "loss": 0.4857, + "mean_token_accuracy": 0.9077732264995575, + "num_tokens": 192304327.0, + "step": 9359 + }, + { + "epoch": 0.9351116439382586, + "grad_norm": 0.43073992075515766, + "learning_rate": 1.1052136621356069e-07, + "loss": 0.4881, + "mean_token_accuracy": 0.906847357749939, + "num_tokens": 192385838.0, + "step": 9360 + }, + { + "epoch": 0.935211549028423, + "grad_norm": 0.38663876463681107, + "learning_rate": 1.1018333527670954e-07, + "loss": 0.4942, + "mean_token_accuracy": 0.9075597524642944, + "num_tokens": 192467244.0, + "step": 9361 + }, + { + "epoch": 0.9353114541185873, + "grad_norm": 0.41548589929352114, + "learning_rate": 1.0984581630844705e-07, + "loss": 0.4874, + "mean_token_accuracy": 0.9095426499843597, + "num_tokens": 192548751.0, + "step": 9362 + }, + { + "epoch": 0.9354113592087517, + "grad_norm": 0.4159971361479775, + "learning_rate": 1.0950880934411101e-07, + "loss": 0.4882, + "mean_token_accuracy": 0.9068896472454071, + "num_tokens": 192630247.0, + "step": 9363 + }, + { + "epoch": 0.9355112642989161, + "grad_norm": 0.4917880667090252, + "learning_rate": 1.0917231441898658e-07, + "loss": 0.4826, + "mean_token_accuracy": 0.9079594314098358, + "num_tokens": 192711870.0, + "step": 9364 + }, + { + "epoch": 0.9356111693890804, + "grad_norm": 0.36488051564568635, + "learning_rate": 1.0883633156830553e-07, + "loss": 0.4836, + "mean_token_accuracy": 0.9107024073600769, + "num_tokens": 192793421.0, + "step": 9365 + }, + { + "epoch": 0.9357110744792447, + "grad_norm": 0.5067267554728375, + "learning_rate": 1.0850086082724531e-07, + "loss": 0.4889, + "mean_token_accuracy": 0.9099061191082001, + "num_tokens": 192874920.0, + "step": 9366 + }, + { + "epoch": 0.9358109795694091, + "grad_norm": 0.7059895288879086, + "learning_rate": 1.0816590223092948e-07, + "loss": 0.4817, + "mean_token_accuracy": 0.9080784916877747, + "num_tokens": 192956523.0, + "step": 9367 + }, + { + "epoch": 0.9359108846595734, + "grad_norm": 0.5088358044638542, + "learning_rate": 1.0783145581442999e-07, + "loss": 0.4878, + "mean_token_accuracy": 0.9100134968757629, + "num_tokens": 193038070.0, + "step": 9368 + }, + { + "epoch": 0.9360107897497377, + "grad_norm": 0.5242344351177765, + "learning_rate": 1.0749752161276217e-07, + "loss": 0.4863, + "mean_token_accuracy": 0.9082521796226501, + "num_tokens": 193119620.0, + "step": 9369 + }, + { + "epoch": 0.9361106948399021, + "grad_norm": 0.5122686326975121, + "learning_rate": 1.0716409966089081e-07, + "loss": 0.4868, + "mean_token_accuracy": 0.9070451855659485, + "num_tokens": 193201142.0, + "step": 9370 + }, + { + "epoch": 0.9362105999300664, + "grad_norm": 0.8627943604396927, + "learning_rate": 1.0683118999372355e-07, + "loss": 0.4869, + "mean_token_accuracy": 0.9096684157848358, + "num_tokens": 193282672.0, + "step": 9371 + }, + { + "epoch": 0.9363105050202308, + "grad_norm": 0.51660978320781, + "learning_rate": 1.0649879264611862e-07, + "loss": 0.4822, + "mean_token_accuracy": 0.9086962640285492, + "num_tokens": 193364293.0, + "step": 9372 + }, + { + "epoch": 0.9364104101103952, + "grad_norm": 0.46319823984411074, + "learning_rate": 1.0616690765287596e-07, + "loss": 0.4863, + "mean_token_accuracy": 0.9081477224826813, + "num_tokens": 193445845.0, + "step": 9373 + }, + { + "epoch": 0.9365103152005595, + "grad_norm": 0.43954974710460354, + "learning_rate": 1.0583553504874611e-07, + "loss": 0.4923, + "mean_token_accuracy": 0.908193051815033, + "num_tokens": 193527292.0, + "step": 9374 + }, + { + "epoch": 0.9366102202907238, + "grad_norm": 0.40999780102795746, + "learning_rate": 1.0550467486842353e-07, + "loss": 0.4883, + "mean_token_accuracy": 0.906932532787323, + "num_tokens": 193608785.0, + "step": 9375 + }, + { + "epoch": 0.9367101253808882, + "grad_norm": 0.4528850952519613, + "learning_rate": 1.0517432714654884e-07, + "loss": 0.4891, + "mean_token_accuracy": 0.9120399057865143, + "num_tokens": 193690284.0, + "step": 9376 + }, + { + "epoch": 0.9368100304710525, + "grad_norm": 0.481019472330253, + "learning_rate": 1.0484449191771106e-07, + "loss": 0.4905, + "mean_token_accuracy": 0.9052794277667999, + "num_tokens": 193771746.0, + "step": 9377 + }, + { + "epoch": 0.9369099355612168, + "grad_norm": 1.1071653560342423, + "learning_rate": 1.0451516921644366e-07, + "loss": 0.4795, + "mean_token_accuracy": 0.9110446274280548, + "num_tokens": 193853417.0, + "step": 9378 + }, + { + "epoch": 0.9370098406513812, + "grad_norm": 0.43405701002757874, + "learning_rate": 1.0418635907722685e-07, + "loss": 0.4884, + "mean_token_accuracy": 0.906599760055542, + "num_tokens": 193934903.0, + "step": 9379 + }, + { + "epoch": 0.9371097457415455, + "grad_norm": 0.7386093834949908, + "learning_rate": 1.0385806153448752e-07, + "loss": 0.4892, + "mean_token_accuracy": 0.9088829457759857, + "num_tokens": 194016407.0, + "step": 9380 + }, + { + "epoch": 0.9372096508317099, + "grad_norm": 0.4612671226192617, + "learning_rate": 1.0353027662259874e-07, + "loss": 0.4947, + "mean_token_accuracy": 0.9078719019889832, + "num_tokens": 194097816.0, + "step": 9381 + }, + { + "epoch": 0.9373095559218743, + "grad_norm": 0.4274711534009883, + "learning_rate": 1.0320300437588083e-07, + "loss": 0.4852, + "mean_token_accuracy": 0.9088755249977112, + "num_tokens": 194179387.0, + "step": 9382 + }, + { + "epoch": 0.9374094610120386, + "grad_norm": 0.5262576996055887, + "learning_rate": 1.0287624482859693e-07, + "loss": 0.4899, + "mean_token_accuracy": 0.9074623584747314, + "num_tokens": 194260839.0, + "step": 9383 + }, + { + "epoch": 0.9375093661022029, + "grad_norm": 0.4145292548752531, + "learning_rate": 1.0254999801496246e-07, + "loss": 0.4889, + "mean_token_accuracy": 0.9068121910095215, + "num_tokens": 194342333.0, + "step": 9384 + }, + { + "epoch": 0.9376092711923673, + "grad_norm": 0.6736155763376003, + "learning_rate": 1.0222426396913343e-07, + "loss": 0.4854, + "mean_token_accuracy": 0.9085520803928375, + "num_tokens": 194423875.0, + "step": 9385 + }, + { + "epoch": 0.9377091762825316, + "grad_norm": 0.4975085239284444, + "learning_rate": 1.0189904272521534e-07, + "loss": 0.4808, + "mean_token_accuracy": 0.9100915789604187, + "num_tokens": 194505485.0, + "step": 9386 + }, + { + "epoch": 0.9378090813726959, + "grad_norm": 0.5095060518015796, + "learning_rate": 1.0157433431725872e-07, + "loss": 0.4859, + "mean_token_accuracy": 0.9099049866199493, + "num_tokens": 194587043.0, + "step": 9387 + }, + { + "epoch": 0.9379089864628603, + "grad_norm": 0.4969009372906642, + "learning_rate": 1.0125013877926137e-07, + "loss": 0.4876, + "mean_token_accuracy": 0.906960129737854, + "num_tokens": 194668571.0, + "step": 9388 + }, + { + "epoch": 0.9380088915530246, + "grad_norm": 0.4608897641213864, + "learning_rate": 1.0092645614516672e-07, + "loss": 0.4909, + "mean_token_accuracy": 0.9053474068641663, + "num_tokens": 194750019.0, + "step": 9389 + }, + { + "epoch": 0.938108796643189, + "grad_norm": 0.5008243663854477, + "learning_rate": 1.006032864488643e-07, + "loss": 0.4879, + "mean_token_accuracy": 0.908392071723938, + "num_tokens": 194831519.0, + "step": 9390 + }, + { + "epoch": 0.9382087017333534, + "grad_norm": 0.48440641035428667, + "learning_rate": 1.0028062972419095e-07, + "loss": 0.488, + "mean_token_accuracy": 0.9078194797039032, + "num_tokens": 194913065.0, + "step": 9391 + }, + { + "epoch": 0.9383086068235177, + "grad_norm": 0.464632710496575, + "learning_rate": 9.995848600492907e-08, + "loss": 0.4868, + "mean_token_accuracy": 0.9085684716701508, + "num_tokens": 194994587.0, + "step": 9392 + }, + { + "epoch": 0.938408511913682, + "grad_norm": 0.4324352758069574, + "learning_rate": 9.963685532480672e-08, + "loss": 0.4837, + "mean_token_accuracy": 0.9099394679069519, + "num_tokens": 195076138.0, + "step": 9393 + }, + { + "epoch": 0.9385084170038464, + "grad_norm": 0.5205275862746498, + "learning_rate": 9.931573771749969e-08, + "loss": 0.486, + "mean_token_accuracy": 0.9067725837230682, + "num_tokens": 195157713.0, + "step": 9394 + }, + { + "epoch": 0.9386083220940107, + "grad_norm": 0.3875426226693426, + "learning_rate": 9.899513321662835e-08, + "loss": 0.4845, + "mean_token_accuracy": 0.909242570400238, + "num_tokens": 195239265.0, + "step": 9395 + }, + { + "epoch": 0.938708227184175, + "grad_norm": 0.4233626704838917, + "learning_rate": 9.867504185576193e-08, + "loss": 0.4845, + "mean_token_accuracy": 0.9089950323104858, + "num_tokens": 195320837.0, + "step": 9396 + }, + { + "epoch": 0.9388081322743393, + "grad_norm": 0.42249106465777087, + "learning_rate": 9.835546366841308e-08, + "loss": 0.4854, + "mean_token_accuracy": 0.906926155090332, + "num_tokens": 195402354.0, + "step": 9397 + }, + { + "epoch": 0.9389080373645037, + "grad_norm": 0.4985216067738798, + "learning_rate": 9.80363986880417e-08, + "loss": 0.481, + "mean_token_accuracy": 0.9089611172676086, + "num_tokens": 195483996.0, + "step": 9398 + }, + { + "epoch": 0.9390079424546681, + "grad_norm": 0.4416781906549375, + "learning_rate": 9.771784694805498e-08, + "loss": 0.4842, + "mean_token_accuracy": 0.9086793661117554, + "num_tokens": 195565540.0, + "step": 9399 + }, + { + "epoch": 0.9391078475448325, + "grad_norm": 1.030421330282771, + "learning_rate": 9.739980848180563e-08, + "loss": 0.4835, + "mean_token_accuracy": 0.9093633592128754, + "num_tokens": 195647109.0, + "step": 9400 + }, + { + "epoch": 0.9392077526349968, + "grad_norm": 0.5128467193586499, + "learning_rate": 9.708228332259151e-08, + "loss": 0.4923, + "mean_token_accuracy": 0.9067511260509491, + "num_tokens": 195728541.0, + "step": 9401 + }, + { + "epoch": 0.9393076577251611, + "grad_norm": 0.4563145478698038, + "learning_rate": 9.676527150365933e-08, + "loss": 0.4906, + "mean_token_accuracy": 0.90943643450737, + "num_tokens": 195809996.0, + "step": 9402 + }, + { + "epoch": 0.9394075628153254, + "grad_norm": 0.4330875463144893, + "learning_rate": 9.644877305819977e-08, + "loss": 0.4846, + "mean_token_accuracy": 0.9126030206680298, + "num_tokens": 195891533.0, + "step": 9403 + }, + { + "epoch": 0.9395074679054898, + "grad_norm": 0.5631594719950679, + "learning_rate": 9.613278801935078e-08, + "loss": 0.491, + "mean_token_accuracy": 0.9079879820346832, + "num_tokens": 195972999.0, + "step": 9404 + }, + { + "epoch": 0.9396073729956541, + "grad_norm": 0.3856013325482216, + "learning_rate": 9.581731642019532e-08, + "loss": 0.4828, + "mean_token_accuracy": 0.9104238748550415, + "num_tokens": 196054561.0, + "step": 9405 + }, + { + "epoch": 0.9397072780858184, + "grad_norm": 0.5076289000397198, + "learning_rate": 9.550235829376476e-08, + "loss": 0.4878, + "mean_token_accuracy": 0.909959077835083, + "num_tokens": 196136079.0, + "step": 9406 + }, + { + "epoch": 0.9398071831759828, + "grad_norm": 0.49601397035697775, + "learning_rate": 9.518791367303493e-08, + "loss": 0.4947, + "mean_token_accuracy": 0.9059317409992218, + "num_tokens": 196217543.0, + "step": 9407 + }, + { + "epoch": 0.9399070882661471, + "grad_norm": 0.41648312084315664, + "learning_rate": 9.487398259092894e-08, + "loss": 0.4853, + "mean_token_accuracy": 0.9066489934921265, + "num_tokens": 196299088.0, + "step": 9408 + }, + { + "epoch": 0.9400069933563115, + "grad_norm": 0.4549417283882151, + "learning_rate": 9.456056508031497e-08, + "loss": 0.4856, + "mean_token_accuracy": 0.9066731333732605, + "num_tokens": 196380621.0, + "step": 9409 + }, + { + "epoch": 0.9401068984464759, + "grad_norm": 0.504450919923777, + "learning_rate": 9.424766117400897e-08, + "loss": 0.4863, + "mean_token_accuracy": 0.9104371070861816, + "num_tokens": 196462139.0, + "step": 9410 + }, + { + "epoch": 0.9402068035366402, + "grad_norm": 0.5023808857397167, + "learning_rate": 9.393527090477194e-08, + "loss": 0.4827, + "mean_token_accuracy": 0.909758448600769, + "num_tokens": 196543769.0, + "step": 9411 + }, + { + "epoch": 0.9403067086268045, + "grad_norm": 0.4000349501385197, + "learning_rate": 9.362339430531108e-08, + "loss": 0.4836, + "mean_token_accuracy": 0.9078176617622375, + "num_tokens": 196625314.0, + "step": 9412 + }, + { + "epoch": 0.9404066137169689, + "grad_norm": 0.5139931500659727, + "learning_rate": 9.331203140828083e-08, + "loss": 0.4931, + "mean_token_accuracy": 0.9117718935012817, + "num_tokens": 196706790.0, + "step": 9413 + }, + { + "epoch": 0.9405065188071332, + "grad_norm": 0.5118576149848993, + "learning_rate": 9.300118224628062e-08, + "loss": 0.4894, + "mean_token_accuracy": 0.9083629548549652, + "num_tokens": 196788262.0, + "step": 9414 + }, + { + "epoch": 0.9406064238972975, + "grad_norm": 0.5215888354694128, + "learning_rate": 9.26908468518567e-08, + "loss": 0.4874, + "mean_token_accuracy": 0.9078333079814911, + "num_tokens": 196869820.0, + "step": 9415 + }, + { + "epoch": 0.9407063289874619, + "grad_norm": 0.46096885691034123, + "learning_rate": 9.238102525750247e-08, + "loss": 0.4801, + "mean_token_accuracy": 0.908872663974762, + "num_tokens": 196951434.0, + "step": 9416 + }, + { + "epoch": 0.9408062340776262, + "grad_norm": 0.4944256535812787, + "learning_rate": 9.20717174956548e-08, + "loss": 0.4904, + "mean_token_accuracy": 0.9086793959140778, + "num_tokens": 197032899.0, + "step": 9417 + }, + { + "epoch": 0.9409061391677906, + "grad_norm": 0.5064666812782163, + "learning_rate": 9.176292359869998e-08, + "loss": 0.4888, + "mean_token_accuracy": 0.9078266322612762, + "num_tokens": 197114444.0, + "step": 9418 + }, + { + "epoch": 0.941006044257955, + "grad_norm": 0.4139100690496146, + "learning_rate": 9.145464359896882e-08, + "loss": 0.4828, + "mean_token_accuracy": 0.9075442254543304, + "num_tokens": 197196043.0, + "step": 9419 + }, + { + "epoch": 0.9411059493481193, + "grad_norm": 0.40956823117798774, + "learning_rate": 9.114687752873774e-08, + "loss": 0.4851, + "mean_token_accuracy": 0.9096431136131287, + "num_tokens": 197277593.0, + "step": 9420 + }, + { + "epoch": 0.9412058544382836, + "grad_norm": 0.609408007294868, + "learning_rate": 9.083962542023095e-08, + "loss": 0.5002, + "mean_token_accuracy": 0.9069423079490662, + "num_tokens": 197358946.0, + "step": 9421 + }, + { + "epoch": 0.941305759528448, + "grad_norm": 0.44563724639339386, + "learning_rate": 9.053288730561716e-08, + "loss": 0.4843, + "mean_token_accuracy": 0.9079417884349823, + "num_tokens": 197440467.0, + "step": 9422 + }, + { + "epoch": 0.9414056646186123, + "grad_norm": 0.4502432072559637, + "learning_rate": 9.022666321701345e-08, + "loss": 0.481, + "mean_token_accuracy": 0.9108677208423615, + "num_tokens": 197522032.0, + "step": 9423 + }, + { + "epoch": 0.9415055697087766, + "grad_norm": 0.48033722050874006, + "learning_rate": 8.992095318648086e-08, + "loss": 0.4899, + "mean_token_accuracy": 0.9058666825294495, + "num_tokens": 197603501.0, + "step": 9424 + }, + { + "epoch": 0.941605474798941, + "grad_norm": 0.4900773229837796, + "learning_rate": 8.961575724602767e-08, + "loss": 0.4868, + "mean_token_accuracy": 0.9089857637882233, + "num_tokens": 197684994.0, + "step": 9425 + }, + { + "epoch": 0.9417053798891053, + "grad_norm": 0.4038067180166453, + "learning_rate": 8.931107542760831e-08, + "loss": 0.4872, + "mean_token_accuracy": 0.906688392162323, + "num_tokens": 197766542.0, + "step": 9426 + }, + { + "epoch": 0.9418052849792697, + "grad_norm": 0.3924905034172008, + "learning_rate": 8.900690776312282e-08, + "loss": 0.4855, + "mean_token_accuracy": 0.9072016179561615, + "num_tokens": 197848068.0, + "step": 9427 + }, + { + "epoch": 0.9419051900694341, + "grad_norm": 0.47449573937069184, + "learning_rate": 8.870325428441906e-08, + "loss": 0.4841, + "mean_token_accuracy": 0.909557044506073, + "num_tokens": 197929642.0, + "step": 9428 + }, + { + "epoch": 0.9420050951595984, + "grad_norm": 0.5157241966084327, + "learning_rate": 8.840011502328882e-08, + "loss": 0.4875, + "mean_token_accuracy": 0.9075360000133514, + "num_tokens": 198011165.0, + "step": 9429 + }, + { + "epoch": 0.9421050002497627, + "grad_norm": 0.5072917124294724, + "learning_rate": 8.809749001147117e-08, + "loss": 0.485, + "mean_token_accuracy": 0.9110197722911835, + "num_tokens": 198092736.0, + "step": 9430 + }, + { + "epoch": 0.9422049053399271, + "grad_norm": 0.43784319656658266, + "learning_rate": 8.779537928065129e-08, + "loss": 0.4822, + "mean_token_accuracy": 0.9092020690441132, + "num_tokens": 198174354.0, + "step": 9431 + }, + { + "epoch": 0.9423048104300914, + "grad_norm": 0.44470067033101446, + "learning_rate": 8.749378286246057e-08, + "loss": 0.4906, + "mean_token_accuracy": 0.9064221382141113, + "num_tokens": 198255847.0, + "step": 9432 + }, + { + "epoch": 0.9424047155202557, + "grad_norm": 0.5023015083498787, + "learning_rate": 8.719270078847653e-08, + "loss": 0.489, + "mean_token_accuracy": 0.9070541262626648, + "num_tokens": 198337362.0, + "step": 9433 + }, + { + "epoch": 0.9425046206104201, + "grad_norm": 0.4611053856934159, + "learning_rate": 8.689213309022226e-08, + "loss": 0.4917, + "mean_token_accuracy": 0.9061837792396545, + "num_tokens": 198418815.0, + "step": 9434 + }, + { + "epoch": 0.9426045257005844, + "grad_norm": 0.5783328161905658, + "learning_rate": 8.659207979916872e-08, + "loss": 0.4873, + "mean_token_accuracy": 0.9085182249546051, + "num_tokens": 198500347.0, + "step": 9435 + }, + { + "epoch": 0.9427044307907488, + "grad_norm": 0.39841686491386824, + "learning_rate": 8.629254094673078e-08, + "loss": 0.4829, + "mean_token_accuracy": 0.9081696569919586, + "num_tokens": 198581967.0, + "step": 9436 + }, + { + "epoch": 0.9428043358809132, + "grad_norm": 0.45591262321564363, + "learning_rate": 8.599351656426946e-08, + "loss": 0.4874, + "mean_token_accuracy": 0.908284068107605, + "num_tokens": 198663485.0, + "step": 9437 + }, + { + "epoch": 0.9429042409710775, + "grad_norm": 0.4219790009020158, + "learning_rate": 8.569500668309583e-08, + "loss": 0.4872, + "mean_token_accuracy": 0.9087246954441071, + "num_tokens": 198745032.0, + "step": 9438 + }, + { + "epoch": 0.9430041460612418, + "grad_norm": 0.5283141703342396, + "learning_rate": 8.539701133446099e-08, + "loss": 0.4906, + "mean_token_accuracy": 0.9079068899154663, + "num_tokens": 198826544.0, + "step": 9439 + }, + { + "epoch": 0.9431040511514062, + "grad_norm": 0.4181463305084277, + "learning_rate": 8.509953054956776e-08, + "loss": 0.4837, + "mean_token_accuracy": 0.9097058475017548, + "num_tokens": 198908121.0, + "step": 9440 + }, + { + "epoch": 0.9432039562415705, + "grad_norm": 0.9466228059559659, + "learning_rate": 8.480256435956124e-08, + "loss": 0.4892, + "mean_token_accuracy": 0.9078221321105957, + "num_tokens": 198989601.0, + "step": 9441 + }, + { + "epoch": 0.9433038613317348, + "grad_norm": 0.5809602971396418, + "learning_rate": 8.450611279553434e-08, + "loss": 0.4908, + "mean_token_accuracy": 0.905708372592926, + "num_tokens": 199071114.0, + "step": 9442 + }, + { + "epoch": 0.9434037664218992, + "grad_norm": 0.41470976808761856, + "learning_rate": 8.42101758885261e-08, + "loss": 0.4874, + "mean_token_accuracy": 0.9080464243888855, + "num_tokens": 199152638.0, + "step": 9443 + }, + { + "epoch": 0.9435036715120635, + "grad_norm": 0.4665248529530343, + "learning_rate": 8.391475366952118e-08, + "loss": 0.4911, + "mean_token_accuracy": 0.9088443219661713, + "num_tokens": 199234118.0, + "step": 9444 + }, + { + "epoch": 0.9436035766022279, + "grad_norm": 0.6386831547632197, + "learning_rate": 8.361984616945095e-08, + "loss": 0.492, + "mean_token_accuracy": 0.905034065246582, + "num_tokens": 199315575.0, + "step": 9445 + }, + { + "epoch": 0.9437034816923923, + "grad_norm": 0.47393682110471647, + "learning_rate": 8.33254534191924e-08, + "loss": 0.4891, + "mean_token_accuracy": 0.9061211347579956, + "num_tokens": 199397091.0, + "step": 9446 + }, + { + "epoch": 0.9438033867825566, + "grad_norm": 0.41930302973939854, + "learning_rate": 8.303157544956808e-08, + "loss": 0.4945, + "mean_token_accuracy": 0.9075711667537689, + "num_tokens": 199478521.0, + "step": 9447 + }, + { + "epoch": 0.9439032918727209, + "grad_norm": 0.5404540130617516, + "learning_rate": 8.273821229134893e-08, + "loss": 0.4837, + "mean_token_accuracy": 0.9101338386535645, + "num_tokens": 199560115.0, + "step": 9448 + }, + { + "epoch": 0.9440031969628853, + "grad_norm": 0.5264696572218108, + "learning_rate": 8.244536397524761e-08, + "loss": 0.4909, + "mean_token_accuracy": 0.9073116481304169, + "num_tokens": 199641556.0, + "step": 9449 + }, + { + "epoch": 0.9441031020530496, + "grad_norm": 0.5519320983006818, + "learning_rate": 8.215303053192847e-08, + "loss": 0.4839, + "mean_token_accuracy": 0.9095746874809265, + "num_tokens": 199723113.0, + "step": 9450 + }, + { + "epoch": 0.9442030071432139, + "grad_norm": 0.44970722534440966, + "learning_rate": 8.186121199199759e-08, + "loss": 0.4893, + "mean_token_accuracy": 0.9093037545681, + "num_tokens": 199804588.0, + "step": 9451 + }, + { + "epoch": 0.9443029122333783, + "grad_norm": 0.8442240440505686, + "learning_rate": 8.156990838600942e-08, + "loss": 0.4907, + "mean_token_accuracy": 0.9049462676048279, + "num_tokens": 199886097.0, + "step": 9452 + }, + { + "epoch": 0.9444028173235426, + "grad_norm": 0.644362976132025, + "learning_rate": 8.127911974446234e-08, + "loss": 0.4842, + "mean_token_accuracy": 0.9091416001319885, + "num_tokens": 199967723.0, + "step": 9453 + }, + { + "epoch": 0.944502722413707, + "grad_norm": 0.4882664332955522, + "learning_rate": 8.098884609780366e-08, + "loss": 0.492, + "mean_token_accuracy": 0.9092385172843933, + "num_tokens": 200049214.0, + "step": 9454 + }, + { + "epoch": 0.9446026275038714, + "grad_norm": 0.6147417822007217, + "learning_rate": 8.069908747642518e-08, + "loss": 0.492, + "mean_token_accuracy": 0.9072476625442505, + "num_tokens": 200130679.0, + "step": 9455 + }, + { + "epoch": 0.9447025325940357, + "grad_norm": 0.4288044199201241, + "learning_rate": 8.040984391066376e-08, + "loss": 0.4846, + "mean_token_accuracy": 0.9084631502628326, + "num_tokens": 200212252.0, + "step": 9456 + }, + { + "epoch": 0.9448024376842, + "grad_norm": 0.5125939862441037, + "learning_rate": 8.012111543080458e-08, + "loss": 0.4904, + "mean_token_accuracy": 0.9100548028945923, + "num_tokens": 200293746.0, + "step": 9457 + }, + { + "epoch": 0.9449023427743644, + "grad_norm": 0.494291190712142, + "learning_rate": 7.983290206707794e-08, + "loss": 0.4834, + "mean_token_accuracy": 0.9099139869213104, + "num_tokens": 200375363.0, + "step": 9458 + }, + { + "epoch": 0.9450022478645287, + "grad_norm": 0.4263319445372662, + "learning_rate": 7.954520384965913e-08, + "loss": 0.488, + "mean_token_accuracy": 0.9082058966159821, + "num_tokens": 200456855.0, + "step": 9459 + }, + { + "epoch": 0.945102152954693, + "grad_norm": 0.48765240621360223, + "learning_rate": 7.925802080867129e-08, + "loss": 0.494, + "mean_token_accuracy": 0.9057016968727112, + "num_tokens": 200538355.0, + "step": 9460 + }, + { + "epoch": 0.9452020580448574, + "grad_norm": 0.5159763005445028, + "learning_rate": 7.897135297418145e-08, + "loss": 0.4894, + "mean_token_accuracy": 0.9070906043052673, + "num_tokens": 200619831.0, + "step": 9461 + }, + { + "epoch": 0.9453019631350217, + "grad_norm": 0.48814496031195775, + "learning_rate": 7.868520037620619e-08, + "loss": 0.4906, + "mean_token_accuracy": 0.9062583148479462, + "num_tokens": 200701311.0, + "step": 9462 + }, + { + "epoch": 0.9454018682251861, + "grad_norm": 0.4388715567238983, + "learning_rate": 7.839956304470431e-08, + "loss": 0.4905, + "mean_token_accuracy": 0.9078208804130554, + "num_tokens": 200782780.0, + "step": 9463 + }, + { + "epoch": 0.9455017733153505, + "grad_norm": 0.5626829750950866, + "learning_rate": 7.811444100958244e-08, + "loss": 0.49, + "mean_token_accuracy": 0.9098625183105469, + "num_tokens": 200864283.0, + "step": 9464 + }, + { + "epoch": 0.9456016784055148, + "grad_norm": 0.43009492821560225, + "learning_rate": 7.782983430069336e-08, + "loss": 0.48, + "mean_token_accuracy": 0.910521000623703, + "num_tokens": 200945928.0, + "step": 9465 + }, + { + "epoch": 0.9457015834956791, + "grad_norm": 0.5016010127205845, + "learning_rate": 7.754574294783601e-08, + "loss": 0.4954, + "mean_token_accuracy": 0.9082710146903992, + "num_tokens": 201027329.0, + "step": 9466 + }, + { + "epoch": 0.9458014885858435, + "grad_norm": 0.4174258136251985, + "learning_rate": 7.72621669807544e-08, + "loss": 0.4852, + "mean_token_accuracy": 0.9077035486698151, + "num_tokens": 201108863.0, + "step": 9467 + }, + { + "epoch": 0.9459013936760078, + "grad_norm": 0.44186573646074045, + "learning_rate": 7.697910642913974e-08, + "loss": 0.4875, + "mean_token_accuracy": 0.9086694419384003, + "num_tokens": 201190352.0, + "step": 9468 + }, + { + "epoch": 0.9460012987661721, + "grad_norm": 0.5113294373330521, + "learning_rate": 7.669656132262893e-08, + "loss": 0.4894, + "mean_token_accuracy": 0.9067578017711639, + "num_tokens": 201271850.0, + "step": 9469 + }, + { + "epoch": 0.9461012038563364, + "grad_norm": 0.8952909190815975, + "learning_rate": 7.641453169080437e-08, + "loss": 0.4866, + "mean_token_accuracy": 0.9073822796344757, + "num_tokens": 201353359.0, + "step": 9470 + }, + { + "epoch": 0.9462011089465008, + "grad_norm": 0.5518231878336533, + "learning_rate": 7.613301756319413e-08, + "loss": 0.4849, + "mean_token_accuracy": 0.9087570011615753, + "num_tokens": 201434880.0, + "step": 9471 + }, + { + "epoch": 0.9463010140366652, + "grad_norm": 0.5178794184113568, + "learning_rate": 7.585201896927408e-08, + "loss": 0.4924, + "mean_token_accuracy": 0.9063474237918854, + "num_tokens": 201516335.0, + "step": 9472 + }, + { + "epoch": 0.9464009191268296, + "grad_norm": 0.5192957536742087, + "learning_rate": 7.557153593846456e-08, + "loss": 0.4903, + "mean_token_accuracy": 0.9074569046497345, + "num_tokens": 201597818.0, + "step": 9473 + }, + { + "epoch": 0.9465008242169939, + "grad_norm": 0.4820723714314002, + "learning_rate": 7.529156850013319e-08, + "loss": 0.4884, + "mean_token_accuracy": 0.9081041216850281, + "num_tokens": 201679326.0, + "step": 9474 + }, + { + "epoch": 0.9466007293071582, + "grad_norm": 0.4766249350600314, + "learning_rate": 7.501211668359154e-08, + "loss": 0.4863, + "mean_token_accuracy": 0.9079048931598663, + "num_tokens": 201760813.0, + "step": 9475 + }, + { + "epoch": 0.9467006343973225, + "grad_norm": 0.6263373192197905, + "learning_rate": 7.473318051809952e-08, + "loss": 0.4885, + "mean_token_accuracy": 0.9069832265377045, + "num_tokens": 201842322.0, + "step": 9476 + }, + { + "epoch": 0.9468005394874869, + "grad_norm": 0.4520655207462025, + "learning_rate": 7.4454760032861e-08, + "loss": 0.4895, + "mean_token_accuracy": 0.9093016684055328, + "num_tokens": 201923829.0, + "step": 9477 + }, + { + "epoch": 0.9469004445776512, + "grad_norm": 0.5210057610754184, + "learning_rate": 7.417685525702822e-08, + "loss": 0.4858, + "mean_token_accuracy": 0.9062894284725189, + "num_tokens": 202005367.0, + "step": 9478 + }, + { + "epoch": 0.9470003496678155, + "grad_norm": 0.43580827508392034, + "learning_rate": 7.389946621969679e-08, + "loss": 0.4893, + "mean_token_accuracy": 0.9079418778419495, + "num_tokens": 202086866.0, + "step": 9479 + }, + { + "epoch": 0.9471002547579799, + "grad_norm": 0.5860668043737444, + "learning_rate": 7.36225929499107e-08, + "loss": 0.492, + "mean_token_accuracy": 0.9049552977085114, + "num_tokens": 202168346.0, + "step": 9480 + }, + { + "epoch": 0.9472001598481442, + "grad_norm": 0.38345004912630776, + "learning_rate": 7.334623547665787e-08, + "loss": 0.4824, + "mean_token_accuracy": 0.9091385304927826, + "num_tokens": 202249904.0, + "step": 9481 + }, + { + "epoch": 0.9473000649383087, + "grad_norm": 0.48070232337356966, + "learning_rate": 7.30703938288746e-08, + "loss": 0.4907, + "mean_token_accuracy": 0.9079331159591675, + "num_tokens": 202331415.0, + "step": 9482 + }, + { + "epoch": 0.947399970028473, + "grad_norm": 0.4955268511923167, + "learning_rate": 7.279506803543945e-08, + "loss": 0.4893, + "mean_token_accuracy": 0.9072016179561615, + "num_tokens": 202412961.0, + "step": 9483 + }, + { + "epoch": 0.9474998751186373, + "grad_norm": 0.6051457275616343, + "learning_rate": 7.252025812518215e-08, + "loss": 0.493, + "mean_token_accuracy": 0.9091998934745789, + "num_tokens": 202494426.0, + "step": 9484 + }, + { + "epoch": 0.9475997802088016, + "grad_norm": 0.5589753599570305, + "learning_rate": 7.224596412687301e-08, + "loss": 0.4891, + "mean_token_accuracy": 0.9085868000984192, + "num_tokens": 202575941.0, + "step": 9485 + }, + { + "epoch": 0.947699685298966, + "grad_norm": 0.479052346631337, + "learning_rate": 7.19721860692324e-08, + "loss": 0.488, + "mean_token_accuracy": 0.90728759765625, + "num_tokens": 202657477.0, + "step": 9486 + }, + { + "epoch": 0.9477995903891303, + "grad_norm": 0.4041274148090531, + "learning_rate": 7.169892398092515e-08, + "loss": 0.4874, + "mean_token_accuracy": 0.9086707830429077, + "num_tokens": 202738986.0, + "step": 9487 + }, + { + "epoch": 0.9478994954792946, + "grad_norm": 0.6103291977916703, + "learning_rate": 7.142617789056116e-08, + "loss": 0.4891, + "mean_token_accuracy": 0.9073558449745178, + "num_tokens": 202820493.0, + "step": 9488 + }, + { + "epoch": 0.947999400569459, + "grad_norm": 0.46861396338256983, + "learning_rate": 7.11539478266976e-08, + "loss": 0.4936, + "mean_token_accuracy": 0.9073939621448517, + "num_tokens": 202901910.0, + "step": 9489 + }, + { + "epoch": 0.9480993056596233, + "grad_norm": 0.40485456924558494, + "learning_rate": 7.088223381783777e-08, + "loss": 0.4792, + "mean_token_accuracy": 0.9106132984161377, + "num_tokens": 202983552.0, + "step": 9490 + }, + { + "epoch": 0.9481992107497877, + "grad_norm": 0.4421392635474571, + "learning_rate": 7.061103589242946e-08, + "loss": 0.4854, + "mean_token_accuracy": 0.9080496430397034, + "num_tokens": 203065119.0, + "step": 9491 + }, + { + "epoch": 0.9482991158399521, + "grad_norm": 0.9867908568445788, + "learning_rate": 7.03403540788683e-08, + "loss": 0.4881, + "mean_token_accuracy": 0.9069166481494904, + "num_tokens": 203146643.0, + "step": 9492 + }, + { + "epoch": 0.9483990209301164, + "grad_norm": 0.4003324986975823, + "learning_rate": 7.00701884054944e-08, + "loss": 0.4903, + "mean_token_accuracy": 0.9082597196102142, + "num_tokens": 203228115.0, + "step": 9493 + }, + { + "epoch": 0.9484989260202807, + "grad_norm": 0.4776842557853962, + "learning_rate": 6.980053890059457e-08, + "loss": 0.4903, + "mean_token_accuracy": 0.9088703691959381, + "num_tokens": 203309573.0, + "step": 9494 + }, + { + "epoch": 0.9485988311104451, + "grad_norm": 0.494690575612888, + "learning_rate": 6.953140559240068e-08, + "loss": 0.4858, + "mean_token_accuracy": 0.9086218774318695, + "num_tokens": 203391143.0, + "step": 9495 + }, + { + "epoch": 0.9486987362006094, + "grad_norm": 0.4184043974509083, + "learning_rate": 6.926278850909295e-08, + "loss": 0.4859, + "mean_token_accuracy": 0.9065468311309814, + "num_tokens": 203472689.0, + "step": 9496 + }, + { + "epoch": 0.9487986412907737, + "grad_norm": 0.4679185070694092, + "learning_rate": 6.899468767879392e-08, + "loss": 0.488, + "mean_token_accuracy": 0.9083121716976166, + "num_tokens": 203554222.0, + "step": 9497 + }, + { + "epoch": 0.9488985463809381, + "grad_norm": 0.36743426658901085, + "learning_rate": 6.872710312957498e-08, + "loss": 0.4795, + "mean_token_accuracy": 0.9093244075775146, + "num_tokens": 203635842.0, + "step": 9498 + }, + { + "epoch": 0.9489984514711024, + "grad_norm": 0.5824609283156608, + "learning_rate": 6.846003488945208e-08, + "loss": 0.4846, + "mean_token_accuracy": 0.908869206905365, + "num_tokens": 203717370.0, + "step": 9499 + }, + { + "epoch": 0.9490983565612668, + "grad_norm": 0.42246260489341925, + "learning_rate": 6.819348298638839e-08, + "loss": 0.4895, + "mean_token_accuracy": 0.9105896353721619, + "num_tokens": 203798856.0, + "step": 9500 + }, + { + "epoch": 0.9491982616514312, + "grad_norm": 0.5351715291905501, + "learning_rate": 6.792744744829105e-08, + "loss": 0.4901, + "mean_token_accuracy": 0.9075135290622711, + "num_tokens": 203880369.0, + "step": 9501 + }, + { + "epoch": 0.9492981667415955, + "grad_norm": 0.437689540398204, + "learning_rate": 6.766192830301499e-08, + "loss": 0.4839, + "mean_token_accuracy": 0.9077932238578796, + "num_tokens": 203961930.0, + "step": 9502 + }, + { + "epoch": 0.9493980718317598, + "grad_norm": 0.4303327823702236, + "learning_rate": 6.73969255783602e-08, + "loss": 0.4874, + "mean_token_accuracy": 0.9088894724845886, + "num_tokens": 204043425.0, + "step": 9503 + }, + { + "epoch": 0.9494979769219242, + "grad_norm": 0.5285372606283053, + "learning_rate": 6.713243930207281e-08, + "loss": 0.4916, + "mean_token_accuracy": 0.9099509119987488, + "num_tokens": 204124899.0, + "step": 9504 + }, + { + "epoch": 0.9495978820120885, + "grad_norm": 0.6092507240281655, + "learning_rate": 6.6868469501844e-08, + "loss": 0.4892, + "mean_token_accuracy": 0.9093602299690247, + "num_tokens": 204206415.0, + "step": 9505 + }, + { + "epoch": 0.9496977871022528, + "grad_norm": 0.481305455440035, + "learning_rate": 6.660501620531279e-08, + "loss": 0.4864, + "mean_token_accuracy": 0.9094176590442657, + "num_tokens": 204287954.0, + "step": 9506 + }, + { + "epoch": 0.9497976921924172, + "grad_norm": 0.4611888980973661, + "learning_rate": 6.63420794400621e-08, + "loss": 0.4843, + "mean_token_accuracy": 0.9103354513645172, + "num_tokens": 204369518.0, + "step": 9507 + }, + { + "epoch": 0.9498975972825815, + "grad_norm": 0.4501225226007664, + "learning_rate": 6.607965923362214e-08, + "loss": 0.4841, + "mean_token_accuracy": 0.9072293639183044, + "num_tokens": 204451073.0, + "step": 9508 + }, + { + "epoch": 0.9499975023727459, + "grad_norm": 0.4849859954333971, + "learning_rate": 6.581775561346815e-08, + "loss": 0.4822, + "mean_token_accuracy": 0.9079452753067017, + "num_tokens": 204532683.0, + "step": 9509 + }, + { + "epoch": 0.9500974074629103, + "grad_norm": 0.4432216368069803, + "learning_rate": 6.555636860702263e-08, + "loss": 0.4927, + "mean_token_accuracy": 0.9077017605304718, + "num_tokens": 204614166.0, + "step": 9510 + }, + { + "epoch": 0.9501973125530746, + "grad_norm": 0.45642222174352187, + "learning_rate": 6.529549824165204e-08, + "loss": 0.4859, + "mean_token_accuracy": 0.907995879650116, + "num_tokens": 204695707.0, + "step": 9511 + }, + { + "epoch": 0.9502972176432389, + "grad_norm": 0.5193433627493985, + "learning_rate": 6.503514454467008e-08, + "loss": 0.4912, + "mean_token_accuracy": 0.9052293002605438, + "num_tokens": 204777175.0, + "step": 9512 + }, + { + "epoch": 0.9503971227334033, + "grad_norm": 0.44243330935438707, + "learning_rate": 6.47753075433366e-08, + "loss": 0.4913, + "mean_token_accuracy": 0.9050807952880859, + "num_tokens": 204858675.0, + "step": 9513 + }, + { + "epoch": 0.9504970278235676, + "grad_norm": 0.5024939947170972, + "learning_rate": 6.451598726485598e-08, + "loss": 0.4834, + "mean_token_accuracy": 0.9100651741027832, + "num_tokens": 204940226.0, + "step": 9514 + }, + { + "epoch": 0.9505969329137319, + "grad_norm": 0.46882416569990476, + "learning_rate": 6.425718373637923e-08, + "loss": 0.4897, + "mean_token_accuracy": 0.907373309135437, + "num_tokens": 205021724.0, + "step": 9515 + }, + { + "epoch": 0.9506968380038963, + "grad_norm": 0.40595609807356425, + "learning_rate": 6.399889698500472e-08, + "loss": 0.4904, + "mean_token_accuracy": 0.9109266400337219, + "num_tokens": 205103204.0, + "step": 9516 + }, + { + "epoch": 0.9507967430940606, + "grad_norm": 0.4930051560788597, + "learning_rate": 6.374112703777302e-08, + "loss": 0.4916, + "mean_token_accuracy": 0.9072894752025604, + "num_tokens": 205184672.0, + "step": 9517 + }, + { + "epoch": 0.950896648184225, + "grad_norm": 0.43246988532096564, + "learning_rate": 6.348387392167532e-08, + "loss": 0.489, + "mean_token_accuracy": 0.9070815443992615, + "num_tokens": 205266158.0, + "step": 9518 + }, + { + "epoch": 0.9509965532743894, + "grad_norm": 0.5551195084073545, + "learning_rate": 6.322713766364453e-08, + "loss": 0.4934, + "mean_token_accuracy": 0.9078743755817413, + "num_tokens": 205347612.0, + "step": 9519 + }, + { + "epoch": 0.9510964583645537, + "grad_norm": 0.6306251079495461, + "learning_rate": 6.29709182905619e-08, + "loss": 0.4867, + "mean_token_accuracy": 0.9058777093887329, + "num_tokens": 205429189.0, + "step": 9520 + }, + { + "epoch": 0.951196363454718, + "grad_norm": 0.45541081701774977, + "learning_rate": 6.271521582925432e-08, + "loss": 0.4894, + "mean_token_accuracy": 0.9056976735591888, + "num_tokens": 205510709.0, + "step": 9521 + }, + { + "epoch": 0.9512962685448824, + "grad_norm": 0.417798164137379, + "learning_rate": 6.246003030649318e-08, + "loss": 0.4879, + "mean_token_accuracy": 0.9086009562015533, + "num_tokens": 205592267.0, + "step": 9522 + }, + { + "epoch": 0.9513961736350467, + "grad_norm": 0.5563204089234929, + "learning_rate": 6.22053617489965e-08, + "loss": 0.4938, + "mean_token_accuracy": 0.9048750102519989, + "num_tokens": 205673719.0, + "step": 9523 + }, + { + "epoch": 0.951496078725211, + "grad_norm": 0.5749074399038993, + "learning_rate": 6.195121018342909e-08, + "loss": 0.4845, + "mean_token_accuracy": 0.9095174372196198, + "num_tokens": 205755321.0, + "step": 9524 + }, + { + "epoch": 0.9515959838153754, + "grad_norm": 0.4161337028867574, + "learning_rate": 6.169757563640077e-08, + "loss": 0.4927, + "mean_token_accuracy": 0.9082493484020233, + "num_tokens": 205836777.0, + "step": 9525 + }, + { + "epoch": 0.9516958889055397, + "grad_norm": 0.40762087936674624, + "learning_rate": 6.144445813446754e-08, + "loss": 0.4855, + "mean_token_accuracy": 0.9071370959281921, + "num_tokens": 205918297.0, + "step": 9526 + }, + { + "epoch": 0.9517957939957041, + "grad_norm": 0.49989094199913753, + "learning_rate": 6.119185770413038e-08, + "loss": 0.4818, + "mean_token_accuracy": 0.9098529517650604, + "num_tokens": 205999907.0, + "step": 9527 + }, + { + "epoch": 0.9518956990858685, + "grad_norm": 0.3952245314956762, + "learning_rate": 6.093977437183706e-08, + "loss": 0.4863, + "mean_token_accuracy": 0.9093958139419556, + "num_tokens": 206081433.0, + "step": 9528 + }, + { + "epoch": 0.9519956041760328, + "grad_norm": 0.386672050784707, + "learning_rate": 6.068820816398091e-08, + "loss": 0.4817, + "mean_token_accuracy": 0.9094692766666412, + "num_tokens": 206163024.0, + "step": 9529 + }, + { + "epoch": 0.9520955092661971, + "grad_norm": 0.4054732131596339, + "learning_rate": 6.04371591069014e-08, + "loss": 0.4858, + "mean_token_accuracy": 0.9094986021518707, + "num_tokens": 206244604.0, + "step": 9530 + }, + { + "epoch": 0.9521954143563615, + "grad_norm": 0.3784538239118251, + "learning_rate": 6.018662722688362e-08, + "loss": 0.4849, + "mean_token_accuracy": 0.9098161458969116, + "num_tokens": 206326144.0, + "step": 9531 + }, + { + "epoch": 0.9522953194465258, + "grad_norm": 0.4137670564225996, + "learning_rate": 5.99366125501577e-08, + "loss": 0.4913, + "mean_token_accuracy": 0.9066129624843597, + "num_tokens": 206407620.0, + "step": 9532 + }, + { + "epoch": 0.9523952245366901, + "grad_norm": 0.47414573722411885, + "learning_rate": 5.968711510290159e-08, + "loss": 0.4882, + "mean_token_accuracy": 0.9067186415195465, + "num_tokens": 206489125.0, + "step": 9533 + }, + { + "epoch": 0.9524951296268545, + "grad_norm": 0.42290931558008554, + "learning_rate": 5.943813491123718e-08, + "loss": 0.4835, + "mean_token_accuracy": 0.9089045226573944, + "num_tokens": 206570722.0, + "step": 9534 + }, + { + "epoch": 0.9525950347170188, + "grad_norm": 0.9598257086858807, + "learning_rate": 5.9189672001233046e-08, + "loss": 0.4912, + "mean_token_accuracy": 0.905344694852829, + "num_tokens": 206652169.0, + "step": 9535 + }, + { + "epoch": 0.9526949398071832, + "grad_norm": 0.48627748768805595, + "learning_rate": 5.8941726398903944e-08, + "loss": 0.4943, + "mean_token_accuracy": 0.9060976505279541, + "num_tokens": 206733627.0, + "step": 9536 + }, + { + "epoch": 0.9527948448973476, + "grad_norm": 0.40057081170610037, + "learning_rate": 5.8694298130209106e-08, + "loss": 0.4846, + "mean_token_accuracy": 0.9107274413108826, + "num_tokens": 206815151.0, + "step": 9537 + }, + { + "epoch": 0.9528947499875119, + "grad_norm": 0.5334755039365269, + "learning_rate": 5.844738722105558e-08, + "loss": 0.4883, + "mean_token_accuracy": 0.9081845283508301, + "num_tokens": 206896707.0, + "step": 9538 + }, + { + "epoch": 0.9529946550776762, + "grad_norm": 0.49939203399637266, + "learning_rate": 5.820099369729437e-08, + "loss": 0.486, + "mean_token_accuracy": 0.9090007245540619, + "num_tokens": 206978247.0, + "step": 9539 + }, + { + "epoch": 0.9530945601678406, + "grad_norm": 0.43312405220069944, + "learning_rate": 5.795511758472372e-08, + "loss": 0.4892, + "mean_token_accuracy": 0.9081860780715942, + "num_tokens": 207059707.0, + "step": 9540 + }, + { + "epoch": 0.9531944652580049, + "grad_norm": 0.4849086499244865, + "learning_rate": 5.770975890908692e-08, + "loss": 0.4932, + "mean_token_accuracy": 0.905805766582489, + "num_tokens": 207141161.0, + "step": 9541 + }, + { + "epoch": 0.9532943703481692, + "grad_norm": 0.42720588322752484, + "learning_rate": 5.746491769607288e-08, + "loss": 0.4869, + "mean_token_accuracy": 0.906084269285202, + "num_tokens": 207222678.0, + "step": 9542 + }, + { + "epoch": 0.9533942754383335, + "grad_norm": 0.44900150710985276, + "learning_rate": 5.722059397131663e-08, + "loss": 0.4887, + "mean_token_accuracy": 0.9077425003051758, + "num_tokens": 207304188.0, + "step": 9543 + }, + { + "epoch": 0.9534941805284979, + "grad_norm": 0.5455954141204901, + "learning_rate": 5.697678776039994e-08, + "loss": 0.4909, + "mean_token_accuracy": 0.9071772396564484, + "num_tokens": 207385658.0, + "step": 9544 + }, + { + "epoch": 0.9535940856186623, + "grad_norm": 0.4932545550902125, + "learning_rate": 5.6733499088849064e-08, + "loss": 0.487, + "mean_token_accuracy": 0.9090154469013214, + "num_tokens": 207467220.0, + "step": 9545 + }, + { + "epoch": 0.9536939907088267, + "grad_norm": 0.6467344445501951, + "learning_rate": 5.64907279821364e-08, + "loss": 0.4881, + "mean_token_accuracy": 0.909430593252182, + "num_tokens": 207548749.0, + "step": 9546 + }, + { + "epoch": 0.953793895798991, + "grad_norm": 0.5261551188958432, + "learning_rate": 5.624847446568049e-08, + "loss": 0.4888, + "mean_token_accuracy": 0.9088085293769836, + "num_tokens": 207630250.0, + "step": 9547 + }, + { + "epoch": 0.9538938008891553, + "grad_norm": 0.4263842273966351, + "learning_rate": 5.600673856484606e-08, + "loss": 0.4946, + "mean_token_accuracy": 0.907170295715332, + "num_tokens": 207711654.0, + "step": 9548 + }, + { + "epoch": 0.9539937059793197, + "grad_norm": 0.4614875463775571, + "learning_rate": 5.576552030494176e-08, + "loss": 0.4826, + "mean_token_accuracy": 0.9100629091262817, + "num_tokens": 207793239.0, + "step": 9549 + }, + { + "epoch": 0.954093611069484, + "grad_norm": 0.42211431260265153, + "learning_rate": 5.552481971122459e-08, + "loss": 0.492, + "mean_token_accuracy": 0.908145010471344, + "num_tokens": 207874675.0, + "step": 9550 + }, + { + "epoch": 0.9541935161596483, + "grad_norm": 0.6168305553635597, + "learning_rate": 5.528463680889606e-08, + "loss": 0.4882, + "mean_token_accuracy": 0.9064145386219025, + "num_tokens": 207956201.0, + "step": 9551 + }, + { + "epoch": 0.9542934212498126, + "grad_norm": 0.48904736663183723, + "learning_rate": 5.5044971623102736e-08, + "loss": 0.4881, + "mean_token_accuracy": 0.9078790247440338, + "num_tokens": 208037743.0, + "step": 9552 + }, + { + "epoch": 0.954393326339977, + "grad_norm": 0.481809024811085, + "learning_rate": 5.480582417893787e-08, + "loss": 0.4907, + "mean_token_accuracy": 0.9064386188983917, + "num_tokens": 208119238.0, + "step": 9553 + }, + { + "epoch": 0.9544932314301414, + "grad_norm": 0.4268964161045045, + "learning_rate": 5.456719450144143e-08, + "loss": 0.4813, + "mean_token_accuracy": 0.9113510549068451, + "num_tokens": 208200827.0, + "step": 9554 + }, + { + "epoch": 0.9545931365203058, + "grad_norm": 0.46467170077119224, + "learning_rate": 5.432908261559733e-08, + "loss": 0.4836, + "mean_token_accuracy": 0.9098673760890961, + "num_tokens": 208282415.0, + "step": 9555 + }, + { + "epoch": 0.9546930416104701, + "grad_norm": 0.4861323463355608, + "learning_rate": 5.409148854633672e-08, + "loss": 0.4939, + "mean_token_accuracy": 0.9092118144035339, + "num_tokens": 208363865.0, + "step": 9556 + }, + { + "epoch": 0.9547929467006344, + "grad_norm": 0.5797829103792868, + "learning_rate": 5.3854412318535276e-08, + "loss": 0.4922, + "mean_token_accuracy": 0.9057935178279877, + "num_tokens": 208445328.0, + "step": 9557 + }, + { + "epoch": 0.9548928517907987, + "grad_norm": 0.5219407167173696, + "learning_rate": 5.361785395701591e-08, + "loss": 0.4896, + "mean_token_accuracy": 0.9060928523540497, + "num_tokens": 208526840.0, + "step": 9558 + }, + { + "epoch": 0.9549927568809631, + "grad_norm": 0.42058087036039316, + "learning_rate": 5.338181348654548e-08, + "loss": 0.491, + "mean_token_accuracy": 0.9081389307975769, + "num_tokens": 208608294.0, + "step": 9559 + }, + { + "epoch": 0.9550926619711274, + "grad_norm": 0.7291848153020161, + "learning_rate": 5.314629093183865e-08, + "loss": 0.493, + "mean_token_accuracy": 0.9068372249603271, + "num_tokens": 208689801.0, + "step": 9560 + }, + { + "epoch": 0.9551925670612917, + "grad_norm": 0.9615957419775644, + "learning_rate": 5.291128631755349e-08, + "loss": 0.4977, + "mean_token_accuracy": 0.905561238527298, + "num_tokens": 208771229.0, + "step": 9561 + }, + { + "epoch": 0.9552924721514561, + "grad_norm": 0.48045365367978915, + "learning_rate": 5.267679966829697e-08, + "loss": 0.4856, + "mean_token_accuracy": 0.9100370705127716, + "num_tokens": 208852777.0, + "step": 9562 + }, + { + "epoch": 0.9553923772416204, + "grad_norm": 0.5040155209680034, + "learning_rate": 5.244283100861891e-08, + "loss": 0.486, + "mean_token_accuracy": 0.9086353182792664, + "num_tokens": 208934303.0, + "step": 9563 + }, + { + "epoch": 0.9554922823317848, + "grad_norm": 0.44911954997525255, + "learning_rate": 5.2209380363016924e-08, + "loss": 0.4869, + "mean_token_accuracy": 0.9093825817108154, + "num_tokens": 209015818.0, + "step": 9564 + }, + { + "epoch": 0.9555921874219492, + "grad_norm": 0.4145954032850876, + "learning_rate": 5.197644775593258e-08, + "loss": 0.4847, + "mean_token_accuracy": 0.90692138671875, + "num_tokens": 209097374.0, + "step": 9565 + }, + { + "epoch": 0.9556920925121135, + "grad_norm": 0.4728185344431707, + "learning_rate": 5.174403321175414e-08, + "loss": 0.4867, + "mean_token_accuracy": 0.9082363843917847, + "num_tokens": 209178889.0, + "step": 9566 + }, + { + "epoch": 0.9557919976022778, + "grad_norm": 0.4176229539524707, + "learning_rate": 5.151213675481659e-08, + "loss": 0.4872, + "mean_token_accuracy": 0.9093952476978302, + "num_tokens": 209260370.0, + "step": 9567 + }, + { + "epoch": 0.9558919026924422, + "grad_norm": 0.4103824006219394, + "learning_rate": 5.128075840939883e-08, + "loss": 0.4844, + "mean_token_accuracy": 0.9098670780658722, + "num_tokens": 209341958.0, + "step": 9568 + }, + { + "epoch": 0.9559918077826065, + "grad_norm": 0.4063289068672527, + "learning_rate": 5.104989819972706e-08, + "loss": 0.4919, + "mean_token_accuracy": 0.9061520099639893, + "num_tokens": 209423431.0, + "step": 9569 + }, + { + "epoch": 0.9560917128727708, + "grad_norm": 0.4282691132302206, + "learning_rate": 5.081955614997247e-08, + "loss": 0.4879, + "mean_token_accuracy": 0.9086818099021912, + "num_tokens": 209504950.0, + "step": 9570 + }, + { + "epoch": 0.9561916179629352, + "grad_norm": 0.45786566498104564, + "learning_rate": 5.0589732284251345e-08, + "loss": 0.4896, + "mean_token_accuracy": 0.9101666212081909, + "num_tokens": 209586469.0, + "step": 9571 + }, + { + "epoch": 0.9562915230530995, + "grad_norm": 0.47236321989575897, + "learning_rate": 5.036042662662721e-08, + "loss": 0.4914, + "mean_token_accuracy": 0.90825155377388, + "num_tokens": 209667964.0, + "step": 9572 + }, + { + "epoch": 0.9563914281432639, + "grad_norm": 0.4226919934662899, + "learning_rate": 5.013163920110864e-08, + "loss": 0.4898, + "mean_token_accuracy": 0.910307377576828, + "num_tokens": 209749438.0, + "step": 9573 + }, + { + "epoch": 0.9564913332334283, + "grad_norm": 0.46624747165589964, + "learning_rate": 4.9903370031649246e-08, + "loss": 0.4893, + "mean_token_accuracy": 0.9088124632835388, + "num_tokens": 209830906.0, + "step": 9574 + }, + { + "epoch": 0.9565912383235926, + "grad_norm": 0.5460715104791825, + "learning_rate": 4.9675619142149356e-08, + "loss": 0.4889, + "mean_token_accuracy": 0.9072876572608948, + "num_tokens": 209912419.0, + "step": 9575 + }, + { + "epoch": 0.9566911434137569, + "grad_norm": 1.0185357758401408, + "learning_rate": 4.94483865564549e-08, + "loss": 0.4861, + "mean_token_accuracy": 0.9087967276573181, + "num_tokens": 209993973.0, + "step": 9576 + }, + { + "epoch": 0.9567910485039213, + "grad_norm": 0.4745376874996216, + "learning_rate": 4.9221672298357393e-08, + "loss": 0.4857, + "mean_token_accuracy": 0.9077712893486023, + "num_tokens": 210075508.0, + "step": 9577 + }, + { + "epoch": 0.9568909535940856, + "grad_norm": 0.511936777718955, + "learning_rate": 4.899547639159397e-08, + "loss": 0.4865, + "mean_token_accuracy": 0.9070868790149689, + "num_tokens": 210157056.0, + "step": 9578 + }, + { + "epoch": 0.9569908586842499, + "grad_norm": 0.5110259530761786, + "learning_rate": 4.876979885984734e-08, + "loss": 0.4924, + "mean_token_accuracy": 0.903995156288147, + "num_tokens": 210238541.0, + "step": 9579 + }, + { + "epoch": 0.9570907637744143, + "grad_norm": 0.6223542203188401, + "learning_rate": 4.854463972674639e-08, + "loss": 0.4887, + "mean_token_accuracy": 0.9063309133052826, + "num_tokens": 210320048.0, + "step": 9580 + }, + { + "epoch": 0.9571906688645786, + "grad_norm": 0.44857358204436865, + "learning_rate": 4.8319999015865035e-08, + "loss": 0.4831, + "mean_token_accuracy": 0.9063825905323029, + "num_tokens": 210401634.0, + "step": 9581 + }, + { + "epoch": 0.957290573954743, + "grad_norm": 0.47400985280050273, + "learning_rate": 4.809587675072447e-08, + "loss": 0.4858, + "mean_token_accuracy": 0.9077596664428711, + "num_tokens": 210483189.0, + "step": 9582 + }, + { + "epoch": 0.9573904790449074, + "grad_norm": 0.5031073159624493, + "learning_rate": 4.7872272954789244e-08, + "loss": 0.4939, + "mean_token_accuracy": 0.9058516621589661, + "num_tokens": 210564628.0, + "step": 9583 + }, + { + "epoch": 0.9574903841350717, + "grad_norm": 0.530119687125444, + "learning_rate": 4.764918765147175e-08, + "loss": 0.4874, + "mean_token_accuracy": 0.9080368280410767, + "num_tokens": 210646149.0, + "step": 9584 + }, + { + "epoch": 0.957590289225236, + "grad_norm": 0.3941849826595599, + "learning_rate": 4.742662086412886e-08, + "loss": 0.4858, + "mean_token_accuracy": 0.906387209892273, + "num_tokens": 210727694.0, + "step": 9585 + }, + { + "epoch": 0.9576901943154004, + "grad_norm": 0.40902394185547836, + "learning_rate": 4.720457261606304e-08, + "loss": 0.4898, + "mean_token_accuracy": 0.9068315923213959, + "num_tokens": 210809189.0, + "step": 9586 + }, + { + "epoch": 0.9577900994055647, + "grad_norm": 0.44811557538495184, + "learning_rate": 4.6983042930524023e-08, + "loss": 0.4845, + "mean_token_accuracy": 0.9056384563446045, + "num_tokens": 210890776.0, + "step": 9587 + }, + { + "epoch": 0.957890004495729, + "grad_norm": 0.43820455232444605, + "learning_rate": 4.676203183070604e-08, + "loss": 0.4875, + "mean_token_accuracy": 0.9111903309822083, + "num_tokens": 210972270.0, + "step": 9588 + }, + { + "epoch": 0.9579899095858934, + "grad_norm": 0.38729183468701817, + "learning_rate": 4.6541539339748366e-08, + "loss": 0.4913, + "mean_token_accuracy": 0.9081854820251465, + "num_tokens": 211053748.0, + "step": 9589 + }, + { + "epoch": 0.9580898146760577, + "grad_norm": 0.4404985423439748, + "learning_rate": 4.632156548073752e-08, + "loss": 0.4884, + "mean_token_accuracy": 0.9087126553058624, + "num_tokens": 211135248.0, + "step": 9590 + }, + { + "epoch": 0.9581897197662221, + "grad_norm": 0.48689017903384424, + "learning_rate": 4.610211027670397e-08, + "loss": 0.4882, + "mean_token_accuracy": 0.9100488126277924, + "num_tokens": 211216732.0, + "step": 9591 + }, + { + "epoch": 0.9582896248563865, + "grad_norm": 0.4509708815496384, + "learning_rate": 4.588317375062656e-08, + "loss": 0.483, + "mean_token_accuracy": 0.9103940427303314, + "num_tokens": 211298312.0, + "step": 9592 + }, + { + "epoch": 0.9583895299465508, + "grad_norm": 0.4793343722949726, + "learning_rate": 4.5664755925426406e-08, + "loss": 0.4914, + "mean_token_accuracy": 0.9062987267971039, + "num_tokens": 211379785.0, + "step": 9593 + }, + { + "epoch": 0.9584894350367151, + "grad_norm": 0.44515877111713315, + "learning_rate": 4.544685682397354e-08, + "loss": 0.4829, + "mean_token_accuracy": 0.9070899784564972, + "num_tokens": 211461368.0, + "step": 9594 + }, + { + "epoch": 0.9585893401268795, + "grad_norm": 0.5227615137482271, + "learning_rate": 4.522947646908138e-08, + "loss": 0.4892, + "mean_token_accuracy": 0.9054054021835327, + "num_tokens": 211542884.0, + "step": 9595 + }, + { + "epoch": 0.9586892452170438, + "grad_norm": 0.47436429069063, + "learning_rate": 4.5012614883510054e-08, + "loss": 0.4874, + "mean_token_accuracy": 0.9085226655006409, + "num_tokens": 211624407.0, + "step": 9596 + }, + { + "epoch": 0.9587891503072081, + "grad_norm": 0.4992706798669938, + "learning_rate": 4.479627208996529e-08, + "loss": 0.4947, + "mean_token_accuracy": 0.9056359827518463, + "num_tokens": 211705818.0, + "step": 9597 + }, + { + "epoch": 0.9588890553973725, + "grad_norm": 0.5028454272376454, + "learning_rate": 4.458044811109785e-08, + "loss": 0.4936, + "mean_token_accuracy": 0.9083719253540039, + "num_tokens": 211787289.0, + "step": 9598 + }, + { + "epoch": 0.9589889604875368, + "grad_norm": 0.4427733289246312, + "learning_rate": 4.436514296950523e-08, + "loss": 0.4835, + "mean_token_accuracy": 0.9087640941143036, + "num_tokens": 211868893.0, + "step": 9599 + }, + { + "epoch": 0.9590888655777012, + "grad_norm": 0.45326165580001854, + "learning_rate": 4.4150356687729935e-08, + "loss": 0.4877, + "mean_token_accuracy": 0.9091308414936066, + "num_tokens": 211950365.0, + "step": 9600 + }, + { + "epoch": 0.9591887706678656, + "grad_norm": 0.45012110792014776, + "learning_rate": 4.3936089288260654e-08, + "loss": 0.4875, + "mean_token_accuracy": 0.907136082649231, + "num_tokens": 212031877.0, + "step": 9601 + }, + { + "epoch": 0.9592886757580299, + "grad_norm": 0.4841276405499881, + "learning_rate": 4.3722340793531106e-08, + "loss": 0.48, + "mean_token_accuracy": 0.9091916978359222, + "num_tokens": 212113518.0, + "step": 9602 + }, + { + "epoch": 0.9593885808481942, + "grad_norm": 0.46189794837520987, + "learning_rate": 4.350911122592061e-08, + "loss": 0.4867, + "mean_token_accuracy": 0.9077928960323334, + "num_tokens": 212195062.0, + "step": 9603 + }, + { + "epoch": 0.9594884859383586, + "grad_norm": 0.44380130768707027, + "learning_rate": 4.3296400607755193e-08, + "loss": 0.4875, + "mean_token_accuracy": 0.908366858959198, + "num_tokens": 212276589.0, + "step": 9604 + }, + { + "epoch": 0.9595883910285229, + "grad_norm": 0.5189406447451232, + "learning_rate": 4.308420896130483e-08, + "loss": 0.4961, + "mean_token_accuracy": 0.9071078598499298, + "num_tokens": 212358043.0, + "step": 9605 + }, + { + "epoch": 0.9596882961186872, + "grad_norm": 0.6924419636640576, + "learning_rate": 4.2872536308787847e-08, + "loss": 0.4833, + "mean_token_accuracy": 0.9094472527503967, + "num_tokens": 212439630.0, + "step": 9606 + }, + { + "epoch": 0.9597882012088516, + "grad_norm": 0.45360421496003966, + "learning_rate": 4.266138267236486e-08, + "loss": 0.4843, + "mean_token_accuracy": 0.9094983339309692, + "num_tokens": 212521210.0, + "step": 9607 + }, + { + "epoch": 0.9598881062990159, + "grad_norm": 0.5092145096845399, + "learning_rate": 4.2450748074144844e-08, + "loss": 0.4917, + "mean_token_accuracy": 0.9064461290836334, + "num_tokens": 212602687.0, + "step": 9608 + }, + { + "epoch": 0.9599880113891803, + "grad_norm": 0.4239190675558457, + "learning_rate": 4.224063253618127e-08, + "loss": 0.4856, + "mean_token_accuracy": 0.9086397886276245, + "num_tokens": 212684228.0, + "step": 9609 + }, + { + "epoch": 0.9600879164793447, + "grad_norm": 0.47962116327544135, + "learning_rate": 4.203103608047321e-08, + "loss": 0.4888, + "mean_token_accuracy": 0.9074399173259735, + "num_tokens": 212765742.0, + "step": 9610 + }, + { + "epoch": 0.960187821569509, + "grad_norm": 0.43288928839666163, + "learning_rate": 4.182195872896588e-08, + "loss": 0.4918, + "mean_token_accuracy": 0.9082578122615814, + "num_tokens": 212847183.0, + "step": 9611 + }, + { + "epoch": 0.9602877266596733, + "grad_norm": 0.6520467383464623, + "learning_rate": 4.1613400503550114e-08, + "loss": 0.4834, + "mean_token_accuracy": 0.9084227383136749, + "num_tokens": 212928759.0, + "step": 9612 + }, + { + "epoch": 0.9603876317498377, + "grad_norm": 0.4061607244558005, + "learning_rate": 4.140536142606177e-08, + "loss": 0.4857, + "mean_token_accuracy": 0.9102328419685364, + "num_tokens": 213010290.0, + "step": 9613 + }, + { + "epoch": 0.960487536840002, + "grad_norm": 0.5090643962304451, + "learning_rate": 4.119784151828288e-08, + "loss": 0.4901, + "mean_token_accuracy": 0.9098085165023804, + "num_tokens": 213091767.0, + "step": 9614 + }, + { + "epoch": 0.9605874419301663, + "grad_norm": 0.4713210602249943, + "learning_rate": 4.099084080194049e-08, + "loss": 0.489, + "mean_token_accuracy": 0.9078930616378784, + "num_tokens": 213173308.0, + "step": 9615 + }, + { + "epoch": 0.9606873470203307, + "grad_norm": 0.46726214840895336, + "learning_rate": 4.078435929870839e-08, + "loss": 0.4951, + "mean_token_accuracy": 0.9058213531970978, + "num_tokens": 213254732.0, + "step": 9616 + }, + { + "epoch": 0.960787252110495, + "grad_norm": 0.43361354677116487, + "learning_rate": 4.057839703020483e-08, + "loss": 0.4862, + "mean_token_accuracy": 0.9071291387081146, + "num_tokens": 213336263.0, + "step": 9617 + }, + { + "epoch": 0.9608871572006594, + "grad_norm": 0.666811345811288, + "learning_rate": 4.0372954017995346e-08, + "loss": 0.4892, + "mean_token_accuracy": 0.9062985777854919, + "num_tokens": 213417788.0, + "step": 9618 + }, + { + "epoch": 0.9609870622908238, + "grad_norm": 0.4350041076607782, + "learning_rate": 4.01680302835894e-08, + "loss": 0.4902, + "mean_token_accuracy": 0.9053328633308411, + "num_tokens": 213499239.0, + "step": 9619 + }, + { + "epoch": 0.9610869673809881, + "grad_norm": 0.6931489913209726, + "learning_rate": 3.996362584844204e-08, + "loss": 0.4914, + "mean_token_accuracy": 0.9087207913398743, + "num_tokens": 213580677.0, + "step": 9620 + }, + { + "epoch": 0.9611868724711524, + "grad_norm": 0.5367957941431848, + "learning_rate": 3.975974073395561e-08, + "loss": 0.4854, + "mean_token_accuracy": 0.90767902135849, + "num_tokens": 213662254.0, + "step": 9621 + }, + { + "epoch": 0.9612867775613168, + "grad_norm": 1.1784427176709946, + "learning_rate": 3.955637496147635e-08, + "loss": 0.4869, + "mean_token_accuracy": 0.9065560400485992, + "num_tokens": 213743762.0, + "step": 9622 + }, + { + "epoch": 0.9613866826514811, + "grad_norm": 0.5113241118665179, + "learning_rate": 3.935352855229724e-08, + "loss": 0.4817, + "mean_token_accuracy": 0.9119596481323242, + "num_tokens": 213825367.0, + "step": 9623 + }, + { + "epoch": 0.9614865877416454, + "grad_norm": 0.7007052383348814, + "learning_rate": 3.915120152765684e-08, + "loss": 0.4931, + "mean_token_accuracy": 0.9047211706638336, + "num_tokens": 213906858.0, + "step": 9624 + }, + { + "epoch": 0.9615864928318097, + "grad_norm": 0.45870266462614845, + "learning_rate": 3.8949393908738195e-08, + "loss": 0.4944, + "mean_token_accuracy": 0.9069011807441711, + "num_tokens": 213988306.0, + "step": 9625 + }, + { + "epoch": 0.9616863979219741, + "grad_norm": 0.46862551140218806, + "learning_rate": 3.874810571667109e-08, + "loss": 0.4881, + "mean_token_accuracy": 0.9105350375175476, + "num_tokens": 214069798.0, + "step": 9626 + }, + { + "epoch": 0.9617863030121385, + "grad_norm": 0.5326569232294098, + "learning_rate": 3.854733697253088e-08, + "loss": 0.4875, + "mean_token_accuracy": 0.907804399728775, + "num_tokens": 214151312.0, + "step": 9627 + }, + { + "epoch": 0.9618862081023029, + "grad_norm": 0.5689418200264549, + "learning_rate": 3.834708769733797e-08, + "loss": 0.4885, + "mean_token_accuracy": 0.9085322320461273, + "num_tokens": 214232875.0, + "step": 9628 + }, + { + "epoch": 0.9619861131924672, + "grad_norm": 0.49199963078868336, + "learning_rate": 3.814735791205893e-08, + "loss": 0.4841, + "mean_token_accuracy": 0.9103612005710602, + "num_tokens": 214314422.0, + "step": 9629 + }, + { + "epoch": 0.9620860182826315, + "grad_norm": 1.529145036418149, + "learning_rate": 3.794814763760479e-08, + "loss": 0.4945, + "mean_token_accuracy": 0.9079782366752625, + "num_tokens": 214395884.0, + "step": 9630 + }, + { + "epoch": 0.9621859233727958, + "grad_norm": 0.583561987377197, + "learning_rate": 3.774945689483445e-08, + "loss": 0.4891, + "mean_token_accuracy": 0.9076695740222931, + "num_tokens": 214477408.0, + "step": 9631 + }, + { + "epoch": 0.9622858284629602, + "grad_norm": 0.8797568280708856, + "learning_rate": 3.7551285704549576e-08, + "loss": 0.4891, + "mean_token_accuracy": 0.9076666831970215, + "num_tokens": 214558909.0, + "step": 9632 + }, + { + "epoch": 0.9623857335531245, + "grad_norm": 0.48822799371350717, + "learning_rate": 3.735363408750026e-08, + "loss": 0.4883, + "mean_token_accuracy": 0.90940722823143, + "num_tokens": 214640424.0, + "step": 9633 + }, + { + "epoch": 0.9624856386432888, + "grad_norm": 0.727544023525746, + "learning_rate": 3.7156502064379374e-08, + "loss": 0.4877, + "mean_token_accuracy": 0.9111944437026978, + "num_tokens": 214721899.0, + "step": 9634 + }, + { + "epoch": 0.9625855437334532, + "grad_norm": 0.5645968502588855, + "learning_rate": 3.69598896558282e-08, + "loss": 0.4887, + "mean_token_accuracy": 0.9094114303588867, + "num_tokens": 214803427.0, + "step": 9635 + }, + { + "epoch": 0.9626854488236176, + "grad_norm": 0.47356442858614844, + "learning_rate": 3.676379688243192e-08, + "loss": 0.492, + "mean_token_accuracy": 0.9068838655948639, + "num_tokens": 214884927.0, + "step": 9636 + }, + { + "epoch": 0.962785353913782, + "grad_norm": 0.44825324338782974, + "learning_rate": 3.65682237647208e-08, + "loss": 0.4867, + "mean_token_accuracy": 0.9077653586864471, + "num_tokens": 214966478.0, + "step": 9637 + }, + { + "epoch": 0.9628852590039463, + "grad_norm": 0.5255661877185932, + "learning_rate": 3.637317032317234e-08, + "loss": 0.4921, + "mean_token_accuracy": 0.9073395431041718, + "num_tokens": 215047952.0, + "step": 9638 + }, + { + "epoch": 0.9629851640941106, + "grad_norm": 0.45793440678851965, + "learning_rate": 3.6178636578208525e-08, + "loss": 0.4804, + "mean_token_accuracy": 0.9072031676769257, + "num_tokens": 215129553.0, + "step": 9639 + }, + { + "epoch": 0.9630850691842749, + "grad_norm": 0.6809788490111263, + "learning_rate": 3.598462255019752e-08, + "loss": 0.4904, + "mean_token_accuracy": 0.9071299433708191, + "num_tokens": 215211044.0, + "step": 9640 + }, + { + "epoch": 0.9631849742744393, + "grad_norm": 0.41154249248358066, + "learning_rate": 3.5791128259452525e-08, + "loss": 0.4867, + "mean_token_accuracy": 0.910077691078186, + "num_tokens": 215292552.0, + "step": 9641 + }, + { + "epoch": 0.9632848793646036, + "grad_norm": 0.6900671405012435, + "learning_rate": 3.559815372623288e-08, + "loss": 0.4842, + "mean_token_accuracy": 0.9090079069137573, + "num_tokens": 215374157.0, + "step": 9642 + }, + { + "epoch": 0.9633847844547679, + "grad_norm": 0.42114843730612356, + "learning_rate": 3.5405698970742416e-08, + "loss": 0.4891, + "mean_token_accuracy": 0.9091235399246216, + "num_tokens": 215455661.0, + "step": 9643 + }, + { + "epoch": 0.9634846895449323, + "grad_norm": 0.3797624592583178, + "learning_rate": 3.521376401313226e-08, + "loss": 0.4833, + "mean_token_accuracy": 0.9121258854866028, + "num_tokens": 215537191.0, + "step": 9644 + }, + { + "epoch": 0.9635845946350966, + "grad_norm": 0.5677425406724891, + "learning_rate": 3.5022348873497426e-08, + "loss": 0.4914, + "mean_token_accuracy": 0.9089153409004211, + "num_tokens": 215618656.0, + "step": 9645 + }, + { + "epoch": 0.963684499725261, + "grad_norm": 0.434734054463317, + "learning_rate": 3.483145357187967e-08, + "loss": 0.49, + "mean_token_accuracy": 0.9059318602085114, + "num_tokens": 215700111.0, + "step": 9646 + }, + { + "epoch": 0.9637844048154254, + "grad_norm": 0.41426578236247485, + "learning_rate": 3.4641078128266336e-08, + "loss": 0.4909, + "mean_token_accuracy": 0.9071386754512787, + "num_tokens": 215781588.0, + "step": 9647 + }, + { + "epoch": 0.9638843099055897, + "grad_norm": 0.4506583222709187, + "learning_rate": 3.445122256258926e-08, + "loss": 0.4863, + "mean_token_accuracy": 0.9080137312412262, + "num_tokens": 215863127.0, + "step": 9648 + }, + { + "epoch": 0.963984214995754, + "grad_norm": 0.408916091911987, + "learning_rate": 3.4261886894726424e-08, + "loss": 0.4859, + "mean_token_accuracy": 0.9085707664489746, + "num_tokens": 215944654.0, + "step": 9649 + }, + { + "epoch": 0.9640841200859184, + "grad_norm": 0.4848297918084732, + "learning_rate": 3.407307114450253e-08, + "loss": 0.495, + "mean_token_accuracy": 0.9052442908287048, + "num_tokens": 216026083.0, + "step": 9650 + }, + { + "epoch": 0.9641840251760827, + "grad_norm": 0.4743968073937152, + "learning_rate": 3.3884775331685105e-08, + "loss": 0.4924, + "mean_token_accuracy": 0.9038590490818024, + "num_tokens": 216107576.0, + "step": 9651 + }, + { + "epoch": 0.964283930266247, + "grad_norm": 0.3978724238033031, + "learning_rate": 3.3696999475990585e-08, + "loss": 0.4856, + "mean_token_accuracy": 0.9096372425556183, + "num_tokens": 216189090.0, + "step": 9652 + }, + { + "epoch": 0.9643838353564114, + "grad_norm": 0.3955702539221929, + "learning_rate": 3.35097435970777e-08, + "loss": 0.4845, + "mean_token_accuracy": 0.9088348150253296, + "num_tokens": 216270645.0, + "step": 9653 + }, + { + "epoch": 0.9644837404465757, + "grad_norm": 0.4455351270435082, + "learning_rate": 3.3323007714553546e-08, + "loss": 0.4906, + "mean_token_accuracy": 0.9078717827796936, + "num_tokens": 216352088.0, + "step": 9654 + }, + { + "epoch": 0.9645836455367401, + "grad_norm": 0.7387468659279118, + "learning_rate": 3.313679184796914e-08, + "loss": 0.4918, + "mean_token_accuracy": 0.9092593193054199, + "num_tokens": 216433559.0, + "step": 9655 + }, + { + "epoch": 0.9646835506269045, + "grad_norm": 0.39720812041624165, + "learning_rate": 3.295109601682167e-08, + "loss": 0.484, + "mean_token_accuracy": 0.9084287881851196, + "num_tokens": 216515102.0, + "step": 9656 + }, + { + "epoch": 0.9647834557170688, + "grad_norm": 0.4166417862668272, + "learning_rate": 3.276592024055281e-08, + "loss": 0.4864, + "mean_token_accuracy": 0.9092056155204773, + "num_tokens": 216596649.0, + "step": 9657 + }, + { + "epoch": 0.9648833608072331, + "grad_norm": 0.568514649628697, + "learning_rate": 3.2581264538552035e-08, + "loss": 0.4943, + "mean_token_accuracy": 0.9075986742973328, + "num_tokens": 216678112.0, + "step": 9658 + }, + { + "epoch": 0.9649832658973975, + "grad_norm": 0.41866377460462434, + "learning_rate": 3.239712893015168e-08, + "loss": 0.4822, + "mean_token_accuracy": 0.9090056419372559, + "num_tokens": 216759708.0, + "step": 9659 + }, + { + "epoch": 0.9650831709875618, + "grad_norm": 0.4919027608999647, + "learning_rate": 3.221351343463186e-08, + "loss": 0.495, + "mean_token_accuracy": 0.90667325258255, + "num_tokens": 216841144.0, + "step": 9660 + }, + { + "epoch": 0.9651830760777261, + "grad_norm": 0.4504732761043909, + "learning_rate": 3.203041807121665e-08, + "loss": 0.4886, + "mean_token_accuracy": 0.908411055803299, + "num_tokens": 216922630.0, + "step": 9661 + }, + { + "epoch": 0.9652829811678905, + "grad_norm": 0.5462494347194496, + "learning_rate": 3.184784285907683e-08, + "loss": 0.4889, + "mean_token_accuracy": 0.9079912900924683, + "num_tokens": 217004117.0, + "step": 9662 + }, + { + "epoch": 0.9653828862580548, + "grad_norm": 0.4491274614961854, + "learning_rate": 3.1665787817327656e-08, + "loss": 0.4836, + "mean_token_accuracy": 0.9090801179409027, + "num_tokens": 217085682.0, + "step": 9663 + }, + { + "epoch": 0.9654827913482192, + "grad_norm": 0.5676482879621438, + "learning_rate": 3.14842529650311e-08, + "loss": 0.4798, + "mean_token_accuracy": 0.9088145792484283, + "num_tokens": 217167312.0, + "step": 9664 + }, + { + "epoch": 0.9655826964383836, + "grad_norm": 0.45813562860278173, + "learning_rate": 3.130323832119308e-08, + "loss": 0.4862, + "mean_token_accuracy": 0.9091743528842926, + "num_tokens": 217248872.0, + "step": 9665 + }, + { + "epoch": 0.9656826015285479, + "grad_norm": 0.44361753112229346, + "learning_rate": 3.112274390476733e-08, + "loss": 0.4947, + "mean_token_accuracy": 0.9081938564777374, + "num_tokens": 217330309.0, + "step": 9666 + }, + { + "epoch": 0.9657825066187122, + "grad_norm": 0.4228412749982529, + "learning_rate": 3.0942769734650936e-08, + "loss": 0.4904, + "mean_token_accuracy": 0.9069490134716034, + "num_tokens": 217411760.0, + "step": 9667 + }, + { + "epoch": 0.9658824117088766, + "grad_norm": 0.5148226138485843, + "learning_rate": 3.076331582968717e-08, + "loss": 0.492, + "mean_token_accuracy": 0.906779944896698, + "num_tokens": 217493225.0, + "step": 9668 + }, + { + "epoch": 0.9659823167990409, + "grad_norm": 0.4272194996157584, + "learning_rate": 3.058438220866544e-08, + "loss": 0.4822, + "mean_token_accuracy": 0.9095409214496613, + "num_tokens": 217574807.0, + "step": 9669 + }, + { + "epoch": 0.9660822218892052, + "grad_norm": 0.7383055589737509, + "learning_rate": 3.040596889032077e-08, + "loss": 0.4885, + "mean_token_accuracy": 0.9056343734264374, + "num_tokens": 217656331.0, + "step": 9670 + }, + { + "epoch": 0.9661821269793696, + "grad_norm": 0.4507503292577394, + "learning_rate": 3.02280758933321e-08, + "loss": 0.4816, + "mean_token_accuracy": 0.9060251414775848, + "num_tokens": 217737939.0, + "step": 9671 + }, + { + "epoch": 0.9662820320695339, + "grad_norm": 0.3889543502217398, + "learning_rate": 3.005070323632564e-08, + "loss": 0.4836, + "mean_token_accuracy": 0.909544050693512, + "num_tokens": 217819490.0, + "step": 9672 + }, + { + "epoch": 0.9663819371596983, + "grad_norm": 0.4413359322530998, + "learning_rate": 2.987385093787265e-08, + "loss": 0.4826, + "mean_token_accuracy": 0.9099606573581696, + "num_tokens": 217901076.0, + "step": 9673 + }, + { + "epoch": 0.9664818422498627, + "grad_norm": 0.7295262721205693, + "learning_rate": 2.9697519016488874e-08, + "loss": 0.4843, + "mean_token_accuracy": 0.9102160930633545, + "num_tokens": 217982629.0, + "step": 9674 + }, + { + "epoch": 0.966581747340027, + "grad_norm": 0.4446613502703691, + "learning_rate": 2.9521707490637873e-08, + "loss": 0.4885, + "mean_token_accuracy": 0.908316045999527, + "num_tokens": 218064133.0, + "step": 9675 + }, + { + "epoch": 0.9666816524301913, + "grad_norm": 0.40219744469382696, + "learning_rate": 2.934641637872604e-08, + "loss": 0.485, + "mean_token_accuracy": 0.9105742573738098, + "num_tokens": 218145671.0, + "step": 9676 + }, + { + "epoch": 0.9667815575203557, + "grad_norm": 0.5082524227194403, + "learning_rate": 2.917164569910702e-08, + "loss": 0.4872, + "mean_token_accuracy": 0.90879225730896, + "num_tokens": 218227190.0, + "step": 9677 + }, + { + "epoch": 0.96688146261052, + "grad_norm": 0.3980925557785915, + "learning_rate": 2.899739547007896e-08, + "loss": 0.4823, + "mean_token_accuracy": 0.9076718389987946, + "num_tokens": 218308754.0, + "step": 9678 + }, + { + "epoch": 0.9669813677006843, + "grad_norm": 0.4735362168360321, + "learning_rate": 2.882366570988726e-08, + "loss": 0.4908, + "mean_token_accuracy": 0.9087568521499634, + "num_tokens": 218390238.0, + "step": 9679 + }, + { + "epoch": 0.9670812727908487, + "grad_norm": 0.46338464443002975, + "learning_rate": 2.8650456436720154e-08, + "loss": 0.4885, + "mean_token_accuracy": 0.9086220264434814, + "num_tokens": 218471712.0, + "step": 9680 + }, + { + "epoch": 0.967181177881013, + "grad_norm": 0.43292333290033774, + "learning_rate": 2.8477767668713686e-08, + "loss": 0.4934, + "mean_token_accuracy": 0.9090156257152557, + "num_tokens": 218553141.0, + "step": 9681 + }, + { + "epoch": 0.9672810829711774, + "grad_norm": 0.5795719880881747, + "learning_rate": 2.8305599423948394e-08, + "loss": 0.4907, + "mean_token_accuracy": 0.9066676795482635, + "num_tokens": 218634591.0, + "step": 9682 + }, + { + "epoch": 0.9673809880613418, + "grad_norm": 0.41727828599039607, + "learning_rate": 2.8133951720450415e-08, + "loss": 0.4874, + "mean_token_accuracy": 0.9083666503429413, + "num_tokens": 218716086.0, + "step": 9683 + }, + { + "epoch": 0.9674808931515061, + "grad_norm": 0.4307114736705081, + "learning_rate": 2.7962824576191483e-08, + "loss": 0.4837, + "mean_token_accuracy": 0.9092889726161957, + "num_tokens": 218797637.0, + "step": 9684 + }, + { + "epoch": 0.9675807982416704, + "grad_norm": 0.5565077081386276, + "learning_rate": 2.7792218009088377e-08, + "loss": 0.4953, + "mean_token_accuracy": 0.9054078161716461, + "num_tokens": 218879093.0, + "step": 9685 + }, + { + "epoch": 0.9676807033318348, + "grad_norm": 0.5127068155637545, + "learning_rate": 2.762213203700459e-08, + "loss": 0.4882, + "mean_token_accuracy": 0.9065166711807251, + "num_tokens": 218960573.0, + "step": 9686 + }, + { + "epoch": 0.9677806084219991, + "grad_norm": 0.44827937259498424, + "learning_rate": 2.7452566677747538e-08, + "loss": 0.4885, + "mean_token_accuracy": 0.9078951179981232, + "num_tokens": 219042081.0, + "step": 9687 + }, + { + "epoch": 0.9678805135121634, + "grad_norm": 0.5309956156307034, + "learning_rate": 2.7283521949070802e-08, + "loss": 0.4913, + "mean_token_accuracy": 0.9102693200111389, + "num_tokens": 219123617.0, + "step": 9688 + }, + { + "epoch": 0.9679804186023278, + "grad_norm": 0.4697902693955412, + "learning_rate": 2.711499786867411e-08, + "loss": 0.4873, + "mean_token_accuracy": 0.9086889624595642, + "num_tokens": 219205148.0, + "step": 9689 + }, + { + "epoch": 0.9680803236924921, + "grad_norm": 0.44262006321316655, + "learning_rate": 2.6946994454202235e-08, + "loss": 0.4899, + "mean_token_accuracy": 0.9083280563354492, + "num_tokens": 219286640.0, + "step": 9690 + }, + { + "epoch": 0.9681802287826565, + "grad_norm": 0.4989959379836029, + "learning_rate": 2.6779511723244444e-08, + "loss": 0.4906, + "mean_token_accuracy": 0.9070774018764496, + "num_tokens": 219368130.0, + "step": 9691 + }, + { + "epoch": 0.9682801338728209, + "grad_norm": 0.43976699149398474, + "learning_rate": 2.6612549693337264e-08, + "loss": 0.4897, + "mean_token_accuracy": 0.9064203798770905, + "num_tokens": 219449644.0, + "step": 9692 + }, + { + "epoch": 0.9683800389629852, + "grad_norm": 0.5325189005410463, + "learning_rate": 2.64461083819606e-08, + "loss": 0.4873, + "mean_token_accuracy": 0.9115654230117798, + "num_tokens": 219531161.0, + "step": 9693 + }, + { + "epoch": 0.9684799440531495, + "grad_norm": 0.5122149406081147, + "learning_rate": 2.6280187806542735e-08, + "loss": 0.4926, + "mean_token_accuracy": 0.909501850605011, + "num_tokens": 219612599.0, + "step": 9694 + }, + { + "epoch": 0.9685798491433139, + "grad_norm": 0.41797809506126693, + "learning_rate": 2.6114787984454214e-08, + "loss": 0.487, + "mean_token_accuracy": 0.9086633324623108, + "num_tokens": 219694129.0, + "step": 9695 + }, + { + "epoch": 0.9686797542334782, + "grad_norm": 0.4075769303358937, + "learning_rate": 2.5949908933012857e-08, + "loss": 0.4918, + "mean_token_accuracy": 0.9099292159080505, + "num_tokens": 219775536.0, + "step": 9696 + }, + { + "epoch": 0.9687796593236425, + "grad_norm": 0.5291315973589017, + "learning_rate": 2.5785550669482072e-08, + "loss": 0.4941, + "mean_token_accuracy": 0.9071157574653625, + "num_tokens": 219856990.0, + "step": 9697 + }, + { + "epoch": 0.9688795644138068, + "grad_norm": 0.6012665778494094, + "learning_rate": 2.562171321107032e-08, + "loss": 0.492, + "mean_token_accuracy": 0.9071909189224243, + "num_tokens": 219938455.0, + "step": 9698 + }, + { + "epoch": 0.9689794695039712, + "grad_norm": 0.47656285317210034, + "learning_rate": 2.54583965749311e-08, + "loss": 0.489, + "mean_token_accuracy": 0.907319039106369, + "num_tokens": 220019951.0, + "step": 9699 + }, + { + "epoch": 0.9690793745941356, + "grad_norm": 0.4078109436017878, + "learning_rate": 2.529560077816462e-08, + "loss": 0.4866, + "mean_token_accuracy": 0.9082962274551392, + "num_tokens": 220101472.0, + "step": 9700 + }, + { + "epoch": 0.9691792796843, + "grad_norm": 0.459370379667136, + "learning_rate": 2.5133325837815027e-08, + "loss": 0.4881, + "mean_token_accuracy": 0.9068798124790192, + "num_tokens": 220183003.0, + "step": 9701 + }, + { + "epoch": 0.9692791847744643, + "grad_norm": 0.4772883145987322, + "learning_rate": 2.4971571770872614e-08, + "loss": 0.4913, + "mean_token_accuracy": 0.9049268662929535, + "num_tokens": 220264482.0, + "step": 9702 + }, + { + "epoch": 0.9693790898646286, + "grad_norm": 0.4002665988341926, + "learning_rate": 2.4810338594273287e-08, + "loss": 0.4859, + "mean_token_accuracy": 0.9085968434810638, + "num_tokens": 220346008.0, + "step": 9703 + }, + { + "epoch": 0.969478994954793, + "grad_norm": 0.4301958989987976, + "learning_rate": 2.464962632489909e-08, + "loss": 0.4856, + "mean_token_accuracy": 0.9102535247802734, + "num_tokens": 220427574.0, + "step": 9704 + }, + { + "epoch": 0.9695789000449573, + "grad_norm": 0.49226822132641346, + "learning_rate": 2.4489434979575456e-08, + "loss": 0.4894, + "mean_token_accuracy": 0.9086336195468903, + "num_tokens": 220509057.0, + "step": 9705 + }, + { + "epoch": 0.9696788051351216, + "grad_norm": 0.5656595860980745, + "learning_rate": 2.4329764575076186e-08, + "loss": 0.4874, + "mean_token_accuracy": 0.9056523144245148, + "num_tokens": 220590553.0, + "step": 9706 + }, + { + "epoch": 0.969778710225286, + "grad_norm": 0.6552191192931918, + "learning_rate": 2.4170615128117358e-08, + "loss": 0.4816, + "mean_token_accuracy": 0.9100930988788605, + "num_tokens": 220672177.0, + "step": 9707 + }, + { + "epoch": 0.9698786153154503, + "grad_norm": 0.4749933754113849, + "learning_rate": 2.4011986655362863e-08, + "loss": 0.491, + "mean_token_accuracy": 0.9058572947978973, + "num_tokens": 220753657.0, + "step": 9708 + }, + { + "epoch": 0.9699785204056147, + "grad_norm": 0.4273940235567518, + "learning_rate": 2.385387917342108e-08, + "loss": 0.4811, + "mean_token_accuracy": 0.9092278182506561, + "num_tokens": 220835235.0, + "step": 9709 + }, + { + "epoch": 0.970078425495779, + "grad_norm": 0.4407808677862937, + "learning_rate": 2.3696292698845992e-08, + "loss": 0.4891, + "mean_token_accuracy": 0.9077393114566803, + "num_tokens": 220916743.0, + "step": 9710 + }, + { + "epoch": 0.9701783305859434, + "grad_norm": 0.6156299675839223, + "learning_rate": 2.3539227248137176e-08, + "loss": 0.4916, + "mean_token_accuracy": 0.9080732464790344, + "num_tokens": 220998236.0, + "step": 9711 + }, + { + "epoch": 0.9702782356761077, + "grad_norm": 0.43526097987388723, + "learning_rate": 2.3382682837739258e-08, + "loss": 0.492, + "mean_token_accuracy": 0.9081701636314392, + "num_tokens": 221079683.0, + "step": 9712 + }, + { + "epoch": 0.970378140766272, + "grad_norm": 0.576692465756206, + "learning_rate": 2.3226659484043013e-08, + "loss": 0.4866, + "mean_token_accuracy": 0.9087759554386139, + "num_tokens": 221161216.0, + "step": 9713 + }, + { + "epoch": 0.9704780458564364, + "grad_norm": 0.3995377651548441, + "learning_rate": 2.3071157203384263e-08, + "loss": 0.493, + "mean_token_accuracy": 0.9079346060752869, + "num_tokens": 221242651.0, + "step": 9714 + }, + { + "epoch": 0.9705779509466007, + "grad_norm": 0.41937582692096614, + "learning_rate": 2.2916176012043324e-08, + "loss": 0.4868, + "mean_token_accuracy": 0.9097385406494141, + "num_tokens": 221324165.0, + "step": 9715 + }, + { + "epoch": 0.970677856036765, + "grad_norm": 0.4446191024132426, + "learning_rate": 2.2761715926248316e-08, + "loss": 0.4881, + "mean_token_accuracy": 0.9082393050193787, + "num_tokens": 221405690.0, + "step": 9716 + }, + { + "epoch": 0.9707777611269294, + "grad_norm": 0.4478131074814327, + "learning_rate": 2.2607776962170203e-08, + "loss": 0.4895, + "mean_token_accuracy": 0.9088994264602661, + "num_tokens": 221487178.0, + "step": 9717 + }, + { + "epoch": 0.9708776662170938, + "grad_norm": 0.495995925542865, + "learning_rate": 2.245435913592775e-08, + "loss": 0.4858, + "mean_token_accuracy": 0.9086428880691528, + "num_tokens": 221568722.0, + "step": 9718 + }, + { + "epoch": 0.9709775713072581, + "grad_norm": 0.4536237296546111, + "learning_rate": 2.230146246358256e-08, + "loss": 0.4876, + "mean_token_accuracy": 0.9069240689277649, + "num_tokens": 221650194.0, + "step": 9719 + }, + { + "epoch": 0.9710774763974225, + "grad_norm": 0.6126273542320523, + "learning_rate": 2.2149086961143485e-08, + "loss": 0.4863, + "mean_token_accuracy": 0.9075214862823486, + "num_tokens": 221731760.0, + "step": 9720 + }, + { + "epoch": 0.9711773814875868, + "grad_norm": 0.5247632398955592, + "learning_rate": 2.199723264456499e-08, + "loss": 0.4881, + "mean_token_accuracy": 0.9093181788921356, + "num_tokens": 221813281.0, + "step": 9721 + }, + { + "epoch": 0.9712772865777511, + "grad_norm": 0.47568606035626915, + "learning_rate": 2.1845899529745473e-08, + "loss": 0.4864, + "mean_token_accuracy": 0.9063396155834198, + "num_tokens": 221894834.0, + "step": 9722 + }, + { + "epoch": 0.9713771916679155, + "grad_norm": 0.4284206815261236, + "learning_rate": 2.1695087632530586e-08, + "loss": 0.4883, + "mean_token_accuracy": 0.9087855219841003, + "num_tokens": 221976319.0, + "step": 9723 + }, + { + "epoch": 0.9714770967580798, + "grad_norm": 0.4922777686970537, + "learning_rate": 2.1544796968709925e-08, + "loss": 0.4917, + "mean_token_accuracy": 0.9083262085914612, + "num_tokens": 222057764.0, + "step": 9724 + }, + { + "epoch": 0.9715770018482441, + "grad_norm": 0.48226266334431017, + "learning_rate": 2.1395027554019233e-08, + "loss": 0.4837, + "mean_token_accuracy": 0.9098677337169647, + "num_tokens": 222139354.0, + "step": 9725 + }, + { + "epoch": 0.9716769069384085, + "grad_norm": 0.43758567699674, + "learning_rate": 2.12457794041393e-08, + "loss": 0.4857, + "mean_token_accuracy": 0.9067687392234802, + "num_tokens": 222220918.0, + "step": 9726 + }, + { + "epoch": 0.9717768120285728, + "grad_norm": 0.4385712719089713, + "learning_rate": 2.1097052534696516e-08, + "loss": 0.4849, + "mean_token_accuracy": 0.9082051813602448, + "num_tokens": 222302464.0, + "step": 9727 + }, + { + "epoch": 0.9718767171187372, + "grad_norm": 0.44075984066045226, + "learning_rate": 2.0948846961263425e-08, + "loss": 0.4833, + "mean_token_accuracy": 0.9072590470314026, + "num_tokens": 222384029.0, + "step": 9728 + }, + { + "epoch": 0.9719766222089016, + "grad_norm": 0.41971129809435254, + "learning_rate": 2.0801162699356502e-08, + "loss": 0.4884, + "mean_token_accuracy": 0.9068707823753357, + "num_tokens": 222465549.0, + "step": 9729 + }, + { + "epoch": 0.9720765272990659, + "grad_norm": 0.4143665257683731, + "learning_rate": 2.0653999764438937e-08, + "loss": 0.4933, + "mean_token_accuracy": 0.907900869846344, + "num_tokens": 222546993.0, + "step": 9730 + }, + { + "epoch": 0.9721764323892302, + "grad_norm": 0.5127289745866935, + "learning_rate": 2.05073581719184e-08, + "loss": 0.4921, + "mean_token_accuracy": 0.9066234230995178, + "num_tokens": 222628408.0, + "step": 9731 + }, + { + "epoch": 0.9722763374793946, + "grad_norm": 0.5889216241727677, + "learning_rate": 2.0361237937148725e-08, + "loss": 0.4845, + "mean_token_accuracy": 0.9112353920936584, + "num_tokens": 222709984.0, + "step": 9732 + }, + { + "epoch": 0.9723762425695589, + "grad_norm": 0.6024151061524116, + "learning_rate": 2.0215639075429337e-08, + "loss": 0.4958, + "mean_token_accuracy": 0.9074826240539551, + "num_tokens": 222791394.0, + "step": 9733 + }, + { + "epoch": 0.9724761476597232, + "grad_norm": 0.42147361252092924, + "learning_rate": 2.007056160200305e-08, + "loss": 0.4904, + "mean_token_accuracy": 0.911077231168747, + "num_tokens": 222872866.0, + "step": 9734 + }, + { + "epoch": 0.9725760527498876, + "grad_norm": 0.44076509692684496, + "learning_rate": 1.9926005532061033e-08, + "loss": 0.4924, + "mean_token_accuracy": 0.9067639410495758, + "num_tokens": 222954344.0, + "step": 9735 + }, + { + "epoch": 0.9726759578400519, + "grad_norm": 0.4605541751367253, + "learning_rate": 1.978197088073841e-08, + "loss": 0.4902, + "mean_token_accuracy": 0.9102247655391693, + "num_tokens": 223035800.0, + "step": 9736 + }, + { + "epoch": 0.9727758629302163, + "grad_norm": 0.5247281624543959, + "learning_rate": 1.9638457663114785e-08, + "loss": 0.4857, + "mean_token_accuracy": 0.9068134129047394, + "num_tokens": 223117348.0, + "step": 9737 + }, + { + "epoch": 0.9728757680203807, + "grad_norm": 0.5140596490043116, + "learning_rate": 1.949546589421647e-08, + "loss": 0.4841, + "mean_token_accuracy": 0.9096599519252777, + "num_tokens": 223198909.0, + "step": 9738 + }, + { + "epoch": 0.972975673110545, + "grad_norm": 0.4439868475533188, + "learning_rate": 1.935299558901538e-08, + "loss": 0.4854, + "mean_token_accuracy": 0.9073388576507568, + "num_tokens": 223280445.0, + "step": 9739 + }, + { + "epoch": 0.9730755782007093, + "grad_norm": 0.5848061905221702, + "learning_rate": 1.921104676242791e-08, + "loss": 0.4905, + "mean_token_accuracy": 0.9063684940338135, + "num_tokens": 223361965.0, + "step": 9740 + }, + { + "epoch": 0.9731754832908737, + "grad_norm": 0.4267952507879111, + "learning_rate": 1.906961942931662e-08, + "loss": 0.4854, + "mean_token_accuracy": 0.9096954464912415, + "num_tokens": 223443540.0, + "step": 9741 + }, + { + "epoch": 0.973275388381038, + "grad_norm": 0.6137106174632599, + "learning_rate": 1.8928713604488558e-08, + "loss": 0.4906, + "mean_token_accuracy": 0.9058578908443451, + "num_tokens": 223525004.0, + "step": 9742 + }, + { + "epoch": 0.9733752934712023, + "grad_norm": 0.4869193574408435, + "learning_rate": 1.878832930269636e-08, + "loss": 0.4874, + "mean_token_accuracy": 0.9104093611240387, + "num_tokens": 223606540.0, + "step": 9743 + }, + { + "epoch": 0.9734751985613667, + "grad_norm": 0.45671872019147064, + "learning_rate": 1.8648466538639387e-08, + "loss": 0.487, + "mean_token_accuracy": 0.9102073907852173, + "num_tokens": 223688082.0, + "step": 9744 + }, + { + "epoch": 0.973575103651531, + "grad_norm": 0.4668223790587877, + "learning_rate": 1.850912532696092e-08, + "loss": 0.49, + "mean_token_accuracy": 0.9075271189212799, + "num_tokens": 223769560.0, + "step": 9745 + }, + { + "epoch": 0.9736750087416954, + "grad_norm": 0.45545066591055045, + "learning_rate": 1.83703056822504e-08, + "loss": 0.4834, + "mean_token_accuracy": 0.9087472558021545, + "num_tokens": 223851118.0, + "step": 9746 + }, + { + "epoch": 0.9737749138318598, + "grad_norm": 0.4049626735772323, + "learning_rate": 1.8232007619041757e-08, + "loss": 0.4873, + "mean_token_accuracy": 0.9090221226215363, + "num_tokens": 223932644.0, + "step": 9747 + }, + { + "epoch": 0.9738748189220241, + "grad_norm": 0.47662237497038534, + "learning_rate": 1.8094231151816187e-08, + "loss": 0.4871, + "mean_token_accuracy": 0.9067632853984833, + "num_tokens": 224014202.0, + "step": 9748 + }, + { + "epoch": 0.9739747240121884, + "grad_norm": 0.580020192122432, + "learning_rate": 1.7956976294997153e-08, + "loss": 0.4924, + "mean_token_accuracy": 0.9068188071250916, + "num_tokens": 224095647.0, + "step": 9749 + }, + { + "epoch": 0.9740746291023528, + "grad_norm": 0.5308700320409449, + "learning_rate": 1.7820243062957042e-08, + "loss": 0.4955, + "mean_token_accuracy": 0.9099073112010956, + "num_tokens": 224177052.0, + "step": 9750 + }, + { + "epoch": 0.9741745341925171, + "grad_norm": 0.4455353635030578, + "learning_rate": 1.768403147001052e-08, + "loss": 0.4873, + "mean_token_accuracy": 0.9093299508094788, + "num_tokens": 224258583.0, + "step": 9751 + }, + { + "epoch": 0.9742744392826814, + "grad_norm": 0.5513059730519149, + "learning_rate": 1.754834153042062e-08, + "loss": 0.4863, + "mean_token_accuracy": 0.9073232114315033, + "num_tokens": 224340154.0, + "step": 9752 + }, + { + "epoch": 0.9743743443728458, + "grad_norm": 0.6665191312263997, + "learning_rate": 1.74131732583932e-08, + "loss": 0.4972, + "mean_token_accuracy": 0.90775465965271, + "num_tokens": 224421523.0, + "step": 9753 + }, + { + "epoch": 0.9744742494630101, + "grad_norm": 0.4551113095875129, + "learning_rate": 1.7278526668080276e-08, + "loss": 0.4845, + "mean_token_accuracy": 0.9090912938117981, + "num_tokens": 224503091.0, + "step": 9754 + }, + { + "epoch": 0.9745741545531745, + "grad_norm": 0.4874282265190171, + "learning_rate": 1.7144401773580566e-08, + "loss": 0.4861, + "mean_token_accuracy": 0.9066859185695648, + "num_tokens": 224584627.0, + "step": 9755 + }, + { + "epoch": 0.9746740596433389, + "grad_norm": 0.6389567582117793, + "learning_rate": 1.7010798588936173e-08, + "loss": 0.4835, + "mean_token_accuracy": 0.9091262221336365, + "num_tokens": 224666230.0, + "step": 9756 + }, + { + "epoch": 0.9747739647335032, + "grad_norm": 0.6068751789488366, + "learning_rate": 1.6877717128135908e-08, + "loss": 0.493, + "mean_token_accuracy": 0.9044801294803619, + "num_tokens": 224747703.0, + "step": 9757 + }, + { + "epoch": 0.9748738698236675, + "grad_norm": 0.4689185189130463, + "learning_rate": 1.674515740511362e-08, + "loss": 0.483, + "mean_token_accuracy": 0.908018559217453, + "num_tokens": 224829298.0, + "step": 9758 + }, + { + "epoch": 0.9749737749138319, + "grad_norm": 0.4841077596417829, + "learning_rate": 1.661311943374766e-08, + "loss": 0.4806, + "mean_token_accuracy": 0.9081923961639404, + "num_tokens": 224910929.0, + "step": 9759 + }, + { + "epoch": 0.9750736800039962, + "grad_norm": 0.4938405582092139, + "learning_rate": 1.6481603227863074e-08, + "loss": 0.4878, + "mean_token_accuracy": 0.9074394106864929, + "num_tokens": 224992443.0, + "step": 9760 + }, + { + "epoch": 0.9751735850941605, + "grad_norm": 0.5599629261144411, + "learning_rate": 1.635060880122996e-08, + "loss": 0.4923, + "mean_token_accuracy": 0.9087142050266266, + "num_tokens": 225073881.0, + "step": 9761 + }, + { + "epoch": 0.9752734901843249, + "grad_norm": 0.503311304157966, + "learning_rate": 1.622013616756346e-08, + "loss": 0.4862, + "mean_token_accuracy": 0.909568190574646, + "num_tokens": 225155390.0, + "step": 9762 + }, + { + "epoch": 0.9753733952744892, + "grad_norm": 0.5139726172721144, + "learning_rate": 1.609018534052431e-08, + "loss": 0.4858, + "mean_token_accuracy": 0.9060748517513275, + "num_tokens": 225236939.0, + "step": 9763 + }, + { + "epoch": 0.9754733003646536, + "grad_norm": 0.4292296185938996, + "learning_rate": 1.596075633371774e-08, + "loss": 0.4874, + "mean_token_accuracy": 0.9065966308116913, + "num_tokens": 225318449.0, + "step": 9764 + }, + { + "epoch": 0.975573205454818, + "grad_norm": 0.564556303410169, + "learning_rate": 1.583184916069569e-08, + "loss": 0.4856, + "mean_token_accuracy": 0.9073008596897125, + "num_tokens": 225400038.0, + "step": 9765 + }, + { + "epoch": 0.9756731105449823, + "grad_norm": 0.4383178057534281, + "learning_rate": 1.5703463834955135e-08, + "loss": 0.4852, + "mean_token_accuracy": 0.9085744023323059, + "num_tokens": 225481571.0, + "step": 9766 + }, + { + "epoch": 0.9757730156351466, + "grad_norm": 0.5971091048583032, + "learning_rate": 1.5575600369937548e-08, + "loss": 0.482, + "mean_token_accuracy": 0.9091520011425018, + "num_tokens": 225563175.0, + "step": 9767 + }, + { + "epoch": 0.975872920725311, + "grad_norm": 0.40009517604346345, + "learning_rate": 1.5448258779030555e-08, + "loss": 0.4912, + "mean_token_accuracy": 0.9087123572826385, + "num_tokens": 225644599.0, + "step": 9768 + }, + { + "epoch": 0.9759728258154753, + "grad_norm": 0.44086577338425526, + "learning_rate": 1.532143907556738e-08, + "loss": 0.4915, + "mean_token_accuracy": 0.907945990562439, + "num_tokens": 225726046.0, + "step": 9769 + }, + { + "epoch": 0.9760727309056396, + "grad_norm": 0.4098392010393972, + "learning_rate": 1.5195141272825732e-08, + "loss": 0.4852, + "mean_token_accuracy": 0.9091401994228363, + "num_tokens": 225807584.0, + "step": 9770 + }, + { + "epoch": 0.976172635995804, + "grad_norm": 0.42170275096580423, + "learning_rate": 1.5069365384028368e-08, + "loss": 0.4859, + "mean_token_accuracy": 0.9103630781173706, + "num_tokens": 225889117.0, + "step": 9771 + }, + { + "epoch": 0.9762725410859683, + "grad_norm": 0.43654783275311293, + "learning_rate": 1.4944111422345864e-08, + "loss": 0.4891, + "mean_token_accuracy": 0.9087845683097839, + "num_tokens": 225970581.0, + "step": 9772 + }, + { + "epoch": 0.9763724461761327, + "grad_norm": 0.4608268019377365, + "learning_rate": 1.4819379400891066e-08, + "loss": 0.4889, + "mean_token_accuracy": 0.9106245636940002, + "num_tokens": 226052090.0, + "step": 9773 + }, + { + "epoch": 0.9764723512662971, + "grad_norm": 0.4501260930744608, + "learning_rate": 1.469516933272408e-08, + "loss": 0.489, + "mean_token_accuracy": 0.9065529704093933, + "num_tokens": 226133552.0, + "step": 9774 + }, + { + "epoch": 0.9765722563564614, + "grad_norm": 0.49712819866526736, + "learning_rate": 1.457148123085006e-08, + "loss": 0.4891, + "mean_token_accuracy": 0.9078895747661591, + "num_tokens": 226215095.0, + "step": 9775 + }, + { + "epoch": 0.9766721614466257, + "grad_norm": 0.47030273092901426, + "learning_rate": 1.4448315108218647e-08, + "loss": 0.4873, + "mean_token_accuracy": 0.9085085988044739, + "num_tokens": 226296590.0, + "step": 9776 + }, + { + "epoch": 0.97677206653679, + "grad_norm": 0.4075276062610213, + "learning_rate": 1.4325670977725637e-08, + "loss": 0.4865, + "mean_token_accuracy": 0.9086578190326691, + "num_tokens": 226378086.0, + "step": 9777 + }, + { + "epoch": 0.9768719716269544, + "grad_norm": 0.3878333199473236, + "learning_rate": 1.4203548852211868e-08, + "loss": 0.4815, + "mean_token_accuracy": 0.9107424914836884, + "num_tokens": 226459677.0, + "step": 9778 + }, + { + "epoch": 0.9769718767171187, + "grad_norm": 0.4052431443161988, + "learning_rate": 1.4081948744464336e-08, + "loss": 0.4882, + "mean_token_accuracy": 0.9080460369586945, + "num_tokens": 226541176.0, + "step": 9779 + }, + { + "epoch": 0.977071781807283, + "grad_norm": 0.47764350197212796, + "learning_rate": 1.3960870667214521e-08, + "loss": 0.4819, + "mean_token_accuracy": 0.9099103212356567, + "num_tokens": 226622780.0, + "step": 9780 + }, + { + "epoch": 0.9771716868974474, + "grad_norm": 0.7931676806977351, + "learning_rate": 1.3840314633138397e-08, + "loss": 0.486, + "mean_token_accuracy": 0.9075196087360382, + "num_tokens": 226704376.0, + "step": 9781 + }, + { + "epoch": 0.9772715919876118, + "grad_norm": 0.47257625939547654, + "learning_rate": 1.3720280654859197e-08, + "loss": 0.4911, + "mean_token_accuracy": 0.905034989118576, + "num_tokens": 226785840.0, + "step": 9782 + }, + { + "epoch": 0.9773714970777762, + "grad_norm": 0.4732571066061264, + "learning_rate": 1.3600768744944648e-08, + "loss": 0.4904, + "mean_token_accuracy": 0.9069404006004333, + "num_tokens": 226867331.0, + "step": 9783 + }, + { + "epoch": 0.9774714021679405, + "grad_norm": 0.37460815528802266, + "learning_rate": 1.3481778915907517e-08, + "loss": 0.4798, + "mean_token_accuracy": 0.9125819206237793, + "num_tokens": 226948952.0, + "step": 9784 + }, + { + "epoch": 0.9775713072581048, + "grad_norm": 0.6062433198106086, + "learning_rate": 1.3363311180206174e-08, + "loss": 0.4888, + "mean_token_accuracy": 0.908545047044754, + "num_tokens": 227030461.0, + "step": 9785 + }, + { + "epoch": 0.9776712123482691, + "grad_norm": 0.573489889868805, + "learning_rate": 1.3245365550244583e-08, + "loss": 0.4897, + "mean_token_accuracy": 0.9062381088733673, + "num_tokens": 227111960.0, + "step": 9786 + }, + { + "epoch": 0.9777711174384335, + "grad_norm": 0.5016198356848512, + "learning_rate": 1.31279420383712e-08, + "loss": 0.4898, + "mean_token_accuracy": 0.9059098958969116, + "num_tokens": 227193470.0, + "step": 9787 + }, + { + "epoch": 0.9778710225285978, + "grad_norm": 0.4092340663069826, + "learning_rate": 1.3011040656880636e-08, + "loss": 0.4863, + "mean_token_accuracy": 0.9073299169540405, + "num_tokens": 227274986.0, + "step": 9788 + }, + { + "epoch": 0.9779709276187621, + "grad_norm": 0.4185744873320975, + "learning_rate": 1.2894661418012545e-08, + "loss": 0.49, + "mean_token_accuracy": 0.909152090549469, + "num_tokens": 227356448.0, + "step": 9789 + }, + { + "epoch": 0.9780708327089265, + "grad_norm": 0.41816145538674054, + "learning_rate": 1.2778804333952178e-08, + "loss": 0.4801, + "mean_token_accuracy": 0.9098292589187622, + "num_tokens": 227438049.0, + "step": 9790 + }, + { + "epoch": 0.9781707377990909, + "grad_norm": 0.42209243481967756, + "learning_rate": 1.2663469416829833e-08, + "loss": 0.4904, + "mean_token_accuracy": 0.9095204770565033, + "num_tokens": 227519509.0, + "step": 9791 + }, + { + "epoch": 0.9782706428892552, + "grad_norm": 0.40089155707620583, + "learning_rate": 1.2548656678721404e-08, + "loss": 0.4916, + "mean_token_accuracy": 0.906877189874649, + "num_tokens": 227600933.0, + "step": 9792 + }, + { + "epoch": 0.9783705479794196, + "grad_norm": 0.9298687866650339, + "learning_rate": 1.2434366131646724e-08, + "loss": 0.4869, + "mean_token_accuracy": 0.9094026684761047, + "num_tokens": 227682473.0, + "step": 9793 + }, + { + "epoch": 0.9784704530695839, + "grad_norm": 0.4564121083973904, + "learning_rate": 1.2320597787573996e-08, + "loss": 0.4882, + "mean_token_accuracy": 0.9089766442775726, + "num_tokens": 227763964.0, + "step": 9794 + }, + { + "epoch": 0.9785703581597482, + "grad_norm": 0.49622087800791165, + "learning_rate": 1.2207351658413135e-08, + "loss": 0.4902, + "mean_token_accuracy": 0.9078401625156403, + "num_tokens": 227845457.0, + "step": 9795 + }, + { + "epoch": 0.9786702632499126, + "grad_norm": 0.4863209791030773, + "learning_rate": 1.2094627756021881e-08, + "loss": 0.4868, + "mean_token_accuracy": 0.9085021018981934, + "num_tokens": 227926969.0, + "step": 9796 + }, + { + "epoch": 0.9787701683400769, + "grad_norm": 0.501921237579695, + "learning_rate": 1.1982426092203015e-08, + "loss": 0.4911, + "mean_token_accuracy": 0.9031190574169159, + "num_tokens": 228008497.0, + "step": 9797 + }, + { + "epoch": 0.9788700734302412, + "grad_norm": 0.4765900434453293, + "learning_rate": 1.1870746678703249e-08, + "loss": 0.4883, + "mean_token_accuracy": 0.9090091586112976, + "num_tokens": 228090013.0, + "step": 9798 + }, + { + "epoch": 0.9789699785204056, + "grad_norm": 0.3931990122976912, + "learning_rate": 1.1759589527216008e-08, + "loss": 0.4839, + "mean_token_accuracy": 0.9089590311050415, + "num_tokens": 228171571.0, + "step": 9799 + }, + { + "epoch": 0.97906988361057, + "grad_norm": 0.4395994780648408, + "learning_rate": 1.1648954649379208e-08, + "loss": 0.4916, + "mean_token_accuracy": 0.9079615473747253, + "num_tokens": 228253042.0, + "step": 9800 + }, + { + "epoch": 0.9791697887007343, + "grad_norm": 0.5713315064796551, + "learning_rate": 1.1538842056777466e-08, + "loss": 0.4857, + "mean_token_accuracy": 0.9062281847000122, + "num_tokens": 228334627.0, + "step": 9801 + }, + { + "epoch": 0.9792696937908987, + "grad_norm": 0.4784731646256849, + "learning_rate": 1.1429251760938232e-08, + "loss": 0.4883, + "mean_token_accuracy": 0.9082129597663879, + "num_tokens": 228416144.0, + "step": 9802 + }, + { + "epoch": 0.979369598881063, + "grad_norm": 0.5304569350212197, + "learning_rate": 1.1320183773336768e-08, + "loss": 0.4872, + "mean_token_accuracy": 0.9094142913818359, + "num_tokens": 228497643.0, + "step": 9803 + }, + { + "epoch": 0.9794695039712273, + "grad_norm": 0.41865977843557683, + "learning_rate": 1.121163810539172e-08, + "loss": 0.485, + "mean_token_accuracy": 0.9084835946559906, + "num_tokens": 228579193.0, + "step": 9804 + }, + { + "epoch": 0.9795694090613917, + "grad_norm": 0.5593518201267247, + "learning_rate": 1.1103614768468996e-08, + "loss": 0.4868, + "mean_token_accuracy": 0.9090431332588196, + "num_tokens": 228660714.0, + "step": 9805 + }, + { + "epoch": 0.979669314151556, + "grad_norm": 0.41773110498372545, + "learning_rate": 1.099611377387788e-08, + "loss": 0.4882, + "mean_token_accuracy": 0.9073678553104401, + "num_tokens": 228742234.0, + "step": 9806 + }, + { + "epoch": 0.9797692192417203, + "grad_norm": 0.4143674340362813, + "learning_rate": 1.0889135132874374e-08, + "loss": 0.4838, + "mean_token_accuracy": 0.9088511765003204, + "num_tokens": 228823813.0, + "step": 9807 + }, + { + "epoch": 0.9798691243318847, + "grad_norm": 0.4499974376793871, + "learning_rate": 1.0782678856658957e-08, + "loss": 0.4825, + "mean_token_accuracy": 0.908403754234314, + "num_tokens": 228905440.0, + "step": 9808 + }, + { + "epoch": 0.979969029422049, + "grad_norm": 0.44433277799320214, + "learning_rate": 1.0676744956377716e-08, + "loss": 0.4858, + "mean_token_accuracy": 0.9096198379993439, + "num_tokens": 228986974.0, + "step": 9809 + }, + { + "epoch": 0.9800689345122134, + "grad_norm": 0.4967680386524198, + "learning_rate": 1.0571333443121779e-08, + "loss": 0.482, + "mean_token_accuracy": 0.9090924263000488, + "num_tokens": 229068597.0, + "step": 9810 + }, + { + "epoch": 0.9801688396023778, + "grad_norm": 0.4753369825305388, + "learning_rate": 1.0466444327928427e-08, + "loss": 0.4937, + "mean_token_accuracy": 0.9053904414176941, + "num_tokens": 229150051.0, + "step": 9811 + }, + { + "epoch": 0.9802687446925421, + "grad_norm": 0.4372593344180041, + "learning_rate": 1.0362077621779431e-08, + "loss": 0.4906, + "mean_token_accuracy": 0.9074959754943848, + "num_tokens": 229231518.0, + "step": 9812 + }, + { + "epoch": 0.9803686497827064, + "grad_norm": 1.2722147520773979, + "learning_rate": 1.0258233335601609e-08, + "loss": 0.487, + "mean_token_accuracy": 0.9100510776042938, + "num_tokens": 229313024.0, + "step": 9813 + }, + { + "epoch": 0.9804685548728708, + "grad_norm": 0.5505408881279092, + "learning_rate": 1.015491148026848e-08, + "loss": 0.4918, + "mean_token_accuracy": 0.9082771241664886, + "num_tokens": 229394499.0, + "step": 9814 + }, + { + "epoch": 0.9805684599630351, + "grad_norm": 0.41067724928210325, + "learning_rate": 1.0052112066596954e-08, + "loss": 0.4922, + "mean_token_accuracy": 0.908738523721695, + "num_tokens": 229475979.0, + "step": 9815 + }, + { + "epoch": 0.9806683650531994, + "grad_norm": 0.45418894741748816, + "learning_rate": 9.94983510535119e-09, + "loss": 0.4882, + "mean_token_accuracy": 0.911241352558136, + "num_tokens": 229557466.0, + "step": 9816 + }, + { + "epoch": 0.9807682701433638, + "grad_norm": 0.45490324975532104, + "learning_rate": 9.848080607239297e-09, + "loss": 0.4852, + "mean_token_accuracy": 0.9088453352451324, + "num_tokens": 229639044.0, + "step": 9817 + }, + { + "epoch": 0.9808681752335281, + "grad_norm": 0.3925953014500071, + "learning_rate": 9.746848582914415e-09, + "loss": 0.4872, + "mean_token_accuracy": 0.9086017608642578, + "num_tokens": 229720590.0, + "step": 9818 + }, + { + "epoch": 0.9809680803236925, + "grad_norm": 0.43656633688272806, + "learning_rate": 9.646139042976399e-09, + "loss": 0.4865, + "mean_token_accuracy": 0.9088721573352814, + "num_tokens": 229802115.0, + "step": 9819 + }, + { + "epoch": 0.9810679854138569, + "grad_norm": 0.4471056689925334, + "learning_rate": 9.545951997969594e-09, + "loss": 0.4846, + "mean_token_accuracy": 0.9075141251087189, + "num_tokens": 229883680.0, + "step": 9820 + }, + { + "epoch": 0.9811678905040212, + "grad_norm": 0.46370975473373355, + "learning_rate": 9.446287458383385e-09, + "loss": 0.4903, + "mean_token_accuracy": 0.9080482423305511, + "num_tokens": 229965181.0, + "step": 9821 + }, + { + "epoch": 0.9812677955941855, + "grad_norm": 0.48997015771828134, + "learning_rate": 9.34714543465276e-09, + "loss": 0.4926, + "mean_token_accuracy": 0.9051003158092499, + "num_tokens": 230046652.0, + "step": 9822 + }, + { + "epoch": 0.9813677006843499, + "grad_norm": 0.4780986372083711, + "learning_rate": 9.248525937158303e-09, + "loss": 0.4885, + "mean_token_accuracy": 0.9099593162536621, + "num_tokens": 230128164.0, + "step": 9823 + }, + { + "epoch": 0.9814676057745142, + "grad_norm": 0.4193363632244986, + "learning_rate": 9.150428976225089e-09, + "loss": 0.4865, + "mean_token_accuracy": 0.9082150459289551, + "num_tokens": 230209688.0, + "step": 9824 + }, + { + "epoch": 0.9815675108646785, + "grad_norm": 0.4315705138118333, + "learning_rate": 9.052854562124346e-09, + "loss": 0.4872, + "mean_token_accuracy": 0.9069997370243073, + "num_tokens": 230291208.0, + "step": 9825 + }, + { + "epoch": 0.9816674159548429, + "grad_norm": 0.4366827460439371, + "learning_rate": 8.955802705072347e-09, + "loss": 0.4853, + "mean_token_accuracy": 0.9080775380134583, + "num_tokens": 230372766.0, + "step": 9826 + }, + { + "epoch": 0.9817673210450072, + "grad_norm": 0.45004584898723576, + "learning_rate": 8.859273415230406e-09, + "loss": 0.4911, + "mean_token_accuracy": 0.9066131711006165, + "num_tokens": 230454242.0, + "step": 9827 + }, + { + "epoch": 0.9818672261351716, + "grad_norm": 0.47247064191733545, + "learning_rate": 8.763266702704887e-09, + "loss": 0.4859, + "mean_token_accuracy": 0.9114541411399841, + "num_tokens": 230535757.0, + "step": 9828 + }, + { + "epoch": 0.981967131225336, + "grad_norm": 0.4813685004900967, + "learning_rate": 8.667782577547746e-09, + "loss": 0.488, + "mean_token_accuracy": 0.9073048233985901, + "num_tokens": 230617281.0, + "step": 9829 + }, + { + "epoch": 0.9820670363155003, + "grad_norm": 0.4853618677672454, + "learning_rate": 8.572821049757097e-09, + "loss": 0.486, + "mean_token_accuracy": 0.9103251695632935, + "num_tokens": 230698831.0, + "step": 9830 + }, + { + "epoch": 0.9821669414056646, + "grad_norm": 0.4291882599690643, + "learning_rate": 8.478382129274432e-09, + "loss": 0.4899, + "mean_token_accuracy": 0.9070394039154053, + "num_tokens": 230780288.0, + "step": 9831 + }, + { + "epoch": 0.982266846495829, + "grad_norm": 0.5580492849854977, + "learning_rate": 8.384465825987952e-09, + "loss": 0.4894, + "mean_token_accuracy": 0.9055090248584747, + "num_tokens": 230861769.0, + "step": 9832 + }, + { + "epoch": 0.9823667515859933, + "grad_norm": 0.48430339856358307, + "learning_rate": 8.291072149731461e-09, + "loss": 0.4899, + "mean_token_accuracy": 0.9099239408969879, + "num_tokens": 230943263.0, + "step": 9833 + }, + { + "epoch": 0.9824666566761576, + "grad_norm": 0.5161914271058704, + "learning_rate": 8.198201110282688e-09, + "loss": 0.4858, + "mean_token_accuracy": 0.9094602763652802, + "num_tokens": 231024796.0, + "step": 9834 + }, + { + "epoch": 0.982566561766322, + "grad_norm": 0.4681793596654799, + "learning_rate": 8.105852717365525e-09, + "loss": 0.4867, + "mean_token_accuracy": 0.9060107469558716, + "num_tokens": 231106320.0, + "step": 9835 + }, + { + "epoch": 0.9826664668564863, + "grad_norm": 0.42819941592979455, + "learning_rate": 8.014026980648904e-09, + "loss": 0.4818, + "mean_token_accuracy": 0.9072067141532898, + "num_tokens": 231187877.0, + "step": 9836 + }, + { + "epoch": 0.9827663719466507, + "grad_norm": 0.4251388745445472, + "learning_rate": 7.922723909746799e-09, + "loss": 0.4915, + "mean_token_accuracy": 0.9057195484638214, + "num_tokens": 231269357.0, + "step": 9837 + }, + { + "epoch": 0.9828662770368151, + "grad_norm": 0.45297590867121695, + "learning_rate": 7.831943514219343e-09, + "loss": 0.4882, + "mean_token_accuracy": 0.9100784063339233, + "num_tokens": 231350870.0, + "step": 9838 + }, + { + "epoch": 0.9829661821269794, + "grad_norm": 0.544305685425001, + "learning_rate": 7.741685803570598e-09, + "loss": 0.4865, + "mean_token_accuracy": 0.9055302143096924, + "num_tokens": 231432385.0, + "step": 9839 + }, + { + "epoch": 0.9830660872171437, + "grad_norm": 0.4723281909851481, + "learning_rate": 7.651950787251339e-09, + "loss": 0.4833, + "mean_token_accuracy": 0.9091911017894745, + "num_tokens": 231513949.0, + "step": 9840 + }, + { + "epoch": 0.9831659923073081, + "grad_norm": 0.6618124002286175, + "learning_rate": 7.562738474656828e-09, + "loss": 0.493, + "mean_token_accuracy": 0.9065011143684387, + "num_tokens": 231595386.0, + "step": 9841 + }, + { + "epoch": 0.9832658973974724, + "grad_norm": 0.40440868739055597, + "learning_rate": 7.474048875126817e-09, + "loss": 0.4848, + "mean_token_accuracy": 0.9089639782905579, + "num_tokens": 231676942.0, + "step": 9842 + }, + { + "epoch": 0.9833658024876367, + "grad_norm": 0.5277143703613111, + "learning_rate": 7.385881997948319e-09, + "loss": 0.4896, + "mean_token_accuracy": 0.9098382592201233, + "num_tokens": 231758485.0, + "step": 9843 + }, + { + "epoch": 0.983465707577801, + "grad_norm": 2.521460468317345, + "learning_rate": 7.298237852351731e-09, + "loss": 0.4889, + "mean_token_accuracy": 0.9067324101924896, + "num_tokens": 231839976.0, + "step": 9844 + }, + { + "epoch": 0.9835656126679654, + "grad_norm": 0.5076903992564746, + "learning_rate": 7.2111164475136e-09, + "loss": 0.4833, + "mean_token_accuracy": 0.9061499834060669, + "num_tokens": 231921585.0, + "step": 9845 + }, + { + "epoch": 0.9836655177581298, + "grad_norm": 0.4764598839350746, + "learning_rate": 7.124517792556074e-09, + "loss": 0.4887, + "mean_token_accuracy": 0.9091150164604187, + "num_tokens": 232003122.0, + "step": 9846 + }, + { + "epoch": 0.9837654228482942, + "grad_norm": 0.5043252275121791, + "learning_rate": 7.038441896545789e-09, + "loss": 0.4911, + "mean_token_accuracy": 0.9079459607601166, + "num_tokens": 232084593.0, + "step": 9847 + }, + { + "epoch": 0.9838653279384585, + "grad_norm": 0.43420736439712804, + "learning_rate": 6.9528887684949805e-09, + "loss": 0.485, + "mean_token_accuracy": 0.9079131186008453, + "num_tokens": 232166134.0, + "step": 9848 + }, + { + "epoch": 0.9839652330286228, + "grad_norm": 0.3782825530794972, + "learning_rate": 6.867858417360929e-09, + "loss": 0.4808, + "mean_token_accuracy": 0.9101561903953552, + "num_tokens": 232247726.0, + "step": 9849 + }, + { + "epoch": 0.9840651381187872, + "grad_norm": 0.5012750946049578, + "learning_rate": 6.783350852047066e-09, + "loss": 0.4956, + "mean_token_accuracy": 0.9056163430213928, + "num_tokens": 232329133.0, + "step": 9850 + }, + { + "epoch": 0.9841650432089515, + "grad_norm": 0.4472959861173516, + "learning_rate": 6.699366081400205e-09, + "loss": 0.4828, + "mean_token_accuracy": 0.9098173081874847, + "num_tokens": 232410708.0, + "step": 9851 + }, + { + "epoch": 0.9842649482991158, + "grad_norm": 0.4563187864408705, + "learning_rate": 6.615904114214977e-09, + "loss": 0.4826, + "mean_token_accuracy": 0.9079992771148682, + "num_tokens": 232492313.0, + "step": 9852 + }, + { + "epoch": 0.9843648533892801, + "grad_norm": 0.4282462311946401, + "learning_rate": 6.5329649592293935e-09, + "loss": 0.4859, + "mean_token_accuracy": 0.908412516117096, + "num_tokens": 232573816.0, + "step": 9853 + }, + { + "epoch": 0.9844647584794445, + "grad_norm": 0.42744181721890273, + "learning_rate": 6.450548625127062e-09, + "loss": 0.4904, + "mean_token_accuracy": 0.9081853032112122, + "num_tokens": 232655293.0, + "step": 9854 + }, + { + "epoch": 0.9845646635696089, + "grad_norm": 0.45742017230197873, + "learning_rate": 6.368655120537193e-09, + "loss": 0.4926, + "mean_token_accuracy": 0.9053717255592346, + "num_tokens": 232736798.0, + "step": 9855 + }, + { + "epoch": 0.9846645686597733, + "grad_norm": 0.3996225133221064, + "learning_rate": 6.2872844540340374e-09, + "loss": 0.4886, + "mean_token_accuracy": 0.9074409604072571, + "num_tokens": 232818269.0, + "step": 9856 + }, + { + "epoch": 0.9847644737499376, + "grad_norm": 0.6502113783467937, + "learning_rate": 6.2064366341374474e-09, + "loss": 0.4879, + "mean_token_accuracy": 0.9096308052539825, + "num_tokens": 232899796.0, + "step": 9857 + }, + { + "epoch": 0.9848643788401019, + "grad_norm": 0.4713828803440223, + "learning_rate": 6.126111669312318e-09, + "loss": 0.4853, + "mean_token_accuracy": 0.9094606637954712, + "num_tokens": 232981321.0, + "step": 9858 + }, + { + "epoch": 0.9849642839302662, + "grad_norm": 0.46024492921987237, + "learning_rate": 6.046309567968589e-09, + "loss": 0.4801, + "mean_token_accuracy": 0.9089242517948151, + "num_tokens": 233062906.0, + "step": 9859 + }, + { + "epoch": 0.9850641890204306, + "grad_norm": 0.5692313104400164, + "learning_rate": 5.9670303384612436e-09, + "loss": 0.4927, + "mean_token_accuracy": 0.9078047871589661, + "num_tokens": 233144387.0, + "step": 9860 + }, + { + "epoch": 0.9851640941105949, + "grad_norm": 0.5461957273648741, + "learning_rate": 5.88827398909142e-09, + "loss": 0.4924, + "mean_token_accuracy": 0.9068326056003571, + "num_tokens": 233225832.0, + "step": 9861 + }, + { + "epoch": 0.9852639992007592, + "grad_norm": 0.4651443799956986, + "learning_rate": 5.8100405281047435e-09, + "loss": 0.4871, + "mean_token_accuracy": 0.9083189368247986, + "num_tokens": 233307323.0, + "step": 9862 + }, + { + "epoch": 0.9853639042909236, + "grad_norm": 0.441769473652981, + "learning_rate": 5.73232996369244e-09, + "loss": 0.488, + "mean_token_accuracy": 0.9070842266082764, + "num_tokens": 233388819.0, + "step": 9863 + }, + { + "epoch": 0.985463809381088, + "grad_norm": 0.4641879588700369, + "learning_rate": 5.655142303990224e-09, + "loss": 0.4853, + "mean_token_accuracy": 0.9090209603309631, + "num_tokens": 233470378.0, + "step": 9864 + }, + { + "epoch": 0.9855637144712524, + "grad_norm": 0.4928290287605562, + "learning_rate": 5.578477557081074e-09, + "loss": 0.4869, + "mean_token_accuracy": 0.9091112315654755, + "num_tokens": 233551915.0, + "step": 9865 + }, + { + "epoch": 0.9856636195614167, + "grad_norm": 0.4505496836535013, + "learning_rate": 5.502335730990793e-09, + "loss": 0.4829, + "mean_token_accuracy": 0.9089452624320984, + "num_tokens": 233633492.0, + "step": 9866 + }, + { + "epoch": 0.985763524651581, + "grad_norm": 0.4073992528296324, + "learning_rate": 5.426716833691892e-09, + "loss": 0.4848, + "mean_token_accuracy": 0.9097857475280762, + "num_tokens": 233715063.0, + "step": 9867 + }, + { + "epoch": 0.9858634297417453, + "grad_norm": 0.44745298200669287, + "learning_rate": 5.351620873101926e-09, + "loss": 0.4893, + "mean_token_accuracy": 0.9078215062618256, + "num_tokens": 233796556.0, + "step": 9868 + }, + { + "epoch": 0.9859633348319097, + "grad_norm": 0.5726447517237709, + "learning_rate": 5.277047857082939e-09, + "loss": 0.4941, + "mean_token_accuracy": 0.9066998362541199, + "num_tokens": 233877982.0, + "step": 9869 + }, + { + "epoch": 0.986063239922074, + "grad_norm": 0.5360287369060742, + "learning_rate": 5.202997793443132e-09, + "loss": 0.4872, + "mean_token_accuracy": 0.9077039659023285, + "num_tokens": 233959497.0, + "step": 9870 + }, + { + "epoch": 0.9861631450122383, + "grad_norm": 0.44561421794522654, + "learning_rate": 5.129470689935745e-09, + "loss": 0.4879, + "mean_token_accuracy": 0.9086157381534576, + "num_tokens": 234041024.0, + "step": 9871 + }, + { + "epoch": 0.9862630501024027, + "grad_norm": 0.6552938011898369, + "learning_rate": 5.05646655425851e-09, + "loss": 0.4897, + "mean_token_accuracy": 0.9091494381427765, + "num_tokens": 234122542.0, + "step": 9872 + }, + { + "epoch": 0.9863629551925671, + "grad_norm": 0.44757895433524614, + "learning_rate": 4.983985394056423e-09, + "loss": 0.4897, + "mean_token_accuracy": 0.9084227979183197, + "num_tokens": 234204009.0, + "step": 9873 + }, + { + "epoch": 0.9864628602827314, + "grad_norm": 0.45890067063698414, + "learning_rate": 4.9120272169167484e-09, + "loss": 0.4839, + "mean_token_accuracy": 0.9078812897205353, + "num_tokens": 234285594.0, + "step": 9874 + }, + { + "epoch": 0.9865627653728958, + "grad_norm": 0.4247316812536969, + "learning_rate": 4.840592030374569e-09, + "loss": 0.4914, + "mean_token_accuracy": 0.9062552750110626, + "num_tokens": 234367053.0, + "step": 9875 + }, + { + "epoch": 0.9866626704630601, + "grad_norm": 0.646544365156015, + "learning_rate": 4.7696798419083476e-09, + "loss": 0.4852, + "mean_token_accuracy": 0.9089368879795074, + "num_tokens": 234448618.0, + "step": 9876 + }, + { + "epoch": 0.9867625755532244, + "grad_norm": 0.48233172139596053, + "learning_rate": 4.699290658943811e-09, + "loss": 0.4895, + "mean_token_accuracy": 0.9076013565063477, + "num_tokens": 234530135.0, + "step": 9877 + }, + { + "epoch": 0.9868624806433888, + "grad_norm": 0.3725956291725265, + "learning_rate": 4.629424488850065e-09, + "loss": 0.4785, + "mean_token_accuracy": 0.908269464969635, + "num_tokens": 234611777.0, + "step": 9878 + }, + { + "epoch": 0.9869623857335531, + "grad_norm": 0.39945627433206654, + "learning_rate": 4.560081338942368e-09, + "loss": 0.4855, + "mean_token_accuracy": 0.9097379744052887, + "num_tokens": 234693312.0, + "step": 9879 + }, + { + "epoch": 0.9870622908237174, + "grad_norm": 0.4316849959530191, + "learning_rate": 4.4912612164810245e-09, + "loss": 0.484, + "mean_token_accuracy": 0.9117792546749115, + "num_tokens": 234774869.0, + "step": 9880 + }, + { + "epoch": 0.9871621959138818, + "grad_norm": 0.4438160201342304, + "learning_rate": 4.422964128670826e-09, + "loss": 0.4832, + "mean_token_accuracy": 0.9079388678073883, + "num_tokens": 234856445.0, + "step": 9881 + }, + { + "epoch": 0.9872621010040462, + "grad_norm": 0.46727050065959347, + "learning_rate": 4.35519008266383e-09, + "loss": 0.4837, + "mean_token_accuracy": 0.9092470109462738, + "num_tokens": 234938030.0, + "step": 9882 + }, + { + "epoch": 0.9873620060942105, + "grad_norm": 0.4160861980424473, + "learning_rate": 4.287939085555471e-09, + "loss": 0.4836, + "mean_token_accuracy": 0.9097931087017059, + "num_tokens": 235019584.0, + "step": 9883 + }, + { + "epoch": 0.9874619111843749, + "grad_norm": 0.3882115801141471, + "learning_rate": 4.2212111443867835e-09, + "loss": 0.4827, + "mean_token_accuracy": 0.9083696901798248, + "num_tokens": 235101145.0, + "step": 9884 + }, + { + "epoch": 0.9875618162745392, + "grad_norm": 0.5442731526425099, + "learning_rate": 4.155006266143846e-09, + "loss": 0.489, + "mean_token_accuracy": 0.9069999158382416, + "num_tokens": 235182622.0, + "step": 9885 + }, + { + "epoch": 0.9876617213647035, + "grad_norm": 0.5862249250793732, + "learning_rate": 4.0893244577594474e-09, + "loss": 0.4848, + "mean_token_accuracy": 0.9076134860515594, + "num_tokens": 235264183.0, + "step": 9886 + }, + { + "epoch": 0.9877616264548679, + "grad_norm": 0.4325272703930694, + "learning_rate": 4.024165726110308e-09, + "loss": 0.4879, + "mean_token_accuracy": 0.9046730101108551, + "num_tokens": 235345674.0, + "step": 9887 + }, + { + "epoch": 0.9878615315450322, + "grad_norm": 0.5623727061647004, + "learning_rate": 3.9595300780176375e-09, + "loss": 0.4892, + "mean_token_accuracy": 0.9091968834400177, + "num_tokens": 235427154.0, + "step": 9888 + }, + { + "epoch": 0.9879614366351965, + "grad_norm": 0.4748454914319824, + "learning_rate": 3.895417520249911e-09, + "loss": 0.4883, + "mean_token_accuracy": 0.9113975465297699, + "num_tokens": 235508664.0, + "step": 9889 + }, + { + "epoch": 0.9880613417253609, + "grad_norm": 0.4442014766463063, + "learning_rate": 3.8318280595195376e-09, + "loss": 0.4817, + "mean_token_accuracy": 0.906569242477417, + "num_tokens": 235590277.0, + "step": 9890 + }, + { + "epoch": 0.9881612468155252, + "grad_norm": 0.4669141392954784, + "learning_rate": 3.7687617024839695e-09, + "loss": 0.4923, + "mean_token_accuracy": 0.9049039781093597, + "num_tokens": 235671733.0, + "step": 9891 + }, + { + "epoch": 0.9882611519056896, + "grad_norm": 0.48778920555487404, + "learning_rate": 3.706218455746813e-09, + "loss": 0.4907, + "mean_token_accuracy": 0.9076135158538818, + "num_tokens": 235753209.0, + "step": 9892 + }, + { + "epoch": 0.988361056995854, + "grad_norm": 0.4639179549032838, + "learning_rate": 3.644198325856163e-09, + "loss": 0.4866, + "mean_token_accuracy": 0.9080111384391785, + "num_tokens": 235834760.0, + "step": 9893 + }, + { + "epoch": 0.9884609620860183, + "grad_norm": 0.4191275636984395, + "learning_rate": 3.5827013193057148e-09, + "loss": 0.4848, + "mean_token_accuracy": 0.9062616527080536, + "num_tokens": 235916324.0, + "step": 9894 + }, + { + "epoch": 0.9885608671761826, + "grad_norm": 0.4277582689428032, + "learning_rate": 3.521727442534206e-09, + "loss": 0.4844, + "mean_token_accuracy": 0.9118683934211731, + "num_tokens": 235997890.0, + "step": 9895 + }, + { + "epoch": 0.988660772266347, + "grad_norm": 0.4634005888935991, + "learning_rate": 3.46127670192542e-09, + "loss": 0.4819, + "mean_token_accuracy": 0.908670961856842, + "num_tokens": 236079480.0, + "step": 9896 + }, + { + "epoch": 0.9887606773565113, + "grad_norm": 0.42112631675341966, + "learning_rate": 3.4013491038087376e-09, + "loss": 0.4912, + "mean_token_accuracy": 0.9084945023059845, + "num_tokens": 236160952.0, + "step": 9897 + }, + { + "epoch": 0.9888605824466756, + "grad_norm": 0.3966124230710889, + "learning_rate": 3.3419446544591396e-09, + "loss": 0.4816, + "mean_token_accuracy": 0.909673660993576, + "num_tokens": 236242548.0, + "step": 9898 + }, + { + "epoch": 0.98896048753684, + "grad_norm": 0.4615689770931435, + "learning_rate": 3.28306336009554e-09, + "loss": 0.4892, + "mean_token_accuracy": 0.9100191593170166, + "num_tokens": 236324017.0, + "step": 9899 + }, + { + "epoch": 0.9890603926270043, + "grad_norm": 0.4693673588331076, + "learning_rate": 3.224705226883007e-09, + "loss": 0.4889, + "mean_token_accuracy": 0.9057148396968842, + "num_tokens": 236405550.0, + "step": 9900 + }, + { + "epoch": 0.9891602977171687, + "grad_norm": 0.45683607465682946, + "learning_rate": 3.1668702609322087e-09, + "loss": 0.4828, + "mean_token_accuracy": 0.9091101884841919, + "num_tokens": 236487121.0, + "step": 9901 + }, + { + "epoch": 0.9892602028073331, + "grad_norm": 0.5276478033960428, + "learning_rate": 3.1095584682988565e-09, + "loss": 0.4872, + "mean_token_accuracy": 0.9077727794647217, + "num_tokens": 236568690.0, + "step": 9902 + }, + { + "epoch": 0.9893601078974974, + "grad_norm": 0.51819520323226, + "learning_rate": 3.0527698549820405e-09, + "loss": 0.4923, + "mean_token_accuracy": 0.9092527627944946, + "num_tokens": 236650148.0, + "step": 9903 + }, + { + "epoch": 0.9894600129876617, + "grad_norm": 0.42161830124584815, + "learning_rate": 2.9965044269286703e-09, + "loss": 0.4905, + "mean_token_accuracy": 0.9091483950614929, + "num_tokens": 236731578.0, + "step": 9904 + }, + { + "epoch": 0.9895599180778261, + "grad_norm": 0.46596490821619607, + "learning_rate": 2.940762190029589e-09, + "loss": 0.4881, + "mean_token_accuracy": 0.9083150327205658, + "num_tokens": 236813094.0, + "step": 9905 + }, + { + "epoch": 0.9896598231679904, + "grad_norm": 0.4865689903684969, + "learning_rate": 2.8855431501212396e-09, + "loss": 0.4927, + "mean_token_accuracy": 0.905411571264267, + "num_tokens": 236894537.0, + "step": 9906 + }, + { + "epoch": 0.9897597282581547, + "grad_norm": 0.9631212549566507, + "learning_rate": 2.8308473129851077e-09, + "loss": 0.4856, + "mean_token_accuracy": 0.9084171652793884, + "num_tokens": 236976136.0, + "step": 9907 + }, + { + "epoch": 0.9898596333483191, + "grad_norm": 0.5054638654037951, + "learning_rate": 2.7766746843477244e-09, + "loss": 0.4902, + "mean_token_accuracy": 0.9094071984291077, + "num_tokens": 237057643.0, + "step": 9908 + }, + { + "epoch": 0.9899595384384834, + "grad_norm": 0.4561809446376207, + "learning_rate": 2.7230252698806636e-09, + "loss": 0.4831, + "mean_token_accuracy": 0.9077201187610626, + "num_tokens": 237139233.0, + "step": 9909 + }, + { + "epoch": 0.9900594435286478, + "grad_norm": 0.42158649500059625, + "learning_rate": 2.669899075201654e-09, + "loss": 0.4792, + "mean_token_accuracy": 0.9076263606548309, + "num_tokens": 237220892.0, + "step": 9910 + }, + { + "epoch": 0.9901593486188122, + "grad_norm": 0.4280098978776012, + "learning_rate": 2.617296105872913e-09, + "loss": 0.4882, + "mean_token_accuracy": 0.9071600139141083, + "num_tokens": 237302417.0, + "step": 9911 + }, + { + "epoch": 0.9902592537089765, + "grad_norm": 0.5201321265715114, + "learning_rate": 2.5652163674017017e-09, + "loss": 0.4868, + "mean_token_accuracy": 0.9083058834075928, + "num_tokens": 237384015.0, + "step": 9912 + }, + { + "epoch": 0.9903591587991408, + "grad_norm": 0.40860843741549074, + "learning_rate": 2.5136598652408807e-09, + "loss": 0.4842, + "mean_token_accuracy": 0.9077337086200714, + "num_tokens": 237465567.0, + "step": 9913 + }, + { + "epoch": 0.9904590638893052, + "grad_norm": 0.70980697208002, + "learning_rate": 2.4626266047894642e-09, + "loss": 0.486, + "mean_token_accuracy": 0.9093123376369476, + "num_tokens": 237547109.0, + "step": 9914 + }, + { + "epoch": 0.9905589689794695, + "grad_norm": 0.5222175112944446, + "learning_rate": 2.412116591389291e-09, + "loss": 0.4868, + "mean_token_accuracy": 0.9080700576305389, + "num_tokens": 237628633.0, + "step": 9915 + }, + { + "epoch": 0.9906588740696338, + "grad_norm": 0.5194775354729176, + "learning_rate": 2.3621298303294626e-09, + "loss": 0.4887, + "mean_token_accuracy": 0.9071756303310394, + "num_tokens": 237710104.0, + "step": 9916 + }, + { + "epoch": 0.9907587791597982, + "grad_norm": 0.46049125438994415, + "learning_rate": 2.312666326843571e-09, + "loss": 0.4869, + "mean_token_accuracy": 0.9090134501457214, + "num_tokens": 237791642.0, + "step": 9917 + }, + { + "epoch": 0.9908586842499625, + "grad_norm": 0.4631399614010495, + "learning_rate": 2.2637260861102516e-09, + "loss": 0.4889, + "mean_token_accuracy": 0.9085146486759186, + "num_tokens": 237873117.0, + "step": 9918 + }, + { + "epoch": 0.9909585893401269, + "grad_norm": 0.5336719709703023, + "learning_rate": 2.215309113254294e-09, + "loss": 0.4882, + "mean_token_accuracy": 0.9088082313537598, + "num_tokens": 237954628.0, + "step": 9919 + }, + { + "epoch": 0.9910584944302913, + "grad_norm": 0.4251408109912825, + "learning_rate": 2.1674154133444202e-09, + "loss": 0.4836, + "mean_token_accuracy": 0.9072062969207764, + "num_tokens": 238036186.0, + "step": 9920 + }, + { + "epoch": 0.9911583995204556, + "grad_norm": 0.4644417703796544, + "learning_rate": 2.120044991394954e-09, + "loss": 0.4893, + "mean_token_accuracy": 0.9111781120300293, + "num_tokens": 238117690.0, + "step": 9921 + }, + { + "epoch": 0.9912583046106199, + "grad_norm": 0.40868099218577414, + "learning_rate": 2.073197852366371e-09, + "loss": 0.4849, + "mean_token_accuracy": 0.9110819399356842, + "num_tokens": 238199232.0, + "step": 9922 + }, + { + "epoch": 0.9913582097007843, + "grad_norm": 0.5518918050438149, + "learning_rate": 2.0268740011630818e-09, + "loss": 0.4866, + "mean_token_accuracy": 0.9062734544277191, + "num_tokens": 238280766.0, + "step": 9923 + }, + { + "epoch": 0.9914581147909486, + "grad_norm": 0.42364488427805547, + "learning_rate": 1.981073442635095e-09, + "loss": 0.486, + "mean_token_accuracy": 0.9087025821208954, + "num_tokens": 238362343.0, + "step": 9924 + }, + { + "epoch": 0.9915580198811129, + "grad_norm": 0.41215122665686865, + "learning_rate": 1.935796181578575e-09, + "loss": 0.489, + "mean_token_accuracy": 0.9058485627174377, + "num_tokens": 238443820.0, + "step": 9925 + }, + { + "epoch": 0.9916579249712772, + "grad_norm": 0.5505750300064027, + "learning_rate": 1.891042222733064e-09, + "loss": 0.4898, + "mean_token_accuracy": 0.90992471575737, + "num_tokens": 238525282.0, + "step": 9926 + }, + { + "epoch": 0.9917578300614416, + "grad_norm": 0.49282237674801915, + "learning_rate": 1.8468115707848121e-09, + "loss": 0.4839, + "mean_token_accuracy": 0.909336268901825, + "num_tokens": 238606869.0, + "step": 9927 + }, + { + "epoch": 0.991857735151606, + "grad_norm": 0.42915520431569537, + "learning_rate": 1.8031042303651158e-09, + "loss": 0.4855, + "mean_token_accuracy": 0.9094377160072327, + "num_tokens": 238688381.0, + "step": 9928 + }, + { + "epoch": 0.9919576402417704, + "grad_norm": 0.44888026404020426, + "learning_rate": 1.7599202060497588e-09, + "loss": 0.4891, + "mean_token_accuracy": 0.9058064222335815, + "num_tokens": 238769887.0, + "step": 9929 + }, + { + "epoch": 0.9920575453319347, + "grad_norm": 0.5605170391120703, + "learning_rate": 1.7172595023601247e-09, + "loss": 0.4903, + "mean_token_accuracy": 0.9077538251876831, + "num_tokens": 238851363.0, + "step": 9930 + }, + { + "epoch": 0.992157450422099, + "grad_norm": 0.4125159668276633, + "learning_rate": 1.6751221237631954e-09, + "loss": 0.4919, + "mean_token_accuracy": 0.909724086523056, + "num_tokens": 238932773.0, + "step": 9931 + }, + { + "epoch": 0.9922573555122634, + "grad_norm": 0.4299107420332357, + "learning_rate": 1.6335080746704424e-09, + "loss": 0.4844, + "mean_token_accuracy": 0.9097511768341064, + "num_tokens": 239014341.0, + "step": 9932 + }, + { + "epoch": 0.9923572606024277, + "grad_norm": 0.5192851853555086, + "learning_rate": 1.5924173594389358e-09, + "loss": 0.487, + "mean_token_accuracy": 0.9072272777557373, + "num_tokens": 239095861.0, + "step": 9933 + }, + { + "epoch": 0.992457165692592, + "grad_norm": 0.47648382978125037, + "learning_rate": 1.5518499823713451e-09, + "loss": 0.4869, + "mean_token_accuracy": 0.9076390862464905, + "num_tokens": 239177391.0, + "step": 9934 + }, + { + "epoch": 0.9925570707827563, + "grad_norm": 0.44899107843516056, + "learning_rate": 1.5118059477142732e-09, + "loss": 0.4892, + "mean_token_accuracy": 0.907061368227005, + "num_tokens": 239258861.0, + "step": 9935 + }, + { + "epoch": 0.9926569758729207, + "grad_norm": 0.4369566119197789, + "learning_rate": 1.4722852596615877e-09, + "loss": 0.4848, + "mean_token_accuracy": 0.9104726016521454, + "num_tokens": 239340385.0, + "step": 9936 + }, + { + "epoch": 0.9927568809630851, + "grad_norm": 0.45282240022497044, + "learning_rate": 1.4332879223499795e-09, + "loss": 0.4882, + "mean_token_accuracy": 0.9115562438964844, + "num_tokens": 239421901.0, + "step": 9937 + }, + { + "epoch": 0.9928567860532495, + "grad_norm": 0.41368026973680805, + "learning_rate": 1.3948139398628492e-09, + "loss": 0.4867, + "mean_token_accuracy": 0.9082445502281189, + "num_tokens": 239503438.0, + "step": 9938 + }, + { + "epoch": 0.9929566911434138, + "grad_norm": 0.4650518929608282, + "learning_rate": 1.3568633162286405e-09, + "loss": 0.4885, + "mean_token_accuracy": 0.9072401523590088, + "num_tokens": 239584979.0, + "step": 9939 + }, + { + "epoch": 0.9930565962335781, + "grad_norm": 0.5300595213065356, + "learning_rate": 1.3194360554213969e-09, + "loss": 0.4851, + "mean_token_accuracy": 0.9097177088260651, + "num_tokens": 239666511.0, + "step": 9940 + }, + { + "epoch": 0.9931565013237424, + "grad_norm": 0.4683060139278859, + "learning_rate": 1.2825321613585406e-09, + "loss": 0.4845, + "mean_token_accuracy": 0.9064200818538666, + "num_tokens": 239748110.0, + "step": 9941 + }, + { + "epoch": 0.9932564064139068, + "grad_norm": 0.3994489925667891, + "learning_rate": 1.2461516379047579e-09, + "loss": 0.4811, + "mean_token_accuracy": 0.9098271131515503, + "num_tokens": 239829698.0, + "step": 9942 + }, + { + "epoch": 0.9933563115040711, + "grad_norm": 0.5512840205264672, + "learning_rate": 1.2102944888686685e-09, + "loss": 0.4842, + "mean_token_accuracy": 0.9085943400859833, + "num_tokens": 239911243.0, + "step": 9943 + }, + { + "epoch": 0.9934562165942354, + "grad_norm": 0.4938360486522781, + "learning_rate": 1.174960718005047e-09, + "loss": 0.49, + "mean_token_accuracy": 0.907096803188324, + "num_tokens": 239992739.0, + "step": 9944 + }, + { + "epoch": 0.9935561216843998, + "grad_norm": 0.5019745018214216, + "learning_rate": 1.1401503290126014e-09, + "loss": 0.492, + "mean_token_accuracy": 0.9096327424049377, + "num_tokens": 240074199.0, + "step": 9945 + }, + { + "epoch": 0.9936560267745642, + "grad_norm": 1.1669424022069317, + "learning_rate": 1.1058633255373042e-09, + "loss": 0.4885, + "mean_token_accuracy": 0.9102111756801605, + "num_tokens": 240155720.0, + "step": 9946 + }, + { + "epoch": 0.9937559318647285, + "grad_norm": 0.5717159137698055, + "learning_rate": 1.072099711167951e-09, + "loss": 0.4871, + "mean_token_accuracy": 0.9080385863780975, + "num_tokens": 240237240.0, + "step": 9947 + }, + { + "epoch": 0.9938558369548929, + "grad_norm": 0.4278012671523783, + "learning_rate": 1.0388594894400472e-09, + "loss": 0.4863, + "mean_token_accuracy": 0.9070273041725159, + "num_tokens": 240318784.0, + "step": 9948 + }, + { + "epoch": 0.9939557420450572, + "grad_norm": 0.48192283138941605, + "learning_rate": 1.006142663833587e-09, + "loss": 0.4939, + "mean_token_accuracy": 0.9080570340156555, + "num_tokens": 240400251.0, + "step": 9949 + }, + { + "epoch": 0.9940556471352215, + "grad_norm": 0.4221799642756044, + "learning_rate": 9.739492377741633e-10, + "loss": 0.4823, + "mean_token_accuracy": 0.9085989892482758, + "num_tokens": 240481843.0, + "step": 9950 + }, + { + "epoch": 0.9941555522253859, + "grad_norm": 0.42254369415830223, + "learning_rate": 9.422792146329685e-10, + "loss": 0.4866, + "mean_token_accuracy": 0.9076747298240662, + "num_tokens": 240563355.0, + "step": 9951 + }, + { + "epoch": 0.9942554573155502, + "grad_norm": 0.600878526314759, + "learning_rate": 9.111325977251285e-10, + "loss": 0.4814, + "mean_token_accuracy": 0.9105695486068726, + "num_tokens": 240644938.0, + "step": 9952 + }, + { + "epoch": 0.9943553624057145, + "grad_norm": 0.5539611765310593, + "learning_rate": 8.805093903119233e-10, + "loss": 0.49, + "mean_token_accuracy": 0.9073931872844696, + "num_tokens": 240726394.0, + "step": 9953 + }, + { + "epoch": 0.9944552674958789, + "grad_norm": 0.4345369024935769, + "learning_rate": 8.504095956002323e-10, + "loss": 0.4839, + "mean_token_accuracy": 0.9068019986152649, + "num_tokens": 240807962.0, + "step": 9954 + }, + { + "epoch": 0.9945551725860433, + "grad_norm": 0.478066505666678, + "learning_rate": 8.208332167408684e-10, + "loss": 0.4892, + "mean_token_accuracy": 0.9089896082878113, + "num_tokens": 240889476.0, + "step": 9955 + }, + { + "epoch": 0.9946550776762076, + "grad_norm": 0.48061352082149617, + "learning_rate": 7.917802568307987e-10, + "loss": 0.4909, + "mean_token_accuracy": 0.9079852402210236, + "num_tokens": 240970947.0, + "step": 9956 + }, + { + "epoch": 0.994754982766372, + "grad_norm": 0.47550276634915994, + "learning_rate": 7.63250718911479e-10, + "loss": 0.493, + "mean_token_accuracy": 0.9079941809177399, + "num_tokens": 241052441.0, + "step": 9957 + }, + { + "epoch": 0.9948548878565363, + "grad_norm": 0.4443969619828285, + "learning_rate": 7.352446059705198e-10, + "loss": 0.4874, + "mean_token_accuracy": 0.9109722375869751, + "num_tokens": 241133952.0, + "step": 9958 + }, + { + "epoch": 0.9949547929467006, + "grad_norm": 0.5568518507981283, + "learning_rate": 7.0776192094002e-10, + "loss": 0.4917, + "mean_token_accuracy": 0.9056884944438934, + "num_tokens": 241215417.0, + "step": 9959 + }, + { + "epoch": 0.995054698036865, + "grad_norm": 0.5204779827820651, + "learning_rate": 6.808026666971224e-10, + "loss": 0.4928, + "mean_token_accuracy": 0.9059121310710907, + "num_tokens": 241296892.0, + "step": 9960 + }, + { + "epoch": 0.9951546031270293, + "grad_norm": 0.5162517291576081, + "learning_rate": 6.543668460651243e-10, + "loss": 0.4919, + "mean_token_accuracy": 0.9072420001029968, + "num_tokens": 241378355.0, + "step": 9961 + }, + { + "epoch": 0.9952545082171936, + "grad_norm": 0.43795764166080703, + "learning_rate": 6.284544618112565e-10, + "loss": 0.4847, + "mean_token_accuracy": 0.9087881445884705, + "num_tokens": 241459928.0, + "step": 9962 + }, + { + "epoch": 0.995354413307358, + "grad_norm": 0.6168383816150903, + "learning_rate": 6.030655166489042e-10, + "loss": 0.4864, + "mean_token_accuracy": 0.907742440700531, + "num_tokens": 241541487.0, + "step": 9963 + }, + { + "epoch": 0.9954543183975224, + "grad_norm": 0.5060108204837485, + "learning_rate": 5.782000132364962e-10, + "loss": 0.4891, + "mean_token_accuracy": 0.9072130620479584, + "num_tokens": 241623024.0, + "step": 9964 + }, + { + "epoch": 0.9955542234876867, + "grad_norm": 0.440594881525938, + "learning_rate": 5.538579541769506e-10, + "loss": 0.4821, + "mean_token_accuracy": 0.9082703292369843, + "num_tokens": 241704640.0, + "step": 9965 + }, + { + "epoch": 0.9956541285778511, + "grad_norm": 0.49284120572780027, + "learning_rate": 5.300393420193395e-10, + "loss": 0.4886, + "mean_token_accuracy": 0.906665563583374, + "num_tokens": 241786145.0, + "step": 9966 + }, + { + "epoch": 0.9957540336680154, + "grad_norm": 0.5881136081872769, + "learning_rate": 5.067441792572236e-10, + "loss": 0.4911, + "mean_token_accuracy": 0.9079246520996094, + "num_tokens": 241867644.0, + "step": 9967 + }, + { + "epoch": 0.9958539387581797, + "grad_norm": 0.4375962007780433, + "learning_rate": 4.839724683297631e-10, + "loss": 0.4842, + "mean_token_accuracy": 0.9104458093643188, + "num_tokens": 241949238.0, + "step": 9968 + }, + { + "epoch": 0.9959538438483441, + "grad_norm": 0.45479623289194765, + "learning_rate": 4.617242116211618e-10, + "loss": 0.4871, + "mean_token_accuracy": 0.9059666991233826, + "num_tokens": 242030791.0, + "step": 9969 + }, + { + "epoch": 0.9960537489385084, + "grad_norm": 0.456943471506193, + "learning_rate": 4.399994114606676e-10, + "loss": 0.4828, + "mean_token_accuracy": 0.9118790030479431, + "num_tokens": 242112359.0, + "step": 9970 + }, + { + "epoch": 0.9961536540286727, + "grad_norm": 0.44105066296960777, + "learning_rate": 4.1879807012312757e-10, + "loss": 0.4864, + "mean_token_accuracy": 0.9082905352115631, + "num_tokens": 242193870.0, + "step": 9971 + }, + { + "epoch": 0.9962535591188371, + "grad_norm": 0.45299911887635086, + "learning_rate": 3.981201898284326e-10, + "loss": 0.486, + "mean_token_accuracy": 0.9075597822666168, + "num_tokens": 242275404.0, + "step": 9972 + }, + { + "epoch": 0.9963534642090014, + "grad_norm": 0.5093682999691529, + "learning_rate": 3.7796577274096244e-10, + "loss": 0.492, + "mean_token_accuracy": 0.9059968590736389, + "num_tokens": 242356907.0, + "step": 9973 + }, + { + "epoch": 0.9964533692991658, + "grad_norm": 0.456492447043787, + "learning_rate": 3.583348209718063e-10, + "loss": 0.4903, + "mean_token_accuracy": 0.9087526500225067, + "num_tokens": 242438402.0, + "step": 9974 + }, + { + "epoch": 0.9965532743893302, + "grad_norm": 0.6242381632950854, + "learning_rate": 3.3922733657598684e-10, + "loss": 0.4919, + "mean_token_accuracy": 0.9077108502388, + "num_tokens": 242519851.0, + "step": 9975 + }, + { + "epoch": 0.9966531794794945, + "grad_norm": 0.4683535692563051, + "learning_rate": 3.206433215535709e-10, + "loss": 0.4922, + "mean_token_accuracy": 0.9069204330444336, + "num_tokens": 242601322.0, + "step": 9976 + }, + { + "epoch": 0.9967530845696588, + "grad_norm": 0.5905569662365524, + "learning_rate": 3.025827778507795e-10, + "loss": 0.4801, + "mean_token_accuracy": 0.9123786389827728, + "num_tokens": 242682944.0, + "step": 9977 + }, + { + "epoch": 0.9968529896598232, + "grad_norm": 0.4147703434094908, + "learning_rate": 2.850457073588775e-10, + "loss": 0.4821, + "mean_token_accuracy": 0.9091896116733551, + "num_tokens": 242764554.0, + "step": 9978 + }, + { + "epoch": 0.9969528947499875, + "grad_norm": 0.5170858496524833, + "learning_rate": 2.6803211191306356e-10, + "loss": 0.4891, + "mean_token_accuracy": 0.906772792339325, + "num_tokens": 242846077.0, + "step": 9979 + }, + { + "epoch": 0.9970527998401518, + "grad_norm": 0.45809157326225997, + "learning_rate": 2.5154199329580077e-10, + "loss": 0.4852, + "mean_token_accuracy": 0.9082243740558624, + "num_tokens": 242927614.0, + "step": 9980 + }, + { + "epoch": 0.9971527049303162, + "grad_norm": 0.5068921929197872, + "learning_rate": 2.355753532329308e-10, + "loss": 0.4872, + "mean_token_accuracy": 0.9074807465076447, + "num_tokens": 243009162.0, + "step": 9981 + }, + { + "epoch": 0.9972526100204805, + "grad_norm": 0.4967329921018459, + "learning_rate": 2.2013219339644953e-10, + "loss": 0.4945, + "mean_token_accuracy": 0.9058125913143158, + "num_tokens": 243090615.0, + "step": 9982 + }, + { + "epoch": 0.9973525151106449, + "grad_norm": 0.5035053074650437, + "learning_rate": 2.052125154028417e-10, + "loss": 0.4889, + "mean_token_accuracy": 0.9079805314540863, + "num_tokens": 243172113.0, + "step": 9983 + }, + { + "epoch": 0.9974524202008093, + "grad_norm": 0.4565661035270196, + "learning_rate": 1.9081632081474622e-10, + "loss": 0.4859, + "mean_token_accuracy": 0.9083934724330902, + "num_tokens": 243253642.0, + "step": 9984 + }, + { + "epoch": 0.9975523252909736, + "grad_norm": 0.49979009520400713, + "learning_rate": 1.7694361113929081e-10, + "loss": 0.4843, + "mean_token_accuracy": 0.9090737104415894, + "num_tokens": 243335218.0, + "step": 9985 + }, + { + "epoch": 0.9976522303811379, + "grad_norm": 0.3751974011939974, + "learning_rate": 1.6359438782864724e-10, + "loss": 0.4801, + "mean_token_accuracy": 0.9113207161426544, + "num_tokens": 243416801.0, + "step": 9986 + }, + { + "epoch": 0.9977521354713023, + "grad_norm": 0.4532196687579408, + "learning_rate": 1.5076865228114136e-10, + "loss": 0.4927, + "mean_token_accuracy": 0.9088998138904572, + "num_tokens": 243498220.0, + "step": 9987 + }, + { + "epoch": 0.9978520405614666, + "grad_norm": 0.6967568093769425, + "learning_rate": 1.3846640583903282e-10, + "loss": 0.4897, + "mean_token_accuracy": 0.9043579399585724, + "num_tokens": 243579725.0, + "step": 9988 + }, + { + "epoch": 0.9979519456516309, + "grad_norm": 0.4138918566896966, + "learning_rate": 1.2668764979018034e-10, + "loss": 0.4869, + "mean_token_accuracy": 0.9099591374397278, + "num_tokens": 243661243.0, + "step": 9989 + }, + { + "epoch": 0.9980518507417953, + "grad_norm": 0.5019074277362282, + "learning_rate": 1.154323853685968e-10, + "loss": 0.4875, + "mean_token_accuracy": 0.9082396626472473, + "num_tokens": 243742770.0, + "step": 9990 + }, + { + "epoch": 0.9981517558319596, + "grad_norm": 0.5292950767220274, + "learning_rate": 1.0470061375278396e-10, + "loss": 0.4856, + "mean_token_accuracy": 0.9071274101734161, + "num_tokens": 243824321.0, + "step": 9991 + }, + { + "epoch": 0.998251660922124, + "grad_norm": 0.4941462723175463, + "learning_rate": 9.449233606573238e-11, + "loss": 0.4888, + "mean_token_accuracy": 0.9069647490978241, + "num_tokens": 243905891.0, + "step": 9992 + }, + { + "epoch": 0.9983515660122884, + "grad_norm": 0.4393464545079283, + "learning_rate": 8.480755337603175e-11, + "loss": 0.486, + "mean_token_accuracy": 0.9106847047805786, + "num_tokens": 243987452.0, + "step": 9993 + }, + { + "epoch": 0.9984514711024527, + "grad_norm": 0.44634231559075604, + "learning_rate": 7.56462666984259e-11, + "loss": 0.4876, + "mean_token_accuracy": 0.9084382951259613, + "num_tokens": 244068999.0, + "step": 9994 + }, + { + "epoch": 0.998551376192617, + "grad_norm": 0.43518158812504626, + "learning_rate": 6.700847699214753e-11, + "loss": 0.4853, + "mean_token_accuracy": 0.907879501581192, + "num_tokens": 244150528.0, + "step": 9995 + }, + { + "epoch": 0.9986512812827814, + "grad_norm": 0.578753642404188, + "learning_rate": 5.889418516091816e-11, + "loss": 0.493, + "mean_token_accuracy": 0.9087878167629242, + "num_tokens": 244231992.0, + "step": 9996 + }, + { + "epoch": 0.9987511863729457, + "grad_norm": 0.5250928315772868, + "learning_rate": 5.1303392054613546e-11, + "loss": 0.4835, + "mean_token_accuracy": 0.9101284444332123, + "num_tokens": 244313594.0, + "step": 9997 + }, + { + "epoch": 0.99885109146311, + "grad_norm": 0.4677446272308099, + "learning_rate": 4.423609846815335e-11, + "loss": 0.4876, + "mean_token_accuracy": 0.9085477590560913, + "num_tokens": 244395128.0, + "step": 9998 + }, + { + "epoch": 0.9989509965532744, + "grad_norm": 0.4739756333247457, + "learning_rate": 3.769230514094613e-11, + "loss": 0.4847, + "mean_token_accuracy": 0.9081336557865143, + "num_tokens": 244476640.0, + "step": 9999 + }, + { + "epoch": 0.9990509016434387, + "grad_norm": 0.6533121580768001, + "learning_rate": 3.1672012759109694e-11, + "loss": 0.4849, + "mean_token_accuracy": 0.9110117852687836, + "num_tokens": 244558246.0, + "step": 10000 } ], "logging_steps": 1, @@ -81027,7 +90027,7 @@ "attributes": {} } }, - "total_flos": 4007088441458688.0, + "total_flos": 4452351382650880.0, "train_batch_size": 8, "trial_name": null, "trial_params": null